mirror of https://github.com/lancedb/lancedb.git (synced 2025-12-26 22:59:57 +00:00)

Compare commits: python-v0.… → rmeng/pool (28 commits)
cc7a503faa
2ded17452b
dfd9d2ac99
162880140e
99d9ced6d5
96933d7df8
d369233b3d
43a670ed4b
cb9a00a28d
72af977a73
7cecb71df0
285071e5c8
114866fbcf
5387c0e243
53d1535de1
b2f88f0b29
f2e3989831
83ae52938a
267aa83bf8
cc72050206
72543c8b9d
97d6210c33
a3d0c27b0a
b23d8abcdd
e3ea5cf9b9
4f8b086175
72330fb759
e3b2c5f438
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.13.0-beta.2"
+current_version = "0.13.1-beta.0"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.
@@ -87,6 +87,16 @@ glob = "node/package.json"
 replace = "\"@lancedb/vectordb-linux-x64-gnu\": \"{new_version}\""
 search = "\"@lancedb/vectordb-linux-x64-gnu\": \"{current_version}\""
 
+[[tool.bumpversion.files]]
+glob = "node/package.json"
+replace = "\"@lancedb/vectordb-linux-arm64-musl\": \"{new_version}\""
+search = "\"@lancedb/vectordb-linux-arm64-musl\": \"{current_version}\""
+
+[[tool.bumpversion.files]]
+glob = "node/package.json"
+replace = "\"@lancedb/vectordb-linux-x64-musl\": \"{new_version}\""
+search = "\"@lancedb/vectordb-linux-x64-musl\": \"{current_version}\""
+
 [[tool.bumpversion.files]]
 glob = "node/package.json"
 replace = "\"@lancedb/vectordb-win32-x64-msvc\": \"{new_version}\""
@@ -31,6 +31,9 @@ rustflags = [
 [target.x86_64-unknown-linux-gnu]
 rustflags = ["-C", "target-cpu=haswell", "-C", "target-feature=+avx2,+fma,+f16c"]
 
+[target.x86_64-unknown-linux-musl]
+rustflags = ["-C", "target-cpu=haswell", "-C", "target-feature=-crt-static,+avx2,+fma,+f16c"]
+
 [target.aarch64-apple-darwin]
 rustflags = ["-C", "target-cpu=apple-m1", "-C", "target-feature=+neon,+fp16,+fhm,+dotprod"]
.github/workflows/npm-publish.yml (vendored, 120 lines changed)
@@ -101,7 +101,7 @@ jobs:
           path: |
             nodejs/dist/*.node
 
-  node-linux:
+  node-linux-gnu:
     name: vectordb (${{ matrix.config.arch}}-unknown-linux-gnu)
     runs-on: ${{ matrix.config.runner }}
     # Only runs on tags that matches the make-release action
@@ -137,11 +137,63 @@ jobs:
       - name: Upload Linux Artifacts
         uses: actions/upload-artifact@v4
         with:
-          name: node-native-linux-${{ matrix.config.arch }}
+          name: node-native-linux-${{ matrix.config.arch }}-gnu
           path: |
             node/dist/lancedb-vectordb-linux*.tgz
 
-  nodejs-linux:
+  node-linux-musl:
+    name: vectordb (${{ matrix.config.arch}}-unknown-linux-musl)
+    runs-on: ubuntu-latest
+    container: alpine:edge
+    # Only runs on tags that matches the make-release action
+    if: startsWith(github.ref, 'refs/tags/v')
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - arch: x86_64
+          - arch: aarch64
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install common dependencies
+        run: |
+          apk add protobuf-dev curl clang mold grep npm bash
+          curl --proto '=https' --tlsv1.3 -sSf https://raw.githubusercontent.com/rust-lang/rustup/refs/heads/master/rustup-init.sh | sh -s -- -y --default-toolchain 1.80.0
+          echo "source $HOME/.cargo/env" >> saved_env
+          echo "export CC=clang" >> saved_env
+          echo "export RUSTFLAGS='-Ctarget-cpu=haswell -Ctarget-feature=-crt-static,+avx2,+fma,+f16c -Clinker=clang -Clink-arg=-fuse-ld=mold'" >> saved_env
+      - name: Configure aarch64 build
+        if: ${{ matrix.config.arch == 'aarch64' }}
+        run: |
+          source "$HOME/.cargo/env"
+          rustup target add aarch64-unknown-linux-musl --toolchain 1.80.0
+          crt=$(realpath $(dirname $(rustup which rustc))/../lib/rustlib/aarch64-unknown-linux-musl/lib/self-contained)
+          sysroot_lib=/usr/aarch64-unknown-linux-musl/usr/lib
+          apk_url=https://dl-cdn.alpinelinux.org/alpine/latest-stable/main/aarch64/
+          curl -sSf $apk_url > apk_list
+          for pkg in gcc libgcc musl; do curl -sSf $apk_url$(cat apk_list | grep -oP '(?<=")'$pkg'-\d.*?(?=")') | tar zxf -; done
+          mkdir -p $sysroot_lib
+          echo 'GROUP ( libgcc_s.so.1 -lgcc )' > $sysroot_lib/libgcc_s.so
+          cp usr/lib/libgcc_s.so.1 $sysroot_lib
+          cp usr/lib/gcc/aarch64-alpine-linux-musl/*/libgcc.a $sysroot_lib
+          cp lib/ld-musl-aarch64.so.1 $sysroot_lib/libc.so
+          echo '!<arch>' > $sysroot_lib/libdl.a
+          (cd $crt && cp crti.o crtbeginS.o crtendS.o crtn.o -t $sysroot_lib)
+          echo "export CARGO_BUILD_TARGET=aarch64-unknown-linux-musl" >> saved_env
+          echo "export RUSTFLAGS='-Ctarget-cpu=apple-m1 -Ctarget-feature=-crt-static,+neon,+fp16,+fhm,+dotprod -Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=--target=aarch64-unknown-linux-musl -Clink-arg=--sysroot=/usr/aarch64-unknown-linux-musl -Clink-arg=-lc'" >> saved_env
+      - name: Build Linux Artifacts
+        run: |
+          source ./saved_env
+          bash ci/manylinux_node/build_vectordb.sh ${{ matrix.config.arch }}
+      - name: Upload Linux Artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: node-native-linux-${{ matrix.config.arch }}-musl
+          path: |
+            node/dist/lancedb-vectordb-linux*.tgz
+
+  nodejs-linux-gnu:
     name: lancedb (${{ matrix.config.arch}}-unknown-linux-gnu
     runs-on: ${{ matrix.config.runner }}
     # Only runs on tags that matches the make-release action
@@ -178,7 +230,7 @@ jobs:
       - name: Upload Linux Artifacts
         uses: actions/upload-artifact@v4
         with:
-          name: nodejs-native-linux-${{ matrix.config.arch }}
+          name: nodejs-native-linux-${{ matrix.config.arch }}-gnu
           path: |
             nodejs/dist/*.node
       # The generic files are the same in all distros so we just pick
@@ -192,6 +244,62 @@ jobs:
             nodejs/dist/*
             !nodejs/dist/*.node
 
+  nodejs-linux-musl:
+    name: lancedb (${{ matrix.config.arch}}-unknown-linux-musl
+    runs-on: ubuntu-latest
+    container: alpine:edge
+    # Only runs on tags that matches the make-release action
+    if: startsWith(github.ref, 'refs/tags/v')
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - arch: x86_64
+          - arch: aarch64
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install common dependencies
+        run: |
+          apk add protobuf-dev curl clang mold grep npm bash openssl-dev openssl-libs-static
+          curl --proto '=https' --tlsv1.3 -sSf https://raw.githubusercontent.com/rust-lang/rustup/refs/heads/master/rustup-init.sh | sh -s -- -y --default-toolchain 1.80.0
+          echo "source $HOME/.cargo/env" >> saved_env
+          echo "export CC=clang" >> saved_env
+          echo "export RUSTFLAGS='-Ctarget-cpu=haswell -Ctarget-feature=-crt-static,+avx2,+fma,+f16c -Clinker=clang -Clink-arg=-fuse-ld=mold'" >> saved_env
+          echo "export X86_64_UNKNOWN_LINUX_MUSL_OPENSSL_INCLUDE_DIR=/usr/include" >> saved_env
+          echo "export X86_64_UNKNOWN_LINUX_MUSL_OPENSSL_LIB_DIR=/usr/lib" >> saved_env
+      - name: Configure aarch64 build
+        if: ${{ matrix.config.arch == 'aarch64' }}
+        run: |
+          source "$HOME/.cargo/env"
+          rustup target add aarch64-unknown-linux-musl --toolchain 1.80.0
+          crt=$(realpath $(dirname $(rustup which rustc))/../lib/rustlib/aarch64-unknown-linux-musl/lib/self-contained)
+          sysroot_lib=/usr/aarch64-unknown-linux-musl/usr/lib
+          apk_url=https://dl-cdn.alpinelinux.org/alpine/latest-stable/main/aarch64/
+          curl -sSf $apk_url > apk_list
+          for pkg in gcc libgcc musl openssl-dev openssl-libs-static; do curl -sSf $apk_url$(cat apk_list | grep -oP '(?<=")'$pkg'-\d.*?(?=")') | tar zxf -; done
+          mkdir -p $sysroot_lib
+          echo 'GROUP ( libgcc_s.so.1 -lgcc )' > $sysroot_lib/libgcc_s.so
+          cp usr/lib/libgcc_s.so.1 $sysroot_lib
+          cp usr/lib/gcc/aarch64-alpine-linux-musl/*/libgcc.a $sysroot_lib
+          cp lib/ld-musl-aarch64.so.1 $sysroot_lib/libc.so
+          echo '!<arch>' > $sysroot_lib/libdl.a
+          (cd $crt && cp crti.o crtbeginS.o crtendS.o crtn.o -t $sysroot_lib)
+          echo "export CARGO_BUILD_TARGET=aarch64-unknown-linux-musl" >> saved_env
+          echo "export RUSTFLAGS='-Ctarget-feature=-crt-static,+neon,+fp16,+fhm,+dotprod -Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=--target=aarch64-unknown-linux-musl -Clink-arg=--sysroot=/usr/aarch64-unknown-linux-musl -Clink-arg=-lc'" >> saved_env
+          echo "export AARCH64_UNKNOWN_LINUX_MUSL_OPENSSL_INCLUDE_DIR=$(realpath usr/include)" >> saved_env
+          echo "export AARCH64_UNKNOWN_LINUX_MUSL_OPENSSL_LIB_DIR=$(realpath usr/lib)" >> saved_env
+      - name: Build Linux Artifacts
+        run: |
+          source ./saved_env
+          bash ci/manylinux_node/build_lancedb.sh ${{ matrix.config.arch }}
+      - name: Upload Linux Artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: nodejs-native-linux-${{ matrix.config.arch }}-musl
+          path: |
+            nodejs/dist/*.node
+
   node-windows:
     name: vectordb ${{ matrix.target }}
     runs-on: windows-2022
@@ -460,7 +568,7 @@ jobs:
 
   release:
     name: vectordb NPM Publish
-    needs: [node, node-macos, node-linux, node-windows, node-windows-arm64]
+    needs: [node, node-macos, node-linux-gnu, node-linux-musl, node-windows]
     runs-on: ubuntu-latest
     # Only runs on tags that matches the make-release action
     if: startsWith(github.ref, 'refs/tags/v')
@@ -500,7 +608,7 @@ jobs:
 
   release-nodejs:
     name: lancedb NPM Publish
-    needs: [nodejs-macos, nodejs-linux, nodejs-windows, nodejs-windows-arm64]
+    needs: [nodejs-macos, nodejs-linux-gnu, nodejs-linux-musl, nodejs-windows]
     runs-on: ubuntu-latest
     # Only runs on tags that matches the make-release action
     if: startsWith(github.ref, 'refs/tags/v')
Cargo.toml (18 lines changed)
@@ -18,18 +18,18 @@ repository = "https://github.com/lancedb/lancedb"
 description = "Serverless, low-latency vector database for AI applications"
 keywords = ["lancedb", "lance", "database", "vector", "search"]
 categories = ["database-implementations"]
 rust-version = "1.80.0" # TODO: lower this once we upgrade Lance again.
 
 [workspace.dependencies]
-lance = { "version" = "=0.19.2", "features" = [
+lance = { "version" = "=0.20.0", "features" = [
   "dynamodb",
-]}
+], git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
-lance-index = "=0.19.2"
+lance-index = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
-lance-linalg = "=0.19.2"
+lance-linalg = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
-lance-table = "=0.19.2"
+lance-table = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
-lance-testing = "=0.19.2"
+lance-testing = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
-lance-datafusion = "=0.19.2"
+lance-datafusion = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
-lance-encoding = "=0.19.2"
+lance-encoding = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
 # Note that this one does not include pyarrow
 arrow = { version = "52.2", optional = false }
 arrow-array = "52.2"
@@ -11,7 +11,8 @@ fi
 export OPENSSL_STATIC=1
 export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
 
-source $HOME/.bashrc
+# Alpine doesn't have .bashrc
+FILE=$HOME/.bashrc && test -f $FILE && source $FILE
 
 cd nodejs
 npm ci
@@ -11,7 +11,8 @@ fi
 export OPENSSL_STATIC=1
 export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
 
-source $HOME/.bashrc
+# Alpine doesn't have .bashrc
+FILE=$HOME/.bashrc && test -f $FILE && source $FILE
 
 cd node
 npm ci
@@ -138,6 +138,7 @@ nav:
       - Jina Reranker: reranking/jina.md
       - OpenAI Reranker: reranking/openai.md
       - AnswerDotAi Rerankers: reranking/answerdotai.md
+      - Voyage AI Rerankers: reranking/voyageai.md
      - Building Custom Rerankers: reranking/custom_reranker.md
       - Example: notebooks/lancedb_reranking.ipynb
   - Filtering: sql.md
@@ -165,6 +166,7 @@ nav:
       - Jina Embeddings: embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md
       - AWS Bedrock Text Embedding Functions: embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md
       - IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md
+      - Voyage AI Embeddings: embeddings/available_embedding_models/text_embedding_functions/voyageai_embedding.md
     - Multimodal Embedding Functions:
       - OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
       - Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
docs/package-lock.json (generated, 21 lines changed)
@@ -19,7 +19,7 @@
     },
     "../node": {
       "name": "vectordb",
-      "version": "0.4.6",
+      "version": "0.12.0",
       "cpu": [
         "x64",
         "arm64"
@@ -31,9 +31,7 @@
         "win32"
       ],
       "dependencies": {
-        "@apache-arrow/ts": "^14.0.2",
         "@neon-rs/load": "^0.0.74",
-        "apache-arrow": "^14.0.2",
         "axios": "^1.4.0"
       },
       "devDependencies": {
@@ -46,6 +44,7 @@
         "@types/temp": "^0.9.1",
         "@types/uuid": "^9.0.3",
         "@typescript-eslint/eslint-plugin": "^5.59.1",
+        "apache-arrow-old": "npm:apache-arrow@13.0.0",
         "cargo-cp-artifact": "^0.1",
         "chai": "^4.3.7",
         "chai-as-promised": "^7.1.1",
@@ -62,15 +61,19 @@
         "ts-node-dev": "^2.0.0",
         "typedoc": "^0.24.7",
         "typedoc-plugin-markdown": "^3.15.3",
-        "typescript": "*",
+        "typescript": "^5.1.0",
         "uuid": "^9.0.0"
       },
       "optionalDependencies": {
-        "@lancedb/vectordb-darwin-arm64": "0.4.6",
-        "@lancedb/vectordb-darwin-x64": "0.4.6",
-        "@lancedb/vectordb-linux-arm64-gnu": "0.4.6",
-        "@lancedb/vectordb-linux-x64-gnu": "0.4.6",
-        "@lancedb/vectordb-win32-x64-msvc": "0.4.6"
+        "@lancedb/vectordb-darwin-arm64": "0.12.0",
+        "@lancedb/vectordb-darwin-x64": "0.12.0",
+        "@lancedb/vectordb-linux-arm64-gnu": "0.12.0",
+        "@lancedb/vectordb-linux-x64-gnu": "0.12.0",
+        "@lancedb/vectordb-win32-x64-msvc": "0.12.0"
+      },
+      "peerDependencies": {
+        "@apache-arrow/ts": "^14.0.2",
+        "apache-arrow": "^14.0.2"
       }
     },
     "../node/node_modules/apache-arrow": {
@@ -277,7 +277,15 @@ Product quantization can lead to approximately `16 * sizeof(float32) / 1 = 64` t
 Higher number of partitions could lead to more efficient I/O during queries and better accuracy, but it takes much more time to train.
 On `SIFT-1M` dataset, our benchmark shows that keeping each partition 1K-4K rows lead to a good latency / recall.
 
-`num_sub_vectors` specifies how many Product Quantization (PQ) short codes to generate on each vector. Because
+`num_sub_vectors` specifies how many Product Quantization (PQ) short codes to generate on each vector. The number should be a factor of the vector dimension. Because
 PQ is a lossy compression of the original vector, a higher `num_sub_vectors` usually results in
-less space distortion, and thus yields better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and
-more PQ computation, and thus, higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency.
+less space distortion, and thus yields better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and more PQ computation, and thus, higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency.
+
+!!! note
+    if `num_sub_vectors` is set to be greater than the vector dimension, you will see errors like `attempt to divide by zero`
+
+### How to choose `m` and `ef_construction` for `IVF_HNSW_*` index?
+
+`m` determines the number of connections a new node establishes with its closest neighbors upon entering the graph. Typically, `m` falls within the range of 5 to 48. Lower `m` values are suitable for low-dimensional data or scenarios where recall is less critical. Conversely, higher `m` values are beneficial for high-dimensional data or when high recall is required. In essence, a larger `m` results in a denser graph with increased connectivity, but at the expense of higher memory consumption.
+
+`ef_construction` balances build speed and accuracy. Higher values increase accuracy but slow down the build process. A typical range is 150 to 300. For good search results, a minimum value of 100 is recommended. In most cases, setting this value above 500 offers no additional benefit. Ensure that `ef_construction` is always set to a value equal to or greater than `ef` in the search phase
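To put the FAQ numbers above into practice, here is a minimal sketch using the `@lancedb/lancedb` TypeScript API; the database path, table name, ~1M-row table, and 768-dimensional `vec` column are illustrative assumptions, not part of this changeset:

```typescript
import * as lancedb from "@lancedb/lancedb";
import { Index } from "@lancedb/lancedb";

const db = await lancedb.connect("data/sample-lancedb");
const tbl = await db.openTable("my_table"); // hypothetical table

// ~1M rows / 256 partitions ≈ 3.9K rows per partition (inside the 1K-4K sweet spot);
// 768 dims / 96 sub-vectors = 8 dims per sub-vector, a multiple of 8 for SIMD.
await tbl.createIndex("vec", {
  config: Index.ivfPq({ numPartitions: 256, numSubVectors: 96 }),
});
```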
@@ -57,6 +57,13 @@ Then the greedy search routine operates as follows:
 
 ## Usage
 
+There are three key parameters to set when constructing an HNSW index:
+
+* `metric`: Use an `L2` euclidean distance metric. We also support `dot` and `cosine` distance.
+* `m`: The number of neighbors to select for each vector in the HNSW graph.
+* `ef_construction`: The number of candidates to evaluate during the construction of the HNSW graph.
+
 We can combine the above concepts to understand how to build and query an HNSW index in LanceDB.
 
 ### Construct index
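The Node bindings in this changeset exercise the same index type (see the new `Index.hnswSq()` test further down); a minimal construction sketch in TypeScript, with the connection path and table name as illustrative assumptions:

```typescript
import * as lancedb from "@lancedb/lancedb";
import { Index } from "@lancedb/lancedb";

const db = await lancedb.connect("data/sample-lancedb");
const tbl = await db.openTable("my_table"); // hypothetical table

// Build an IVF_HNSW_SQ index with the default metric, m, and ef_construction.
await tbl.createIndex("vec", { config: Index.hnswSq() });
```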
@@ -58,8 +58,10 @@ In Python, the index can be created as follows:
     # Make sure you have enough data in the table for an effective training step
     tbl.create_index(metric="L2", num_partitions=256, num_sub_vectors=96)
     ```
+!!! note
+    `num_partitions`=256 and `num_sub_vectors`=96 do not work for every dataset. Those values need to be adjusted for your particular dataset.
 
-The `num_partitions` is usually chosen to target a particular number of vectors per partition. `num_sub_vectors` is typically chosen based on the desired recall and the dimensionality of the vector. See the [FAQs](#faq) below for best practices on choosing these parameters.
+The `num_partitions` is usually chosen to target a particular number of vectors per partition. `num_sub_vectors` is typically chosen based on the desired recall and the dimensionality of the vector. See [here](../ann_indexes.md/#how-to-choose-num_partitions-and-num_sub_vectors-for-ivf_pq-index) for best practices on choosing these parameters.
 
 
 ### Query the index
@@ -20,7 +20,7 @@ Supported parameters (to be passed in `create` method) are:
 
 | Parameter | Type | Default Value | Description |
 |---|---|--------|---------|
-| `name` | `str` | `"voyage-3"` | The model ID of the model to use. Supported base models for Text Embeddings: voyage-3, voyage-3-lite, voyage-finance-2, voyage-multilingual-2, voyage-law-2, voyage-code-2 |
+| `name` | `str` | `None` | The model ID of the model to use. Supported base models for Text Embeddings: voyage-3, voyage-3-lite, voyage-finance-2, voyage-multilingual-2, voyage-law-2, voyage-code-2 |
 | `input_type` | `str` | `None` | Type of the input text. Default to None. Other options: query, document. |
 | `truncation` | `bool` | `True` | Whether to truncate the input texts to fit within the context length. |
@@ -53,6 +53,7 @@ These functions are registered by default to handle text embeddings.
 | [**Jina Embeddings**](available_embedding_models/text_embedding_functions/jina_embedding.md "jina") | 🔗 World-class embedding models to improve your search and RAG systems. You will need **jina api key**. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/jina.png" alt="Jina Icon" width="90" height="35">](available_embedding_models/text_embedding_functions/jina_embedding.md) |
 | [**AWS Bedrock Functions**](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md "bedrock-text") | ☁️ AWS Bedrock supports multiple base models for generating text embeddings. You need to setup the AWS credentials to use this embedding function. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/aws_bedrock.png" alt="AWS Bedrock Icon" width="120" height="35">](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md) |
 | [**IBM Watsonx.ai**](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md "watsonx") | 💡 Generate text embeddings using IBM's watsonx.ai platform. **Note**: watsonx.ai library is an optional dependency. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/watsonx.png" alt="Watsonx Icon" width="140" height="35">](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md) |
+| [**VoyageAI Embeddings**](available_embedding_models/text_embedding_functions/voyageai_embedding.md "voyageai") | 🌕 Voyage AI provides cutting-edge embedding and rerankers. This will help you get started with **VoyageAI** embedding models using LanceDB. Using voyageai API requires voyageai package. Install it via `pip`. | [<img src="https://www.voyageai.com/logo.svg" alt="VoyageAI Icon" width="140" height="35">](available_embedding_models/text_embedding_functions/voyageai_embedding.md) |
 
@@ -66,6 +67,7 @@ These functions are registered by default to handle text embeddings.
 [jina-key]: "jina"
 [aws-key]: "bedrock-text"
 [watsonx-key]: "watsonx"
+[voyageai-key]: "voyageai"
 
 ## Multi-modal Embedding Functions🖼️
@@ -114,12 +114,45 @@ table.create_fts_index("text",
 
 LanceDB full text search supports to filter the search results by a condition, both pre-filtering and post-filtering are supported.
 
-This can be invoked via the familiar `where` syntax:
+This can be invoked via the familiar `where` syntax.
+
+With pre-filtering:
+
 === "Python"
 
     ```python
-    table.search("puppy").limit(10).where("meta='foo'").to_list()
+    table.search("puppy").limit(10).where("meta='foo'", prefilter=True).to_list()
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    await tbl
+      .search("puppy")
+      .select(["id", "doc"])
+      .limit(10)
+      .where("meta='foo'")
+      .prefilter(true)
+      .toArray();
+    ```
+
+=== "Rust"
+
+    ```rust
+    table
+        .query()
+        .full_text_search(FullTextSearchQuery::new("puppy".to_owned()))
+        .select(lancedb::query::Select::Columns(vec!["doc".to_owned()]))
+        .limit(10)
+        .only_if("meta='foo'")
+        .execute()
+        .await?;
+    ```
+
+With post-filtering:
+
+=== "Python"
+
+    ```python
+    table.search("puppy").limit(10).where("meta='foo'", prefilter=False).to_list()
     ```
 
 === "TypeScript"
@@ -130,6 +163,7 @@ This can be invoked via the familiar `where` syntax:
       .select(["id", "doc"])
       .limit(10)
       .where("meta='foo'")
+      .prefilter(false)
       .toArray();
     ```
 
@@ -140,6 +174,7 @@ This can be invoked via the familiar `where` syntax:
         .query()
         .full_text_search(FullTextSearchQuery::new(words[0].to_owned()))
         .select(lancedb::query::Select::Columns(vec!["doc".to_owned()]))
+        .postfilter()
         .limit(10)
         .only_if("meta='foo'")
         .execute()
@@ -160,3 +195,35 @@ To search for a phrase, the index must be created with `with_position=True`:
 table.create_fts_index("text", use_tantivy=False, with_position=True)
 ```
 This will allow you to search for phrases, but it will also significantly increase the index size and indexing time.
+
+## Incremental indexing
+
+LanceDB supports incremental indexing, which means you can add new records to the table without reindexing the entire table.
+
+This can make the query more efficient, especially when the table is large and the new records are relatively small.
+
+=== "Python"
+
+    ```python
+    table.add([{"vector": [3.1, 4.1], "text": "Frodo was a happy puppy"}])
+    table.optimize()
+    ```
+
+=== "TypeScript"
+
+    ```typescript
+    await tbl.add([{ vector: [3.1, 4.1], text: "Frodo was a happy puppy" }]);
+    await tbl.optimize();
+    ```
+
+=== "Rust"
+
+    ```rust
+    let more_data: Box<dyn RecordBatchReader + Send> = create_some_records()?;
+    tbl.add(more_data).execute().await?;
+    tbl.optimize(OptimizeAction::All).execute().await?;
+    ```
+
+!!! note
+    New data added after creating the FTS index will appear in search results while the incremental index is still in progress, but with increased latency due to a flat search on the unindexed portion. LanceDB Cloud automates this merging process, minimizing the impact on search speed.
@@ -153,9 +153,7 @@ table.create_fts_index(["title", "content"], use_tantivy=True, writer_heap_size=
 
 ## Current limitations
 
-1. Currently we do not yet support incremental writes.
-   If you add data after FTS index creation, it won't be reflected
-   in search results until you do a full reindex.
+1. New data added after creating the FTS index will appear in search results, but with increased latency due to a flat search on the unindexed portion. Re-indexing with `create_fts_index` will reduce latency. LanceDB Cloud automates this merging process, minimizing the impact on search speed.
 
 2. We currently only support local filesystem paths for the FTS index.
    This is a tantivy limitation. We've implemented an object store plugin
@@ -274,7 +274,7 @@ table = db.create_table(table_name, schema=Content)
 
 Sometimes your data model may contain nested objects.
 For example, you may want to store the document string
-and the document soure name as a nested Document object:
+and the document source name as a nested Document object:
 
 ```python
 class Document(BaseModel):
@@ -466,7 +466,7 @@ You can create an empty table for scenarios where you want to add data to the ta
 
 ## Adding to a table
 
-After a table has been created, you can always add more data to it usind the `add` method
+After a table has been created, you can always add more data to it using the `add` method
 
 === "Python"
     You can add any of the valid data structures accepted by LanceDB table, i.e, `dict`, `list[dict]`, `pd.DataFrame`, or `Iterator[pa.RecordBatch]`. Below are some examples.
@@ -535,7 +535,7 @@ After a table has been created, you can always add more data to it usind the `ad
     ```
 
 ??? "Ingesting Pydantic models with LanceDB embedding API"
-    When using LanceDB's embedding API, you can add Pydantic models directly to the table. LanceDB will automatically convert the `vector` field to a vector before adding it to the table. You need to specify the default value of `vector` feild as None to allow LanceDB to automatically vectorize the data.
+    When using LanceDB's embedding API, you can add Pydantic models directly to the table. LanceDB will automatically convert the `vector` field to a vector before adding it to the table. You need to specify the default value of `vector` field as None to allow LanceDB to automatically vectorize the data.
 
     ```python
     import lancedb
@@ -880,4 +880,4 @@ There are three possible settings for `read_consistency_interval`:
 
 Learn the best practices on creating an ANN index and getting the most out of it.
 
-[^1]: The `vectordb` package is a legacy package that is deprecated in favor of `@lancedb/lancedb`. The `vectordb` package will continue to receive bug fixes and security updates until September 2024. We recommend all new projects use `@lancedb/lancedb`. See the [migration guide](migration.md) for more information.
+[^1]: The `vectordb` package is a legacy package that is deprecated in favor of `@lancedb/lancedb`. The `vectordb` package will continue to receive bug fixes and security updates until September 2024. We recommend all new projects use `@lancedb/lancedb`. See the [migration guide](../migration.md) for more information.
@@ -6,6 +6,9 @@ This re-ranker uses the [Cohere](https://cohere.ai/) API to rerank the search re
 !!! note
     Supported Query Types: Hybrid, Vector, FTS
 
+```shell
+pip install cohere
+```
 
 ```python
 import numpy
@@ -9,6 +9,7 @@ LanceDB comes with some built-in rerankers. Some of the rerankers that are avail
 | `CrossEncoderReranker` | Uses a cross-encoder model to rerank search results | Vector, FTS, Hybrid |
 | `ColbertReranker` | Uses a colbert model to rerank search results | Vector, FTS, Hybrid |
 | `OpenaiReranker`(Experimental) | Uses OpenAI's chat model to rerank search results | Vector, FTS, Hybrid |
+| `VoyageAIReranker` | Uses voyageai Reranker API to rerank results | Vector, FTS, Hybrid |
 
 ## Using a Reranker
@@ -73,6 +74,7 @@ LanceDB comes with some built-in rerankers. Here are some of the rerankers that
 - [Jina Reranker](./jina.md)
 - [AnswerDotAI Rerankers](./answerdotai.md)
 - [Reciprocal Rank Fusion Reranker](./rrf.md)
+- [VoyageAI Reranker](./voyageai.md)
 
 ## Creating Custom Rerankers
@@ -8,7 +8,7 @@
   <parent>
     <groupId>com.lancedb</groupId>
     <artifactId>lancedb-parent</artifactId>
-    <version>0.13.0-beta.2</version>
+    <version>0.13.1-beta.0</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
@@ -6,7 +6,7 @@
 
   <groupId>com.lancedb</groupId>
   <artifactId>lancedb-parent</artifactId>
-  <version>0.13.0-beta.2</version>
+  <version>0.13.1-beta.0</version>
   <packaging>pom</packaging>
 
   <name>LanceDB Parent</name>
node/package-lock.json (generated, 24 lines changed)
@@ -1,12 +1,12 @@
 {
   "name": "vectordb",
-  "version": "0.13.0-beta.2",
+  "version": "0.13.1-beta.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "vectordb",
-      "version": "0.13.0-beta.2",
+      "version": "0.13.1-beta.0",
       "cpu": [
         "x64",
         "arm64"
@@ -52,12 +52,14 @@
       "uuid": "^9.0.0"
     },
     "optionalDependencies": {
-      "@lancedb/vectordb-darwin-arm64": "0.13.0-beta.2",
-      "@lancedb/vectordb-darwin-x64": "0.13.0-beta.2",
-      "@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.2",
-      "@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.2",
-      "@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.2",
-      "@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.2"
+      "@lancedb/vectordb-darwin-arm64": "0.13.1-beta.0",
+      "@lancedb/vectordb-darwin-x64": "0.13.1-beta.0",
+      "@lancedb/vectordb-linux-arm64-gnu": "0.13.1-beta.0",
+      "@lancedb/vectordb-linux-arm64-musl": "0.13.1-beta.0",
+      "@lancedb/vectordb-linux-x64-gnu": "0.13.1-beta.0",
+      "@lancedb/vectordb-linux-x64-musl": "0.13.1-beta.0",
+      "@lancedb/vectordb-win32-arm64-msvc": "0.13.1-beta.0",
+      "@lancedb/vectordb-win32-x64-msvc": "0.13.1-beta.0"
     },
     "peerDependencies": {
       "@apache-arrow/ts": "^14.0.2",
@@ -1441,9 +1443,9 @@
       "dev": true
     },
     "node_modules/cross-spawn": {
-      "version": "7.0.3",
-      "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz",
-      "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==",
+      "version": "7.0.6",
+      "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
+      "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
       "dev": true,
       "dependencies": {
         "path-key": "^3.1.0",
@@ -1,6 +1,6 @@
 {
   "name": "vectordb",
-  "version": "0.13.0-beta.2",
+  "version": "0.13.1-beta.0",
   "description": " Serverless, low-latency vector database for AI applications",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -84,16 +84,20 @@
       "aarch64-apple-darwin": "@lancedb/vectordb-darwin-arm64",
       "x86_64-unknown-linux-gnu": "@lancedb/vectordb-linux-x64-gnu",
       "aarch64-unknown-linux-gnu": "@lancedb/vectordb-linux-arm64-gnu",
+      "x86_64-unknown-linux-musl": "@lancedb/vectordb-linux-x64-musl",
+      "aarch64-unknown-linux-musl": "@lancedb/vectordb-linux-arm64-musl",
       "x86_64-pc-windows-msvc": "@lancedb/vectordb-win32-x64-msvc",
       "aarch64-pc-windows-msvc": "@lancedb/vectordb-win32-arm64-msvc"
     }
   },
   "optionalDependencies": {
-    "@lancedb/vectordb-darwin-arm64": "0.13.0-beta.2",
-    "@lancedb/vectordb-darwin-x64": "0.13.0-beta.2",
-    "@lancedb/vectordb-linux-arm64-gnu": "0.13.0-beta.2",
-    "@lancedb/vectordb-linux-x64-gnu": "0.13.0-beta.2",
-    "@lancedb/vectordb-win32-x64-msvc": "0.13.0-beta.2",
-    "@lancedb/vectordb-win32-arm64-msvc": "0.13.0-beta.2"
+    "@lancedb/vectordb-darwin-x64": "0.13.1-beta.0",
+    "@lancedb/vectordb-darwin-arm64": "0.13.1-beta.0",
+    "@lancedb/vectordb-linux-x64-gnu": "0.13.1-beta.0",
+    "@lancedb/vectordb-linux-arm64-gnu": "0.13.1-beta.0",
+    "@lancedb/vectordb-linux-x64-musl": "0.13.1-beta.0",
+    "@lancedb/vectordb-linux-arm64-musl": "0.13.1-beta.0",
+    "@lancedb/vectordb-win32-x64-msvc": "0.13.1-beta.0",
+    "@lancedb/vectordb-win32-arm64-msvc": "0.13.1-beta.0"
   }
 }
@@ -1,7 +1,7 @@
 [package]
 name = "lancedb-nodejs"
 edition.workspace = true
-version = "0.13.0-beta.2"
+version = "0.13.1-beta.0"
 license.workspace = true
 description.workspace = true
 repository.workspace = true
@@ -477,6 +477,54 @@ describe("When creating an index", () => {
     expect(rst.numRows).toBe(1);
   });
 
+  it("should create and search IVF_HNSW indices", async () => {
+    await tbl.createIndex("vec", {
+      config: Index.hnswSq(),
+    });
+
+    // check index directory
+    const indexDir = path.join(tmpDir.name, "test.lance", "_indices");
+    expect(fs.readdirSync(indexDir)).toHaveLength(1);
+    const indices = await tbl.listIndices();
+    expect(indices.length).toBe(1);
+    expect(indices[0]).toEqual({
+      name: "vec_idx",
+      indexType: "IvfHnswSq",
+      columns: ["vec"],
+    });
+
+    // Search without specifying the column
+    let rst = await tbl
+      .query()
+      .limit(2)
+      .nearestTo(queryVec)
+      .distanceType("dot")
+      .toArrow();
+    expect(rst.numRows).toBe(2);
+
+    // Search using `vectorSearch`
+    rst = await tbl.vectorSearch(queryVec).limit(2).toArrow();
+    expect(rst.numRows).toBe(2);
+
+    // Search with specifying the column
+    const rst2 = await tbl
+      .query()
+      .limit(2)
+      .nearestTo(queryVec)
+      .column("vec")
+      .toArrow();
+    expect(rst2.numRows).toBe(2);
+    expect(rst.toString()).toEqual(rst2.toString());
+
+    // test offset
+    rst = await tbl.query().limit(2).offset(1).nearestTo(queryVec).toArrow();
+    expect(rst.numRows).toBe(1);
+
+    // test ef
+    rst = await tbl.query().limit(2).nearestTo(queryVec).ef(100).toArrow();
+    expect(rst.numRows).toBe(2);
+  });
+
   it("should be able to query unindexed data", async () => {
     await tbl.createIndex("vec");
     await tbl.add([
@@ -385,6 +385,20 @@ export class VectorQuery extends QueryBase<NativeVectorQuery> {
     return this;
   }
 
+  /**
+   * Set the number of candidates to consider during the search
+   *
+   * This argument is only used when the vector column has an HNSW index.
+   * If there is no index then this value is ignored.
+   *
+   * Increasing this value will increase the recall of your query but will
+   * also increase the latency of your query. The default value is 1.5*limit.
+   */
+  ef(ef: number): VectorQuery {
+    super.doCall((inner) => inner.ef(ef));
+    return this;
+  }
+
   /**
    * Set the vector column to query
    *
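Usage mirrors the `.ef(100)` call in the new test above; a short sketch under the same illustrative assumptions (hypothetical table, 128-dimensional vectors):

```typescript
import * as lancedb from "@lancedb/lancedb";

const db = await lancedb.connect("data/sample-lancedb");
const tbl = await db.openTable("my_table"); // hypothetical table

const queryVec = Array.from({ length: 128 }, () => Math.random()); // illustrative
const rst = await tbl
  .query()
  .nearestTo(queryVec)
  .ef(100) // consider 100 candidates; ignored if the column has no HNSW index
  .limit(10)
  .toArrow();
```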
@@ -87,6 +87,12 @@ export interface OptimizeOptions {
   deleteUnverified: boolean;
 }
 
+export interface Version {
+  version: number;
+  timestamp: Date;
+  metadata: Record<string, string>;
+}
+
 /**
  * A Table is a collection of Records in a LanceDB Database.
  *
@@ -360,6 +366,11 @@ export abstract class Table {
    */
   abstract checkoutLatest(): Promise<void>;
 
+  /**
+   * List all the versions of the table
+   */
+  abstract listVersions(): Promise<Version[]>;
+
   /**
    * Restore the table to the currently checked out version
    *
@@ -659,6 +670,14 @@ export class LocalTable extends Table {
     await this.inner.checkoutLatest();
   }
 
+  async listVersions(): Promise<Version[]> {
+    return (await this.inner.listVersions()).map((version) => ({
+      version: version.version,
+      timestamp: new Date(version.timestamp / 1000),
+      metadata: version.metadata,
+    }));
+  }
+
   async restore(): Promise<void> {
     await this.inner.restore();
   }
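A short sketch of the new versioning API from the consumer side (connection path and table name are illustrative). Note that the native layer reports microsecond timestamps, which `LocalTable.listVersions` converts to millisecond-based JS `Date`s:

```typescript
import * as lancedb from "@lancedb/lancedb";

const db = await lancedb.connect("data/sample-lancedb");
const tbl = await db.openTable("my_table"); // hypothetical table

for (const v of await tbl.listVersions()) {
  // v.version: number, v.timestamp: Date, v.metadata: Record<string, string>
  console.log(v.version, v.timestamp.toISOString(), v.metadata);
}
```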
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-arm64",
-  "version": "0.13.0-beta.2",
+  "version": "0.13.1-beta.0",
   "os": ["darwin"],
   "cpu": ["arm64"],
   "main": "lancedb.darwin-arm64.node",
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-x64",
-  "version": "0.13.0-beta.2",
+  "version": "0.13.1-beta.0",
   "os": ["darwin"],
   "cpu": ["x64"],
   "main": "lancedb.darwin-x64.node",
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-gnu",
-  "version": "0.13.0-beta.2",
+  "version": "0.13.1-beta.0",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-gnu.node",
nodejs/npm/linux-arm64-musl/README.md (new file, 3 lines)
@@ -0,0 +1,3 @@
+# `@lancedb/lancedb-linux-arm64-musl`
+
+This is the **aarch64-unknown-linux-musl** binary for `@lancedb/lancedb`
nodejs/npm/linux-arm64-musl/package.json (new file, 13 lines)
@@ -0,0 +1,13 @@
+{
+  "name": "@lancedb/lancedb-linux-arm64-musl",
+  "version": "0.13.1-beta.0",
+  "os": ["linux"],
+  "cpu": ["arm64"],
+  "main": "lancedb.linux-arm64-musl.node",
+  "files": ["lancedb.linux-arm64-musl.node"],
+  "license": "Apache 2.0",
+  "engines": {
+    "node": ">= 18"
+  },
+  "libc": ["musl"]
+}
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-x64-gnu",
-  "version": "0.13.0-beta.2",
+  "version": "0.13.1-beta.0",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-gnu.node",
nodejs/npm/linux-x64-musl/README.md (new file, 3 lines)
@@ -0,0 +1,3 @@
+# `@lancedb/lancedb-linux-x64-musl`
+
+This is the **x86_64-unknown-linux-musl** binary for `@lancedb/lancedb`
nodejs/npm/linux-x64-musl/package.json (new file, 13 lines)
@@ -0,0 +1,13 @@
+{
+  "name": "@lancedb/lancedb-linux-x64-musl",
+  "version": "0.13.1-beta.0",
+  "os": ["linux"],
+  "cpu": ["x64"],
+  "main": "lancedb.linux-x64-musl.node",
+  "files": ["lancedb.linux-x64-musl.node"],
+  "license": "Apache 2.0",
+  "engines": {
+    "node": ">= 18"
+  },
+  "libc": ["musl"]
+}
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-arm64-msvc",
-  "version": "0.13.0-beta.2",
+  "version": "0.13.1-beta.0",
   "os": [
     "win32"
   ],
@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-x64-msvc",
-  "version": "0.13.0-beta.2",
+  "version": "0.13.1-beta.0",
   "os": ["win32"],
   "cpu": ["x64"],
   "main": "lancedb.win32-x64-msvc.node",
nodejs/package-lock.json (generated, 10 lines changed)
@@ -1,12 +1,12 @@
 {
   "name": "@lancedb/lancedb",
-  "version": "0.13.0-beta.1",
+  "version": "0.13.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@lancedb/lancedb",
-      "version": "0.13.0-beta.1",
+      "version": "0.13.0",
       "cpu": [
         "x64",
         "arm64"
@@ -6052,9 +6052,9 @@
       }
     },
     "node_modules/cross-spawn": {
-      "version": "7.0.3",
-      "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz",
-      "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==",
+      "version": "7.0.6",
+      "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz",
+      "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==",
       "devOptional": true,
       "dependencies": {
         "path-key": "^3.1.0",
@@ -10,7 +10,7 @@
     "vector database",
     "ann"
   ],
-  "version": "0.13.0-beta.2",
+  "version": "0.13.1-beta.0",
  "main": "dist/index.js",
  "exports": {
    ".": "./dist/index.js",
@@ -24,10 +24,12 @@
     "triples": {
       "defaults": false,
       "additional": [
-        "aarch64-apple-darwin",
-        "aarch64-unknown-linux-gnu",
         "x86_64-apple-darwin",
+        "aarch64-apple-darwin",
         "x86_64-unknown-linux-gnu",
+        "aarch64-unknown-linux-gnu",
+        "x86_64-unknown-linux-musl",
+        "aarch64-unknown-linux-musl",
         "x86_64-pc-windows-msvc"
       ]
     }
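Note: the reordered list adds the two `*-unknown-linux-musl` triples, so the napi builds now cover musl-based distributions (e.g. Alpine) alongside the existing gnu, darwin, and msvc targets, matching the new `@lancedb/lancedb-linux-*-musl` npm packages above.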
@@ -167,6 +167,11 @@ impl VectorQuery {
         self.inner = self.inner.clone().nprobes(nprobe as usize);
     }
 
+    #[napi]
+    pub fn ef(&mut self, ef: u32) {
+        self.inner = self.inner.clone().ef(ef as usize);
+    }
+
     #[napi]
     pub fn bypass_vector_index(&mut self) {
         self.inner = self.inner.clone().bypass_vector_index()
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+use std::collections::HashMap;
+
 use arrow_ipc::writer::FileWriter;
 use lancedb::ipc::ipc_file_to_batches;
 use lancedb::table::{
@@ -226,6 +228,28 @@ impl Table {
         self.inner_ref()?.checkout_latest().await.default_error()
     }
 
+    #[napi(catch_unwind)]
+    pub async fn list_versions(&self) -> napi::Result<Vec<Version>> {
+        self.inner_ref()?
+            .list_versions()
+            .await
+            .map(|versions| {
+                versions
+                    .iter()
+                    .map(|version| Version {
+                        version: version.version as i64,
+                        timestamp: version.timestamp.timestamp_micros(),
+                        metadata: version
+                            .metadata
+                            .iter()
+                            .map(|(k, v)| (k.clone(), v.clone()))
+                            .collect(),
+                    })
+                    .collect()
+            })
+            .default_error()
+    }
+
     #[napi(catch_unwind)]
     pub async fn restore(&self) -> napi::Result<()> {
         self.inner_ref()?.restore().await.default_error()
@@ -466,3 +490,10 @@ impl From<lancedb::index::IndexStatistics> for IndexStatistics {
         }
     }
 }
+
+#[napi(object)]
+pub struct Version {
+    pub version: i64,
+    pub timestamp: i64,
+    pub metadata: HashMap<String, String>,
+}
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "0.16.0"
+current_version = "0.16.1-beta.0"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.16.0"
+version = "0.16.1-beta.0"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true
@@ -15,13 +15,19 @@ crate-type = ["cdylib"]
 
 [dependencies]
 arrow = { version = "52.1", features = ["pyarrow"] }
-lancedb = { path = "../rust/lancedb" }
+lancedb = { path = "../rust/lancedb", default-features = false }
 env_logger.workspace = true
-pyo3 = { version = "0.21", features = ["extension-module", "abi3-py38", "gil-refs"] }
+pyo3 = { version = "0.21", features = [
+    "extension-module",
+    "abi3-py39",
+    "gil-refs"
+] }
 # Using this fork for now: https://github.com/awestlake87/pyo3-asyncio/issues/119
 # pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
-pyo3-asyncio-0-21 = { version = "0.21.0", features = ["attributes", "tokio-runtime"] }
+pyo3-asyncio-0-21 = { version = "0.21.0", features = [
+    "attributes",
+    "tokio-runtime"
+] }
 pin-project = "1.1.5"
 futures.workspace = true
 tokio = { version = "1.36.0", features = ["sync"] }
@@ -29,10 +35,14 @@ tokio = { version = "1.36.0", features = ["sync"] }
 [build-dependencies]
 pyo3-build-config = { version = "0.20.3", features = [
     "extension-module",
-    "abi3-py38",
+    "abi3-py39",
 ] }
 
 [features]
-default = ["remote"]
+default = ["default-tls", "remote"]
 fp16kernels = ["lancedb/fp16kernels"]
 remote = ["lancedb/remote"]
+# TLS
+default-tls = ["lancedb/default-tls"]
+native-tls = ["lancedb/native-tls"]
+rustls-tls = ["lancedb/rustls-tls"]
@@ -4,7 +4,7 @@ name = "lancedb"
 dependencies = [
     "deprecation",
     "nest-asyncio~=1.0",
-    "pylance==0.19.2",
+    "pylance==0.20.0b2",
     "tqdm>=4.27.0",
     "pydantic>=1.10",
     "packaging",
@@ -31,7 +31,6 @@ classifiers = [
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
     "Programming Language :: Python :: 3 :: Only",
-    "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
@@ -83,25 +83,33 @@ class OpenAIEmbeddings(TextEmbeddingFunction):
         """
         openai = attempt_import_or_raise("openai")
+
+        valid_texts = []
+        valid_indices = []
+        for idx, text in enumerate(texts):
+            if text:
+                valid_texts.append(text)
+                valid_indices.append(idx)
+
         # TODO retry, rate limit, token limit
         try:
-            if self.name == "text-embedding-ada-002":
-                rs = self._openai_client.embeddings.create(input=texts, model=self.name)
-            else:
-                kwargs = {
-                    "input": texts,
-                    "model": self.name,
-                }
-                if self.dim:
-                    kwargs["dimensions"] = self.dim
-                rs = self._openai_client.embeddings.create(**kwargs)
+            kwargs = {
+                "input": valid_texts,
+                "model": self.name,
+            }
+            if self.name != "text-embedding-ada-002":
+                kwargs["dimensions"] = self.dim
+
+            rs = self._openai_client.embeddings.create(**kwargs)
+            valid_embeddings = {
+                idx: v.embedding for v, idx in zip(rs.data, valid_indices)
+            }
         except openai.BadRequestError:
             logging.exception("Bad request: %s", texts)
             return [None] * len(texts)
         except Exception:
             logging.exception("OpenAI embeddings error")
             raise
-        return [v.embedding for v in rs.data]
+        return [valid_embeddings.get(idx, None) for idx in range(len(texts))]
 
     @cached_property
     def _openai_client(self):
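For context, a minimal sketch of the new behavior (not part of the diff; assumes an `OPENAI_API_KEY` is set and that `generate_embeddings` is the embedding function's batch entry point): empty strings are filtered out before the API call and come back as `None` instead of failing the whole batch.

    from lancedb.embeddings import get_registry

    func = get_registry().get("openai").create(max_retries=0)
    embeddings = func.generate_embeddings(["hello world", ""])
    assert embeddings[0] is not None
    assert embeddings[1] is None  # the empty input maps to None, not an API error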
@@ -1,15 +1,5 @@
-# Copyright 2023 LanceDB Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright The LanceDB Authors
 
 """Pydantic (v1 / v2) adapter for LanceDB"""
 
@@ -30,6 +20,7 @@ from typing import (
     Type,
     Union,
     _GenericAlias,
+    GenericAlias,
 )
 
 import numpy as np
@@ -75,7 +66,7 @@ def vector(dim: int, value_type: pa.DataType = pa.float32()):
 
 
 def Vector(
-    dim: int, value_type: pa.DataType = pa.float32()
+    dim: int, value_type: pa.DataType = pa.float32(), nullable: bool = True
 ) -> Type[FixedSizeListMixin]:
     """Pydantic Vector Type.
 
@@ -88,6 +79,8 @@ def Vector(
         The dimension of the vector.
     value_type : pyarrow.DataType, optional
         The value type of the vector, by default pa.float32()
+    nullable : bool, optional
+        Whether the vector is nullable, by default it is True.
 
     Examples
     --------
@@ -103,7 +96,7 @@ def Vector(
     >>> assert schema == pa.schema([
     ...     pa.field("id", pa.int64(), False),
     ...     pa.field("url", pa.utf8(), False),
-    ...     pa.field("embeddings", pa.list_(pa.float32(), 768), False)
+    ...     pa.field("embeddings", pa.list_(pa.float32(), 768))
     ... ])
     """
 
@@ -112,6 +105,10 @@ def Vector(
         def __repr__(self):
             return f"FixedSizeList(dim={dim})"
 
+        @staticmethod
+        def nullable() -> bool:
+            return nullable
+
         @staticmethod
         def dim() -> int:
             return dim
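A short usage sketch for the new `nullable` flag (illustrative model name; `pydantic_to_schema` is exercised the same way in the tests further down):

    import pydantic
    from lancedb.pydantic import Vector, pydantic_to_schema

    class Document(pydantic.BaseModel):  # hypothetical model
        vec: Vector(16, nullable=False)

    schema = pydantic_to_schema(Document)
    assert schema.field("vec").nullable is False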
@@ -205,9 +202,7 @@ else:
 def _pydantic_to_arrow_type(field: FieldInfo) -> pa.DataType:
     """Convert a Pydantic FieldInfo to Arrow DataType"""
 
-    if isinstance(field.annotation, _GenericAlias) or (
-        sys.version_info > (3, 9) and isinstance(field.annotation, types.GenericAlias)
-    ):
+    if isinstance(field.annotation, (_GenericAlias, GenericAlias)):
         origin = field.annotation.__origin__
         args = field.annotation.__args__
         if origin is list:
@@ -235,7 +230,7 @@ def _pydantic_to_arrow_type(field: FieldInfo) -> pa.DataType:
 
 def is_nullable(field: FieldInfo) -> bool:
     """Check if a Pydantic FieldInfo is nullable."""
-    if isinstance(field.annotation, _GenericAlias):
+    if isinstance(field.annotation, (_GenericAlias, GenericAlias)):
         origin = field.annotation.__origin__
         args = field.annotation.__args__
         if origin == Union:
@@ -246,6 +241,10 @@ def is_nullable(field: FieldInfo) -> bool:
         for typ in args:
             if typ is type(None):
                 return True
+    elif inspect.isclass(field.annotation) and issubclass(
+        field.annotation, FixedSizeListMixin
+    ):
+        return field.annotation.nullable()
     return False
 
 
@@ -131,6 +131,8 @@ class Query(pydantic.BaseModel):
 
     fast_search: bool = False
 
+    ef: Optional[int] = None
+
 
 class LanceQueryBuilder(ABC):
     """An abstract query builder. Subclasses are defined for vector search,
@@ -257,6 +259,7 @@ class LanceQueryBuilder(ABC):
         self._with_row_id = False
         self._vector = None
         self._text = None
+        self._ef = None
 
     @deprecation.deprecated(
         deprecated_in="0.3.1",
@@ -367,11 +370,13 @@ class LanceQueryBuilder(ABC):
         ----------
         limit: int
             The maximum number of results to return.
-            By default the query is limited to the first 10.
-            Call this method and pass 0, a negative value,
-            or None to remove the limit.
-            *WARNING* if you have a large dataset, removing
-            the limit can potentially result in reading a
+            The default query limit is 10 results.
+            For ANN/KNN queries, you must specify a limit.
+            Entering 0, a negative number, or None will reset
+            the limit to the default value of 10.
+            *WARNING* if you have a large dataset, setting
+            the limit to a large number, e.g. the table size,
+            can potentially result in reading a
             large amount of data into memory and cause
             out of memory issues.
 
@@ -638,6 +643,28 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
         self._nprobes = nprobes
         return self
 
+    def ef(self, ef: int) -> LanceVectorQueryBuilder:
+        """Set the number of candidates to consider during search.
+
+        Higher values will yield better recall (more likely to find vectors if
+        they exist) at the expense of latency.
+
+        This only applies to the HNSW-related index.
+        The default value is 1.5 * limit.
+
+        Parameters
+        ----------
+        ef: int
+            The number of candidates to consider during search.
+
+        Returns
+        -------
+        LanceVectorQueryBuilder
+            The LanceQueryBuilder object.
+        """
+        self._ef = ef
+        return self
+
     def refine_factor(self, refine_factor: int) -> LanceVectorQueryBuilder:
         """Set the refine factor to use, increasing the number of vectors sampled.
 
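A hedged usage sketch for the new `ef` knob on the sync builder (assumes a table `tbl` whose vector column has an HNSW index):

    results = (
        tbl.search([0.1, 0.2, 0.3])
        .ef(64)  # consider more candidates: better recall, higher latency
        .limit(10)
        .to_pandas()
    )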
@@ -700,6 +727,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
             with_row_id=self._with_row_id,
             offset=self._offset,
             fast_search=self._fast_search,
+            ef=self._ef,
         )
         result_set = self._table._execute_query(query, batch_size)
         if self._reranker is not None:
@@ -1071,6 +1099,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
         self._vector_query.nprobes(self._nprobes)
         if self._refine_factor:
             self._vector_query.refine_factor(self._refine_factor)
+        if self._ef:
+            self._vector_query.ef(self._ef)
 
         with ThreadPoolExecutor() as executor:
             fts_future = executor.submit(self._fts_query.with_row_id(True).to_arrow)
@@ -1197,6 +1227,29 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
         self._nprobes = nprobes
         return self
 
+    def ef(self, ef: int) -> LanceHybridQueryBuilder:
+        """
+        Set the number of candidates to consider during search.
+
+        Higher values will yield better recall (more likely to find vectors if
+        they exist) at the expense of latency.
+
+        This only applies to the HNSW-related index.
+        The default value is 1.5 * limit.
+
+        Parameters
+        ----------
+        ef: int
+            The number of candidates to consider during search.
+
+        Returns
+        -------
+        LanceHybridQueryBuilder
+            The LanceHybridQueryBuilder object.
+        """
+        self._ef = ef
+        return self
+
     def metric(self, metric: Literal["L2", "cosine", "dot"]) -> LanceHybridQueryBuilder:
         """Set the distance metric to use.
 
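The hybrid builder forwards the same knob to its inner vector query, so a sketch like the following should apply (assumes `tbl` has both an FTS index and an HNSW vector index):

    results = (
        tbl.search("puppy", query_type="hybrid")
        .ef(64)
        .limit(10)
        .to_pandas()
    )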
@@ -1495,7 +1548,8 @@ class AsyncQuery(AsyncQueryBase):
             return pa.array(vec)
 
     def nearest_to(
-        self, query_vector: Optional[Union[VEC, Tuple, List[VEC]]] = None
+        self,
+        query_vector: Union[VEC, Tuple, List[VEC]],
     ) -> AsyncVectorQuery:
         """
         Find the nearest vectors to the given query vector.
@@ -1542,6 +1596,9 @@ class AsyncQuery(AsyncQueryBase):
         will be added to the results. This column will contain the index of the
         query vector that the result is nearest to.
         """
+        if query_vector is None:
+            raise ValueError("query_vector can not be None")
+
         if (
             isinstance(query_vector, list)
             and len(query_vector) > 0
@@ -1618,7 +1675,7 @@ class AsyncVectorQuery(AsyncQueryBase):
         """
         Set the number of partitions to search (probe)
 
-        This argument is only used when the vector column has an IVF PQ index.
+        This argument is only used when the vector column has an IVF-based index.
         If there is no index then this value is ignored.
 
         The IVF stage of IVF PQ divides the input into partitions (clusters) of
@@ -1640,6 +1697,21 @@ class AsyncVectorQuery(AsyncQueryBase):
         self._inner.nprobes(nprobes)
         return self
 
+    def ef(self, ef: int) -> AsyncVectorQuery:
+        """
+        Set the number of candidates to consider during search
+
+        This argument is only used when the vector column has an HNSW index.
+        If there is no index then this value is ignored.
+
+        Increasing this value will increase the recall of your query but will also
+        increase the latency of your query. The default value is 1.5 * limit. This
+        default is good for many cases but the best value to use will depend on your
+        data and the recall that you need to achieve.
+        """
+        self._inner.ef(ef)
+        return self
+
     def refine_factor(self, refine_factor: int) -> AsyncVectorQuery:
         """
         A multiplier to control how many additional rows are taken during the refine
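Async counterpart, as a sketch (assumes `table` is an `AsyncTable` whose vector column has an HNSW index):

    results = await (
        table.query()
        .nearest_to([0.1, 0.2, 0.3])
        .ef(64)
        .limit(10)
        .to_arrow()
    )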
@@ -78,6 +78,10 @@ class RemoteTable(Table):
             self.schema.metadata
         )
 
+    def list_versions(self):
+        """List all versions of the table"""
+        return self._loop.run_until_complete(self._table.list_versions())
+
     def to_arrow(self) -> pa.Table:
         """to_arrow() is not yet supported on LanceDB cloud."""
         raise NotImplementedError("to_arrow() is not yet supported on LanceDB cloud.")
@@ -86,6 +90,12 @@ class RemoteTable(Table):
         """to_pandas() is not yet supported on LanceDB cloud."""
         return NotImplementedError("to_pandas() is not yet supported on LanceDB cloud.")
 
+    def checkout(self, version):
+        return self._loop.run_until_complete(self._table.checkout(version))
+
+    def checkout_latest(self):
+        return self._loop.run_until_complete(self._table.checkout_latest())
+
     def list_indices(self):
         """List all the indices on the table"""
         return self._loop.run_until_complete(self._table.list_indices())
@@ -41,7 +41,7 @@ class CohereReranker(Reranker):
 
     def __init__(
         self,
-        model_name: str = "rerank-english-v2.0",
+        model_name: str = "rerank-english-v3.0",
         column: str = "text",
         top_n: Union[int, None] = None,
         return_score="relevance",
@@ -8,7 +8,7 @@ import inspect
 import time
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from datetime import timedelta
+from datetime import datetime, timedelta
 from functools import cached_property
 from typing import (
     TYPE_CHECKING,
@@ -1012,6 +1012,39 @@ class Table(ABC):
             The names of the columns to drop.
         """
 
+    @abstractmethod
+    def checkout(self):
+        """
+        Checks out a specific version of the Table
+
+        Any read operation on the table will now access the data at the checked out
+        version. As a consequence, calling this method will disable any read consistency
+        interval that was previously set.
+
+        This is a read-only operation that turns the table into a sort of "view"
+        or "detached head". Other table instances will not be affected. To make the
+        change permanent you can use the `[Self::restore]` method.
+
+        Any operation that modifies the table will fail while the table is in a checked
+        out state.
+
+        To return the table to a normal state use `[Self::checkout_latest]`
+        """
+
+    @abstractmethod
+    def checkout_latest(self):
+        """
+        Ensures the table is pointing at the latest version
+
+        This can be used to manually update a table when the read_consistency_interval
+        is None
+        It can also be used to undo a `[Self::checkout]` operation
+        """
+
+    @abstractmethod
+    def list_versions(self):
+        """List all versions of the table"""
+
     @cached_property
     def _dataset_uri(self) -> str:
         return _table_uri(self._conn.uri, self.name)
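Taken together, the new abstract methods support a time-travel workflow along these lines (sketch only; assumes `tbl` already has more than one version and that version entries are dicts as in the async implementation below):

    versions = tbl.list_versions()
    tbl.checkout(versions[0]["version"])  # read-only "detached head" at an old version
    print(tbl.count_rows())
    tbl.restore()          # optional: make the checked-out version the new latest
    tbl.checkout_latest()  # or just return to the normal, latest-version state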
@@ -1959,6 +1992,7 @@ class LanceTable(Table):
             "metric": query.metric,
             "nprobes": query.nprobes,
             "refine_factor": query.refine_factor,
+            "ef": query.ef,
         }
         return ds.scanner(
             columns=query.columns,
@@ -2697,7 +2731,7 @@ class AsyncTable:
 
     def vector_search(
         self,
-        query_vector: Optional[Union[VEC, Tuple]] = None,
+        query_vector: Union[VEC, Tuple],
     ) -> AsyncVectorQuery:
         """
         Search the table with a given query vector.
@@ -2736,6 +2770,8 @@ class AsyncTable:
             async_query = async_query.refine_factor(query.refine_factor)
         if query.vector_column:
             async_query = async_query.column(query.vector_column)
+        if query.ef:
+            async_query = async_query.ef(query.ef)
 
         if not query.prefilter:
             async_query = async_query.postfilter()
@@ -2899,6 +2935,19 @@ class AsyncTable:
         """
         return await self._inner.version()
 
+    async def list_versions(self):
+        """
+        List all versions of the table
+        """
+        versions = await self._inner.list_versions()
+        for v in versions:
+            ts_nanos = v["timestamp"]
+            v["timestamp"] = datetime.fromtimestamp(ts_nanos // 1e9) + timedelta(
+                microseconds=(ts_nanos % 1e9) // 1e3
+            )
+
+        return versions
+
     async def checkout(self, version):
         """
         Checks out a specific version of the Table
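Sketch of the new async API (each entry carries a version number, a `datetime` timestamp reconstructed from the binding's nanosecond value, and a metadata dict):

    versions = await table.list_versions()
    for v in versions:
        print(v["version"], v["timestamp"], v["metadata"])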
@@ -11,6 +11,8 @@ from datetime import date, datetime
 from functools import singledispatch
 from typing import Tuple, Union, Optional, Any
 from urllib.parse import urlparse
+from threading import Lock
+from contextlib import contextmanager
 
 import numpy as np
 import pyarrow as pa
@@ -314,3 +316,27 @@ def deprecated(func):
 def validate_table_name(name: str):
     """Verify the table name is valid."""
     native_validate_table_name(name)
+
+
+class ConnectionPool:
+    def __init__(self, connection_factory, *, max_size: Optional[int] = None):
+        self.max_size = max_size
+        self._connection_factory = connection_factory
+        self._pool = []
+        self._lock = Lock()
+
+    @contextmanager
+    def connection(self):
+        with self._lock:
+            if self._pool:
+                conn = self._pool.pop()
+            else:
+                conn = self._connection_factory()
+
+        # release the lock before yielding
+        try:
+            yield conn
+        finally:
+            with self._lock:
+                if self.max_size is None or len(self._pool) < self.max_size:
+                    self._pool.append(conn)
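Usage sketch for the new pool (hypothetical connect target; the pool hands out an idle connection when one exists, creates one via the factory otherwise, and returns it to the pool on exit):

    import lancedb
    from lancedb.util import ConnectionPool

    pool = ConnectionPool(
        lambda: lancedb.connect("db://dev", api_key="..."),  # hypothetical factory
        max_size=8,
    )
    with pool.connection() as conn:
        tbl = conn.open_table("test")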
@@ -90,10 +90,13 @@ def test_embedding_with_bad_results(tmp_path):
         self, texts: Union[List[str], np.ndarray]
     ) -> list[Union[np.array, None]]:
         # Return None, which is bad if field is non-nullable
-        return [
-            None if i % 2 == 0 else np.random.randn(self.ndims())
+        a = [
+            np.full(self.ndims(), np.nan)
+            if i % 2 == 0
+            else np.random.randn(self.ndims())
             for i in range(len(texts))
         ]
+        return a
 
     db = lancedb.connect(tmp_path)
     registry = EmbeddingFunctionRegistry.get_instance()
@@ -1,15 +1,6 @@
-# Copyright (c) 2023. LanceDB Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright The LanceDB Authors
 import importlib
 import io
 import os
@@ -17,6 +8,7 @@ import os
 import lancedb
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 import pytest
 from lancedb.embeddings import get_registry
 from lancedb.pydantic import LanceModel, Vector
@@ -444,6 +436,30 @@ def test_watsonx_embedding(tmp_path):
     assert tbl.search("hello").limit(1).to_pandas()["text"][0] == "hello world"
 
 
+@pytest.mark.slow
+@pytest.mark.skipif(
+    os.environ.get("OPENAI_API_KEY") is None, reason="OPENAI_API_KEY not set"
+)
+def test_openai_with_empty_strs(tmp_path):
+    model = get_registry().get("openai").create(max_retries=0)
+
+    class TextModel(LanceModel):
+        text: str = model.SourceField()
+        vector: Vector(model.ndims()) = model.VectorField()
+
+    df = pd.DataFrame({"text": ["hello world", ""]})
+    db = lancedb.connect(tmp_path)
+    tbl = db.create_table("test", schema=TextModel, mode="overwrite")
+
+    tbl.add(df, on_bad_vectors="skip")
+    tb = tbl.to_arrow()
+    assert tb.schema.field_by_name("vector").type == pa.list_(
+        pa.float32(), model.ndims()
+    )
+    assert len(tb) == 2
+    assert tb["vector"].is_null().to_pylist() == [False, True]
+
+
 @pytest.mark.slow
 @pytest.mark.skipif(
     importlib.util.find_spec("ollama") is None, reason="Ollama not installed"
@@ -1,16 +1,5 @@
-# Copyright 2023 LanceDB Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright The LanceDB Authors
 
 import json
 import sys
@@ -172,6 +161,26 @@ def test_pydantic_to_arrow_py38():
     assert schema == expect_schema
 
 
+def test_nullable_vector():
+    class NullableModel(pydantic.BaseModel):
+        vec: Vector(16, nullable=False)
+
+    schema = pydantic_to_schema(NullableModel)
+    assert schema == pa.schema([pa.field("vec", pa.list_(pa.float32(), 16), False)])
+
+    class DefaultModel(pydantic.BaseModel):
+        vec: Vector(16)
+
+    schema = pydantic_to_schema(DefaultModel)
+    assert schema == pa.schema([pa.field("vec", pa.list_(pa.float32(), 16), True)])
+
+    class NotNullableModel(pydantic.BaseModel):
+        vec: Vector(16)
+
+    schema = pydantic_to_schema(NotNullableModel)
+    assert schema == pa.schema([pa.field("vec", pa.list_(pa.float32(), 16), True)])
+
+
 def test_fixed_size_list_field():
     class TestModel(pydantic.BaseModel):
         vec: Vector(16)
@@ -192,7 +201,7 @@ def test_fixed_size_list_field():
     schema = pydantic_to_schema(TestModel)
     assert schema == pa.schema(
         [
-            pa.field("vec", pa.list_(pa.float32(), 16), False),
+            pa.field("vec", pa.list_(pa.float32(), 16)),
             pa.field("li", pa.list_(pa.int64()), False),
         ]
     )
@@ -1,21 +1,9 @@
-# Copyright 2023 LanceDB Developers
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright The LanceDB Authors
 
 import unittest.mock as mock
 from datetime import timedelta
-from typing import Optional
 
-import lance
 import lancedb
 from lancedb.index import IvfPq
 import numpy as np
@@ -23,41 +11,15 @@ import pandas.testing as tm
 import pyarrow as pa
 import pytest
 import pytest_asyncio
-from lancedb.db import LanceDBConnection
 from lancedb.pydantic import LanceModel, Vector
 from lancedb.query import AsyncQueryBase, LanceVectorQueryBuilder, Query
 from lancedb.table import AsyncTable, LanceTable
 
 
-class MockTable:
-    def __init__(self, tmp_path):
-        self.uri = tmp_path
-        self._conn = LanceDBConnection(self.uri)
-
-    def to_lance(self):
-        return lance.dataset(self.uri)
-
-    def _execute_query(self, query, batch_size: Optional[int] = None):
-        ds = self.to_lance()
-        return ds.scanner(
-            columns=query.columns,
-            filter=query.filter,
-            prefilter=query.prefilter,
-            nearest={
-                "column": query.vector_column,
-                "q": query.vector,
-                "k": query.k,
-                "metric": query.metric,
-                "nprobes": query.nprobes,
-                "refine_factor": query.refine_factor,
-            },
-            batch_size=batch_size,
-            offset=query.offset,
-        ).to_reader()
-
-
-@pytest.fixture
-def table(tmp_path) -> MockTable:
+@pytest.fixture(scope="module")
+def table(tmpdir_factory) -> lancedb.table.Table:
+    tmp_path = str(tmpdir_factory.mktemp("data"))
+    db = lancedb.connect(tmp_path)
     df = pa.table(
         {
             "vector": pa.array(
@@ -68,8 +30,7 @@ def table(tmp_path) -> MockTable:
             "float_field": pa.array([1.0, 2.0]),
         }
     )
-    lance.write_dataset(df, tmp_path)
-    return MockTable(tmp_path)
+    return db.create_table("test", df)
 
 
 @pytest_asyncio.fixture
@@ -126,6 +87,12 @@ def test_query_builder(table):
     assert all(np.array(rs[0]["vector"]) == [1, 2])
 
 
+def test_with_row_id(table: lancedb.table.Table):
+    rs = table.search().with_row_id(True).to_arrow()
+    assert "_rowid" in rs.column_names
+    assert rs["_rowid"].to_pylist() == [0, 1]
+
+
 def test_vector_query_with_no_limit(table):
     with pytest.raises(ValueError):
         LanceVectorQueryBuilder(table, [0, 0], "vector").limit(0).select(
@@ -365,6 +332,12 @@ async def test_query_to_pandas_async(table_async: AsyncTable):
     assert df.shape == (0, 4)
 
 
+@pytest.mark.asyncio
+async def test_none_query(table_async: AsyncTable):
+    with pytest.raises(ValueError):
+        await table_async.query().nearest_to(None).to_arrow()
+
+
 @pytest.mark.asyncio
 async def test_fast_search_async(tmp_path):
     db = await lancedb.connect_async(tmp_path)
@@ -6,13 +6,16 @@ from datetime import timedelta
 import http.server
 import json
 import threading
+from concurrent.futures import ThreadPoolExecutor
 from unittest.mock import MagicMock
 import uuid
 
 import lancedb
 from lancedb.conftest import MockTextEmbeddingFunction
 from lancedb.remote import ClientConfig
+from lancedb.util import ConnectionPool
 from lancedb.remote.errors import HttpError, RetryError
+import lancedb.util
 import pytest
 import pyarrow as pa
 
@@ -55,6 +58,34 @@ def mock_lancedb_connection(handler):
         handle.join()
 
 
+@contextlib.contextmanager
+def mock_lancedb_connection_pool(handler):
+    with http.server.HTTPServer(
+        ("localhost", 8080), make_mock_http_handler(handler)
+    ) as server:
+        handle = threading.Thread(target=server.serve_forever)
+        handle.start()
+
+        def conn_factory():
+            lancedb.connect(
+                "db://dev",
+                api_key="fake",
+                host_override="http://localhost:8080",
+                client_config={
+                    "retry_config": {"retries": 2},
+                    "timeout_config": {
+                        "connect_timeout": 1,
+                    },
+                },
+            )
+
+        try:
+            yield ConnectionPool(conn_factory)
+        finally:
+            server.shutdown()
+            handle.join()
+
+
 @contextlib.asynccontextmanager
 async def mock_lancedb_connection_async(handler):
     with http.server.HTTPServer(
@@ -103,6 +134,47 @@ async def test_async_remote_db():
         assert table_names == []
 
 
+@pytest.mark.asyncio
+async def test_async_checkout():
+    def handler(request):
+        if request.path == "/v1/table/test/describe/":
+            request.send_response(200)
+            request.send_header("Content-Type", "application/json")
+            request.end_headers()
+            response = json.dumps({"version": 42, "schema": {"fields": []}})
+            request.wfile.write(response.encode())
+            return
+
+        content_len = int(request.headers.get("Content-Length"))
+        body = request.rfile.read(content_len)
+        body = json.loads(body)
+
+        print("body is", body)
+
+        count = 0
+        if body["version"] == 1:
+            count = 100
+        elif body["version"] == 2:
+            count = 200
+        elif body["version"] is None:
+            count = 300
+
+        request.send_response(200)
+        request.send_header("Content-Type", "application/json")
+        request.end_headers()
+        request.wfile.write(json.dumps(count).encode())
+
+    async with mock_lancedb_connection_async(handler) as db:
+        table = await db.open_table("test")
+        assert await table.count_rows() == 300
+        await table.checkout(1)
+        assert await table.count_rows() == 100
+        await table.checkout(2)
+        assert await table.count_rows() == 200
+        await table.checkout_latest()
+        assert await table.count_rows() == 300
+
+
 @pytest.mark.asyncio
 async def test_http_error():
     request_id_holder = {"request_id": None}
@@ -146,8 +218,7 @@ async def test_retry_error():
     assert cause.status_code == 429
 
 
-@contextlib.contextmanager
-def query_test_table(query_handler):
+def http_handler(query_handler):
     def handler(request):
         if request.path == "/v1/table/test/describe/":
             request.send_response(200)
@@ -171,7 +242,12 @@ def query_test_table(query_handler):
             request.send_response(404)
             request.end_headers()
 
-    with mock_lancedb_connection(handler) as db:
+    return handler
+
+
+@contextlib.contextmanager
+def query_test_table(connection_ctx_mgr):
+    with connection_ctx_mgr as db:
         assert repr(db) == "RemoteConnect(name=dev)"
         table = db.open_table("test")
         assert repr(table) == "RemoteTable(dev.test)"
@@ -179,36 +255,85 @@ def query_test_table(query_handler):
 
 
 def test_query_sync_minimal():
+    @http_handler
     def handler(body):
         assert body == {
             "distance_type": "l2",
             "k": 10,
             "prefilter": False,
             "refine_factor": None,
+            "ef": None,
             "vector": [1.0, 2.0, 3.0],
             "nprobes": 20,
+            "version": None,
         }
 
         return pa.table({"id": [1, 2, 3]})
 
-    with query_test_table(handler) as table:
+    with query_test_table(mock_lancedb_connection(handler)) as table:
+        data = table.search([1, 2, 3]).to_list()
+        expected = [{"id": 1}, {"id": 2}, {"id": 3}]
+        assert data == expected
+
+    with query_test_table(mock_lancedb_connection_pool(handler).connection()) as table:
         data = table.search([1, 2, 3]).to_list()
         expected = [{"id": 1}, {"id": 2}, {"id": 3}]
         assert data == expected
+
+
+def test_query_sync_minimal_threaded():
+    num_query = 0
+
+    @http_handler
+    def handler(body):
+        assert body == {
+            "distance_type": "l2",
+            "k": 10,
+            "prefilter": False,
+            "refine_factor": None,
+            "ef": None,
+            "vector": [1.0, 2.0, 3.0],
+            "nprobes": 20,
+            "version": None,
+        }
+        nonlocal num_query
+        num_query += 1
+
+        return pa.table({"id": [1, 2, 3]})
+
+    pool = mock_lancedb_connection_pool(handler)
+
+    def _query(i):
+        with query_test_table(pool.connection()) as table:
+            data = table.search([1, 2, 3]).to_list()
+            expected = [{"id": 1}, {"id": 2}, {"id": 3}]
+            assert data == expected
+
+    with ThreadPoolExecutor as exec:
+        exec.map(_query, range(1000))
+
+    assert num_query == 1000
+
+
 def test_query_sync_empty_query():
+    @http_handler
     def handler(body):
         assert body == {
             "k": 10,
             "filter": "true",
             "vector": [],
             "columns": ["id"],
+            "version": None,
         }
 
         return pa.table({"id": [1, 2, 3]})
 
-    with query_test_table(handler) as table:
+    with query_test_table(mock_lancedb_connection(handler)) as table:
+        data = table.search(None).where("true").select(["id"]).limit(10).to_list()
+        expected = [{"id": 1}, {"id": 2}, {"id": 3}]
+        assert data == expected
+
+    with query_test_table(mock_lancedb_connection_pool(handler).connection()) as table:
         data = table.search(None).where("true").select(["id"]).limit(10).to_list()
         expected = [{"id": 1}, {"id": 2}, {"id": 3}]
         assert data == expected
@@ -223,11 +348,13 @@ def test_query_sync_maximal():
             "refine_factor": 10,
             "vector": [1.0, 2.0, 3.0],
             "nprobes": 5,
+            "ef": None,
             "filter": "id > 0",
             "columns": ["id", "name"],
             "vector_column": "vector2",
             "fast_search": True,
             "with_row_id": True,
+            "version": None,
         }
 
         return pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})
@@ -266,6 +393,7 @@ def test_query_sync_fts():
             },
             "k": 10,
             "vector": [],
+            "version": None,
         }
 
         return pa.table({"id": [1, 2, 3]})
@@ -282,6 +410,7 @@ def test_query_sync_fts():
             "k": 42,
             "vector": [],
             "with_row_id": True,
+            "version": None,
         }
 
         return pa.table({"id": [1, 2, 3]})
@@ -307,6 +436,7 @@ def test_query_sync_hybrid():
             "k": 42,
             "vector": [],
             "with_row_id": True,
+            "version": None,
         }
         return pa.table({"_rowid": [1, 2, 3], "_score": [0.1, 0.2, 0.3]})
     else:
@@ -318,7 +448,9 @@ def test_query_sync_hybrid():
             "refine_factor": None,
             "vector": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
             "nprobes": 20,
+            "ef": None,
             "with_row_id": True,
+            "version": None,
         }
         return pa.table({"_rowid": [1, 2, 3], "_distance": [0.1, 0.2, 0.3]})
 
@@ -195,6 +195,10 @@ impl VectorQuery {
         self.inner = self.inner.clone().nprobes(nprobe as usize);
     }
 
+    pub fn ef(&mut self, ef: u32) {
+        self.inner = self.inner.clone().ef(ef as usize);
+    }
+
     pub fn bypass_vector_index(&mut self) {
         self.inner = self.inner.clone().bypass_vector_index()
     }
@@ -8,7 +8,7 @@ use lancedb::table::{
 use pyo3::{
     exceptions::{PyRuntimeError, PyValueError},
     pyclass, pymethods,
-    types::{PyDict, PyDictMethods, PyString},
+    types::{IntoPyDict, PyDict, PyDictMethods, PyString},
     Bound, FromPyObject, PyAny, PyRef, PyResult, Python, ToPyObject,
 };
 use pyo3_asyncio_0_21::tokio::future_into_py;
@@ -246,6 +246,33 @@ impl Table {
         )
     }
 
+    pub fn list_versions(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
+        let inner = self_.inner_ref()?.clone();
+        future_into_py(self_.py(), async move {
+            let versions = inner.list_versions().await.infer_error()?;
+            let versions_as_dict = Python::with_gil(|py| {
+                versions
+                    .iter()
+                    .map(|v| {
+                        let dict = PyDict::new_bound(py);
+                        dict.set_item("version", v.version).unwrap();
+                        dict.set_item(
+                            "timestamp",
+                            v.timestamp.timestamp_nanos_opt().unwrap_or_default(),
+                        )
+                        .unwrap();
+
+                        let tup: Vec<(&String, &String)> = v.metadata.iter().collect();
+                        dict.set_item("metadata", tup.into_py_dict(py)).unwrap();
+                        dict.to_object(py)
+                    })
+                    .collect::<Vec<_>>()
+            });
+
+            Ok(versions_as_dict)
+        })
+    }
+
     pub fn checkout(self_: PyRef<'_, Self>, version: u64) -> PyResult<Bound<'_, PyAny>> {
         let inner = self_.inner_ref()?.clone();
         future_into_py(self_.py(), async move {
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-node"
-version = "0.13.0-beta.2"
+version = "0.13.1-beta.0"
 description = "Serverless, low-latency vector database for AI applications"
 license.workspace = true
 edition.workspace = true
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb"
-version = "0.13.0-beta.2"
+version = "0.13.1-beta.0"
 edition.workspace = true
 description = "LanceDB: A serverless, low-latency vector database for AI applications"
 license.workspace = true
@@ -48,9 +48,16 @@ async-openai = { version = "0.20.0", optional = true }
 serde_with = { version = "3.8.1" }
 aws-sdk-bedrockruntime = { version = "1.27.0", optional = true }
 # For remote feature
-reqwest = { version = "0.12.0", features = ["gzip", "json", "stream"], optional = true }
-rand = { version = "0.8.3", features = ["small_rng"], optional = true}
-http = { version = "1", optional = true } # Matching what is in reqwest
+reqwest = { version = "0.12.0", default-features = false, features = [
+    "charset",
+    "gzip",
+    "http2",
+    "json",
+    "macos-system-configuration",
+    "stream",
+], optional = true }
+rand = { version = "0.8.3", features = ["small_rng"], optional = true }
+http = { version = "1", optional = true } # Matching what is in reqwest
 uuid = { version = "1.7.0", features = ["v4"], optional = true }
 polars-arrow = { version = ">=0.37,<0.40.0", optional = true }
 polars = { version = ">=0.37,<0.40.0", optional = true }
@@ -75,7 +82,7 @@ http-body = "1" # Matching reqwest


 [features]
-default = []
+default = ["default-tls"]
 remote = ["dep:reqwest", "dep:http", "dep:rand", "dep:uuid"]
 fp16kernels = ["lance-linalg/fp16kernels"]
 s3-test = []
@@ -90,6 +97,11 @@ sentence-transformers = [
     "dep:tokenizers"
 ]

+# TLS
+default-tls = ["reqwest?/default-tls"]
+native-tls = ["reqwest?/native-tls"]
+rustls-tls = ["reqwest?/rustls-tls"]
+
 [[example]]
 name = "openai"
 required-features = ["openai"]
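Because `default` now enables `default-tls` (reqwest's native TLS), a downstream crate that wants a pure-rustls build has to opt out explicitly. A minimal sketch of a hypothetical consumer's Cargo.toml; the dependency block below is an illustration, not part of this change:

# Hypothetical downstream Cargo.toml (sketch, not part of this diff):
# opt out of the new native-TLS default and build against rustls instead.
[dependencies]
lancedb = { version = "0.13.1-beta.0", default-features = false, features = [
    "remote",     # the reqwest-based HTTP client
    "rustls-tls", # forwarded to reqwest?/rustls-tls
] }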
@@ -704,6 +704,9 @@ pub struct VectorQuery {
     // IVF PQ - ANN search.
     pub(crate) query_vector: Vec<Arc<dyn Array>>,
     pub(crate) nprobes: usize,
+    // The number of candidates to return during the refine step for HNSW,
+    // defaults to 1.5 * limit.
+    pub(crate) ef: Option<usize>,
     pub(crate) refine_factor: Option<u32>,
     pub(crate) distance_type: Option<DistanceType>,
     /// Default is true. Set to false to enforce a brute force search.
@@ -717,6 +720,7 @@ impl VectorQuery {
             column: None,
             query_vector: Vec::new(),
             nprobes: 20,
+            ef: None,
             refine_factor: None,
             distance_type: None,
             use_index: true,
@@ -776,6 +780,18 @@ impl VectorQuery {
         self
     }

+    /// Set the number of candidates to return during the refine step for HNSW
+    ///
+    /// This argument is only used when the vector column has an HNSW index.
+    /// If there is no index then this value is ignored.
+    ///
+    /// Increasing this value will increase the recall of your query but will
+    /// also increase the latency of your query. The default value is 1.5*limit.
+    pub fn ef(mut self, ef: usize) -> Self {
+        self.ef = Some(ef);
+        self
+    }
+
     /// A multiplier to control how many additional rows are taken during the refine step
     ///
     /// This argument is only used when the vector column has an IVF PQ index.
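The new builder method composes with the existing query chain. A minimal sketch of how a caller might trade latency for recall; the table handle, vector, and limit are assumptions for illustration, not part of the diff:

// Sketch only: assumes `table` is an open lancedb::Table whose vector
// column has an HNSW index.
use lancedb::query::{ExecutableQuery, QueryBase};

let mut stream = table
    .query()
    .nearest_to(vec![0.1f32, 0.2, 0.3])? // query vector
    .ef(100) // widen the HNSW candidate pool; default is 1.5 * limit
    .limit(10)
    .execute()
    .await?;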
@@ -19,9 +19,10 @@ use http::header::CONTENT_TYPE;
 use http::StatusCode;
 use lance::arrow::json::JsonSchema;
 use lance::dataset::scanner::DatasetRecordBatchStream;
-use lance::dataset::{ColumnAlteration, NewColumnTransform};
+use lance::dataset::{ColumnAlteration, NewColumnTransform, Version};
 use lance_datafusion::exec::OneShotExec;
 use serde::{Deserialize, Serialize};
+use tokio::sync::RwLock;

 use crate::{
     connection::NoData,
@@ -43,17 +44,32 @@ pub struct RemoteTable<S: HttpSend = Sender> {
     #[allow(dead_code)]
     client: RestfulLanceDbClient<S>,
     name: String,
+
+    version: RwLock<Option<u64>>,
 }

 impl<S: HttpSend> RemoteTable<S> {
     pub fn new(client: RestfulLanceDbClient<S>, name: String) -> Self {
-        Self { client, name }
+        Self {
+            client,
+            name,
+            version: RwLock::new(None),
+        }
     }

     async fn describe(&self) -> Result<TableDescription> {
-        let request = self
+        let version = self.current_version().await;
+        self.describe_version(version).await
+    }
+
+    async fn describe_version(&self, version: Option<u64>) -> Result<TableDescription> {
+        let mut request = self
             .client
             .post(&format!("/v1/table/{}/describe/", self.name));
+
+        let body = serde_json::json!({ "version": version });
+        request = request.json(&body);
+
         let (request_id, response) = self.client.send(request, true).await?;

         let response = self.check_table_response(&request_id, response).await?;
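Note that `version` is an `Option<u64>`, so an unpinned handle serializes as an explicit JSON `null` rather than omitting the key; the test expectations later in this diff (`{"version":null}`) rely on that. A small standalone sketch of the serialization behavior, not taken from the diff itself:

// Standalone sketch (e.g., a unit test) showing why unpinned handles
// send an explicit null: serde_json renders Option::None as JSON null.
#[test]
fn version_serializes_as_null_when_unpinned() {
    let pinned: Option<u64> = Some(42);
    let unpinned: Option<u64> = None;
    assert_eq!(
        serde_json::json!({ "version": pinned }).to_string(),
        r#"{"version":42}"#
    );
    assert_eq!(
        serde_json::json!({ "version": unpinned }).to_string(),
        r#"{"version":null}"#
    );
}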
@@ -196,6 +212,7 @@ impl<S: HttpSend> RemoteTable<S> {
         body["prefilter"] = query.base.prefilter.into();
         body["distance_type"] = serde_json::json!(query.distance_type.unwrap_or_default());
         body["nprobes"] = query.nprobes.into();
+        body["ef"] = query.ef.into();
         body["refine_factor"] = query.refine_factor.into();
         if let Some(vector_column) = query.column.as_ref() {
             body["vector_column"] = serde_json::Value::String(vector_column.clone());
@@ -250,6 +267,24 @@ impl<S: HttpSend> RemoteTable<S> {
             }
         }
     }
+
+    async fn check_mutable(&self) -> Result<()> {
+        let read_guard = self.version.read().await;
+        match *read_guard {
+            None => Ok(()),
+            Some(version) => Err(Error::NotSupported {
+                message: format!(
+                    "Cannot mutate table reference fixed at version {}. Call checkout_latest() to get a mutable table reference.",
+                    version
+                )
+            })
+        }
+    }
+
+    async fn current_version(&self) -> Option<u64> {
+        let read_guard = self.version.read().await;
+        *read_guard
+    }
 }

 #[derive(Deserialize)]
@@ -277,7 +312,11 @@ mod test_utils {
         T: Into<reqwest::Body>,
         {
             let client = client_with_handler(handler);
-            Self { client, name }
+            Self {
+                client,
+                name,
+                version: RwLock::new(None),
+            }
         }
     }
 }
@@ -296,21 +335,62 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
     async fn version(&self) -> Result<u64> {
         self.describe().await.map(|desc| desc.version)
     }
-    async fn checkout(&self, _version: u64) -> Result<()> {
-        Err(Error::NotSupported {
-            message: "checkout is not supported on LanceDB cloud.".into(),
-        })
+    async fn checkout(&self, version: u64) -> Result<()> {
+        // check that the version exists
+        self.describe_version(Some(version))
+            .await
+            .map_err(|e| match e {
+                // try to map the error to a more user-friendly error telling them
+                // specifically that the version does not exist
+                Error::TableNotFound { name } => Error::TableNotFound {
+                    name: format!("{} (version: {})", name, version),
+                },
+                e => e,
+            })?;
+
+        let mut write_guard = self.version.write().await;
+        *write_guard = Some(version);
+        Ok(())
     }
     async fn checkout_latest(&self) -> Result<()> {
-        Err(Error::NotSupported {
-            message: "checkout is not supported on LanceDB cloud.".into(),
-        })
+        let mut write_guard = self.version.write().await;
+        *write_guard = None;
+        Ok(())
     }
     async fn restore(&self) -> Result<()> {
+        self.check_mutable().await?;
         Err(Error::NotSupported {
             message: "restore is not supported on LanceDB cloud.".into(),
         })
     }
+
+    async fn list_versions(&self) -> Result<Vec<Version>> {
+        let request = self
+            .client
+            .post(&format!("/v1/table/{}/version/list/", self.name));
+        let (request_id, response) = self.client.send(request, true).await?;
+        let response = self.check_table_response(&request_id, response).await?;
+
+        #[derive(Deserialize)]
+        struct ListVersionsResponse {
+            versions: Vec<Version>,
+        }
+
+        let body = response.text().await.err_to_http(request_id.clone())?;
+        let body: ListVersionsResponse =
+            serde_json::from_str(&body).map_err(|err| Error::Http {
+                source: format!(
+                    "Failed to parse list_versions response: {}, body: {}",
+                    err, body
+                )
+                .into(),
+                request_id,
+                status_code: None,
+            })?;
+
+        Ok(body.versions)
+    }
+
     async fn schema(&self) -> Result<SchemaRef> {
         let schema = self.describe().await?.schema;
         Ok(Arc::new(schema.try_into()?))
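Taken together: `checkout` verifies the version exists server-side (mapping a 404 onto a clearer `TableNotFound`), pins the handle, and subsequent reads send that version, while `checkout_latest` clears the pin. A minimal sketch of the flow these methods enable; the handle and version number are assumptions for illustration:

// Sketch only: assumes `table` is a remote lancedb::Table handle.
table.checkout(42).await?; // verifies version 42 exists, then pins the handle
let n = table.count_rows(None).await?; // request body now carries {"version": 42}
let res = table.delete("id = 1").await; // any mutation on a pinned handle...
assert!(res.is_err()); // ...is rejected by check_mutable()
table.checkout_latest().await?; // clear the pin; mutations are allowed again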
@@ -320,10 +400,13 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
             .client
             .post(&format!("/v1/table/{}/count_rows/", self.name));

+        let version = self.current_version().await;
+
         if let Some(filter) = filter {
-            request = request.json(&serde_json::json!({ "predicate": filter }));
+            request = request.json(&serde_json::json!({ "predicate": filter, "version": version }));
         } else {
-            request = request.json(&serde_json::json!({}));
+            let body = serde_json::json!({ "version": version });
+            request = request.json(&body);
         }

         let (request_id, response) = self.client.send(request, true).await?;
@@ -343,6 +426,7 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
         add: AddDataBuilder<NoData>,
         data: Box<dyn RecordBatchReader + Send>,
     ) -> Result<()> {
+        self.check_mutable().await?;
         let body = Self::reader_as_body(data)?;
         let mut request = self
             .client
@@ -371,7 +455,8 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let request = self.client.post(&format!("/v1/table/{}/query/", self.name));

-        let body = serde_json::Value::Object(Default::default());
+        let version = self.current_version().await;
+        let body = serde_json::json!({ "version": version });
         let bodies = Self::apply_vector_query_params(body, query)?;

         let mut futures = Vec::with_capacity(bodies.len());
@@ -406,7 +491,8 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
             .post(&format!("/v1/table/{}/query/", self.name))
             .header(CONTENT_TYPE, JSON_CONTENT_TYPE);

-        let mut body = serde_json::Value::Object(Default::default());
+        let version = self.current_version().await;
+        let mut body = serde_json::json!({ "version": version });
         Self::apply_query_params(&mut body, query)?;
         // Empty vector can be passed if no vector search is performed.
         body["vector"] = serde_json::Value::Array(Vec::new());
@@ -420,6 +506,7 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
         Ok(DatasetRecordBatchStream::new(stream))
     }
     async fn update(&self, update: UpdateBuilder) -> Result<u64> {
+        self.check_mutable().await?;
         let request = self
             .client
             .post(&format!("/v1/table/{}/update/", self.name));
@@ -441,6 +528,7 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
         Ok(0) // TODO: support returning number of modified rows once supported in SaaS.
     }
     async fn delete(&self, predicate: &str) -> Result<()> {
+        self.check_mutable().await?;
         let body = serde_json::json!({ "predicate": predicate });
         let request = self
             .client
@@ -452,6 +540,7 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
     }

     async fn create_index(&self, mut index: IndexBuilder) -> Result<()> {
+        self.check_mutable().await?;
         let request = self
             .client
             .post(&format!("/v1/table/{}/create_index/", self.name));
@@ -530,6 +619,7 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
         params: MergeInsertBuilder,
         new_data: Box<dyn RecordBatchReader + Send>,
     ) -> Result<()> {
+        self.check_mutable().await?;
         let query = MergeInsertRequest::try_from(params)?;
         let body = Self::reader_as_body(new_data)?;
         let request = self
@@ -546,6 +636,7 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
         Ok(())
     }
     async fn optimize(&self, _action: OptimizeAction) -> Result<OptimizeStats> {
+        self.check_mutable().await?;
         Err(Error::NotSupported {
             message: "optimize is not supported on LanceDB cloud.".into(),
         })
@@ -555,16 +646,19 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
         _transforms: NewColumnTransform,
         _read_columns: Option<Vec<String>>,
     ) -> Result<()> {
+        self.check_mutable().await?;
         Err(Error::NotSupported {
             message: "add_columns is not yet supported.".into(),
         })
     }
     async fn alter_columns(&self, _alterations: &[ColumnAlteration]) -> Result<()> {
+        self.check_mutable().await?;
         Err(Error::NotSupported {
             message: "alter_columns is not yet supported.".into(),
         })
     }
     async fn drop_columns(&self, _columns: &[&str]) -> Result<()> {
+        self.check_mutable().await?;
         Err(Error::NotSupported {
             message: "drop_columns is not yet supported.".into(),
         })
@@ -572,9 +666,13 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {

     async fn list_indices(&self) -> Result<Vec<IndexConfig>> {
         // Make request to list the indices
-        let request = self
+        let mut request = self
             .client
             .post(&format!("/v1/table/{}/index/list/", self.name));
+        let version = self.current_version().await;
+        let body = serde_json::json!({ "version": version });
+        request = request.json(&body);
+
         let (request_id, response) = self.client.send(request, true).await?;
         let response = self.check_table_response(&request_id, response).await?;

@@ -624,10 +722,14 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
     }

     async fn index_stats(&self, index_name: &str) -> Result<Option<IndexStatistics>> {
-        let request = self.client.post(&format!(
+        let mut request = self.client.post(&format!(
             "/v1/table/{}/index/{}/stats/",
             self.name, index_name
         ));
+        let version = self.current_version().await;
+        let body = serde_json::json!({ "version": version });
+        request = request.json(&body);
+
         let (request_id, response) = self.client.send(request, true).await?;

         if response.status() == StatusCode::NOT_FOUND {
@@ -701,6 +803,7 @@ mod tests {
     use arrow::{array::AsArray, compute::concat_batches, datatypes::Int32Type};
    use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator};
     use arrow_schema::{DataType, Field, Schema};
+    use chrono::{DateTime, Utc};
     use futures::{future::BoxFuture, StreamExt, TryFutureExt};
     use lance_index::scalar::FullTextSearchQuery;
     use reqwest::Body;
@@ -805,7 +908,10 @@ mod tests {
                 request.headers().get("Content-Type").unwrap(),
                 JSON_CONTENT_TYPE
             );
-            assert_eq!(request.body().unwrap().as_bytes().unwrap(), br#"{}"#);
+            assert_eq!(
+                request.body().unwrap().as_bytes().unwrap(),
+                br#"{"version":null}"#
+            );

             http::Response::builder().status(200).body("42").unwrap()
         });
@@ -822,7 +928,7 @@ mod tests {
             );
             assert_eq!(
                 request.body().unwrap().as_bytes().unwrap(),
-                br#"{"predicate":"a > 10"}"#
+                br#"{"predicate":"a > 10","version":null}"#
             );

             http::Response::builder().status(200).body("42").unwrap()
@@ -1121,7 +1227,9 @@ mod tests {
             "prefilter": true,
             "distance_type": "l2",
             "nprobes": 20,
+            "ef": Option::<usize>::None,
             "refine_factor": null,
+            "version": null,
         });
         // Pass vector separately to make sure it matches f32 precision.
         expected_body["vector"] = vec![0.1f32, 0.2, 0.3].into();
@@ -1166,7 +1274,9 @@ mod tests {
             "bypass_vector_index": true,
             "columns": ["a", "b"],
             "nprobes": 12,
+            "ef": Option::<usize>::None,
             "refine_factor": 2,
+            "version": null,
         });
         // Pass vector separately to make sure it matches f32 precision.
         expected_body["vector"] = vec![0.1f32, 0.2, 0.3].into();
@@ -1222,6 +1332,7 @@ mod tests {
             "k": 10,
             "vector": [],
             "with_row_id": true,
+            "version": null
         });
         assert_eq!(body, expected_body);

@@ -1407,6 +1518,51 @@ mod tests {
         assert_eq!(indices, expected);
     }

+    #[tokio::test]
+    async fn test_list_versions() {
+        let table = Table::new_with_handler("my_table", |request| {
+            assert_eq!(request.method(), "POST");
+            assert_eq!(request.url().path(), "/v1/table/my_table/version/list/");
+
+            let version1 = lance::dataset::Version {
+                version: 1,
+                timestamp: "2024-01-01T00:00:00Z".parse().unwrap(),
+                metadata: Default::default(),
+            };
+            let version2 = lance::dataset::Version {
+                version: 2,
+                timestamp: "2024-02-01T00:00:00Z".parse().unwrap(),
+                metadata: Default::default(),
+            };
+            let response_body = serde_json::json!({
+                "versions": [
+                    version1,
+                    version2,
+                ]
+            });
+            let response_body = serde_json::to_string(&response_body).unwrap();
+
+            http::Response::builder()
+                .status(200)
+                .body(response_body)
+                .unwrap()
+        });
+
+        let versions = table.list_versions().await.unwrap();
+        assert_eq!(versions.len(), 2);
+        assert_eq!(versions[0].version, 1);
+        assert_eq!(
+            versions[0].timestamp,
+            "2024-01-01T00:00:00Z".parse::<DateTime<Utc>>().unwrap()
+        );
+        assert_eq!(versions[1].version, 2);
+        assert_eq!(
+            versions[1].timestamp,
+            "2024-02-01T00:00:00Z".parse::<DateTime<Utc>>().unwrap()
+        );
+        // assert_eq!(versions, expected);
+    }
+
     #[tokio::test]
     async fn test_index_stats() {
         let table = Table::new_with_handler("my_table", |request| {
@@ -1451,4 +1607,195 @@ mod tests {
         let indices = table.index_stats("my_index").await.unwrap();
         assert!(indices.is_none());
     }
+
+    #[tokio::test]
+    async fn test_passes_version() {
+        let table = Table::new_with_handler("my_table", |request| {
+            let body = request.body().unwrap().as_bytes().unwrap();
+            let body: serde_json::Value = serde_json::from_slice(body).unwrap();
+            let version = body
+                .as_object()
+                .unwrap()
+                .get("version")
+                .unwrap()
+                .as_u64()
+                .unwrap();
+            assert_eq!(version, 42);
+
+            let response_body = match request.url().path() {
+                "/v1/table/my_table/describe/" => {
+                    serde_json::json!({
+                        "version": 42,
+                        "schema": { "fields": [] }
+                    })
+                }
+                "/v1/table/my_table/index/list/" => {
+                    serde_json::json!({
+                        "indexes": []
+                    })
+                }
+                "/v1/table/my_table/index/my_idx/stats/" => {
+                    serde_json::json!({
+                        "num_indexed_rows": 100000,
+                        "num_unindexed_rows": 0,
+                        "index_type": "IVF_PQ",
+                        "distance_type": "l2"
+                    })
+                }
+                "/v1/table/my_table/count_rows/" => {
+                    serde_json::json!(1000)
+                }
+                "/v1/table/my_table/query/" => {
+                    let expected_data = RecordBatch::try_new(
+                        Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])),
+                        vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
+                    )
+                    .unwrap();
+                    let expected_data_ref = expected_data.clone();
+                    let response_body = write_ipc_file(&expected_data_ref);
+                    return http::Response::builder()
+                        .status(200)
+                        .header(CONTENT_TYPE, ARROW_FILE_CONTENT_TYPE)
+                        .body(response_body)
+                        .unwrap();
+                }
+
+                path => panic!("Unexpected path: {}", path),
+            };

+            http::Response::builder()
+                .status(200)
+                .body(
+                    serde_json::to_string(&response_body)
+                        .unwrap()
+                        .as_bytes()
+                        .to_vec(),
+                )
+                .unwrap()
+        });
+
+        table.checkout(42).await.unwrap();
+
+        // ensure that version is passed to the /describe endpoint
+        let version = table.version().await.unwrap();
+        assert_eq!(version, 42);
+
+        // ensure it's passed to other read API calls
+        table.list_indices().await.unwrap();
+        table.index_stats("my_idx").await.unwrap();
+        table.count_rows(None).await.unwrap();
+        table
+            .query()
+            .nearest_to(vec![0.1, 0.2, 0.3])
+            .unwrap()
+            .execute()
+            .await
+            .unwrap();
+    }
+
+    #[tokio::test]
+    async fn test_fails_if_checkout_version_doesnt_exist() {
+        let table = Table::new_with_handler("my_table", |request| {
+            let body = request.body().unwrap().as_bytes().unwrap();
+            let body: serde_json::Value = serde_json::from_slice(body).unwrap();
+            let version = body
+                .as_object()
+                .unwrap()
+                .get("version")
+                .unwrap()
+                .as_u64()
+                .unwrap();
+            if version != 42 {
+                return http::Response::builder()
+                    .status(404)
+                    .body(format!("Table my_table (version: {}) not found", version))
+                    .unwrap();
+            }
+
+            let response_body = match request.url().path() {
+                "/v1/table/my_table/describe/" => {
+                    serde_json::json!({
+                        "version": 42,
+                        "schema": { "fields": [] }
+                    })
+                }
+                _ => panic!("Unexpected path"),
+            };
+
+            http::Response::builder()
+                .status(200)
+                .body(serde_json::to_string(&response_body).unwrap())
+                .unwrap()
+        });
+
+        let res = table.checkout(43).await;
+        println!("{:?}", res);
+        assert!(
+            matches!(res, Err(Error::TableNotFound { name }) if name == "my_table (version: 43)")
+        );
+    }
+
+    #[tokio::test]
+    async fn test_timetravel_immutable() {
+        let table = Table::new_with_handler::<String>("my_table", |request| {
+            let response_body = match request.url().path() {
+                "/v1/table/my_table/describe/" => {
+                    serde_json::json!({
+                        "version": 42,
+                        "schema": { "fields": [] }
+                    })
+                }
+                _ => panic!("Should not have made a request: {:?}", request),
+            };
+
+            http::Response::builder()
+                .status(200)
+                .body(serde_json::to_string(&response_body).unwrap())
+                .unwrap()
+        });
+
+        table.checkout(42).await.unwrap();
+
+        // Ensure that all mutable operations fail.
+        let res = table
+            .update()
+            .column("a", "a + 1")
+            .column("b", "b - 1")
+            .only_if("b > 10")
+            .execute()
+            .await;
+        assert!(matches!(res, Err(Error::NotSupported { .. })));
+
+        let batch = RecordBatch::try_new(
+            Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])),
+            vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
+        )
+        .unwrap();
+        let data = Box::new(RecordBatchIterator::new(
+            [Ok(batch.clone())],
+            batch.schema(),
+        ));
+        let res = table.merge_insert(&["some_col"]).execute(data).await;
+        assert!(matches!(res, Err(Error::NotSupported { .. })));
+
+        let res = table.delete("id in (1, 2, 3)").await;
+        assert!(matches!(res, Err(Error::NotSupported { .. })));
+
+        let data = RecordBatch::try_new(
+            Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])),
+            vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
+        )
+        .unwrap();
+        let res = table
+            .add(RecordBatchIterator::new([Ok(data.clone())], data.schema()))
+            .execute()
+            .await;
+        assert!(matches!(res, Err(Error::NotSupported { .. })));
+
+        let res = table
+            .create_index(&["a"], Index::IvfPq(Default::default()))
+            .execute()
+            .await;
+        assert!(matches!(res, Err(Error::NotSupported { .. })));
+    }
 }
@@ -37,7 +37,7 @@ pub use lance::dataset::ColumnAlteration;
 pub use lance::dataset::NewColumnTransform;
 pub use lance::dataset::ReadParams;
 use lance::dataset::{
-    Dataset, UpdateBuilder as LanceUpdateBuilder, WhenMatched, WriteMode, WriteParams,
+    Dataset, UpdateBuilder as LanceUpdateBuilder, Version, WhenMatched, WriteMode, WriteParams,
 };
 use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatchedBySource};
 use lance::io::WrappingObjectStore;
@@ -426,6 +426,7 @@ pub(crate) trait TableInternal: std::fmt::Display + std::fmt::Debug + Send + Syn
     async fn checkout(&self, version: u64) -> Result<()>;
     async fn checkout_latest(&self) -> Result<()>;
     async fn restore(&self) -> Result<()>;
+    async fn list_versions(&self) -> Result<Vec<Version>>;
     async fn table_definition(&self) -> Result<TableDefinition>;
     fn dataset_uri(&self) -> &str;
 }
@@ -955,6 +956,11 @@ impl Table {
         self.inner.restore().await
     }

+    /// List all the versions of the table
+    pub async fn list_versions(&self) -> Result<Vec<Version>> {
+        self.inner.list_versions().await
+    }
+
     /// List all indices that have been created with [`Self::create_index`]
     pub async fn list_indices(&self) -> Result<Vec<IndexConfig>> {
         self.inner.list_indices().await
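A short sketch of the new public API in use; the open `table` handle is assumed, and each `Version` carries the `version`, `timestamp`, and `metadata` fields seen in the binding above:

// Sketch: enumerate the versions of an open lancedb::Table (setup assumed).
let versions = table.list_versions().await?;
for v in &versions {
    println!("version {} created at {}", v.version, v.timestamp);
}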
@@ -1319,7 +1325,7 @@ impl NativeTable {
         let (indices, mf) = futures::try_join!(dataset.load_indices(), dataset.latest_manifest())?;
         Ok(indices
             .iter()
-            .map(|i| VectorIndex::new_from_format(&mf, i))
+            .map(|i| VectorIndex::new_from_format(&(mf.0), i))
             .collect())
     }

@@ -1707,6 +1713,10 @@ impl TableInternal for NativeTable {
         self.dataset.reload().await
     }

+    async fn list_versions(&self) -> Result<Vec<Version>> {
+        Ok(self.dataset.get().await?.versions().await?)
+    }
+
     async fn restore(&self) -> Result<()> {
         let version =
             self.dataset
@@ -1904,6 +1914,9 @@ impl TableInternal for NativeTable {
             query.base.offset.map(|offset| offset as i64),
         )?;
         scanner.nprobs(query.nprobes);
+        if let Some(ef) = query.ef {
+            scanner.ef(ef);
+        }
         scanner.use_index(query.use_index);
         scanner.prefilter(query.base.prefilter);
         match query.base.select {