mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-23 21:39:57 +00:00
Compare commits
10 Commits
remote-ver
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
96933d7df8 | ||
|
|
d369233b3d | ||
|
|
43a670ed4b | ||
|
|
cb9a00a28d | ||
|
|
72af977a73 | ||
|
|
7cecb71df0 | ||
|
|
285071e5c8 | ||
|
|
114866fbcf | ||
|
|
5387c0e243 | ||
|
|
53d1535de1 |
@@ -87,6 +87,16 @@ glob = "node/package.json"
|
|||||||
replace = "\"@lancedb/vectordb-linux-x64-gnu\": \"{new_version}\""
|
replace = "\"@lancedb/vectordb-linux-x64-gnu\": \"{new_version}\""
|
||||||
search = "\"@lancedb/vectordb-linux-x64-gnu\": \"{current_version}\""
|
search = "\"@lancedb/vectordb-linux-x64-gnu\": \"{current_version}\""
|
||||||
|
|
||||||
|
[[tool.bumpversion.files]]
|
||||||
|
glob = "node/package.json"
|
||||||
|
replace = "\"@lancedb/vectordb-linux-arm64-musl\": \"{new_version}\""
|
||||||
|
search = "\"@lancedb/vectordb-linux-arm64-musl\": \"{current_version}\""
|
||||||
|
|
||||||
|
[[tool.bumpversion.files]]
|
||||||
|
glob = "node/package.json"
|
||||||
|
replace = "\"@lancedb/vectordb-linux-x64-musl\": \"{new_version}\""
|
||||||
|
search = "\"@lancedb/vectordb-linux-x64-musl\": \"{current_version}\""
|
||||||
|
|
||||||
[[tool.bumpversion.files]]
|
[[tool.bumpversion.files]]
|
||||||
glob = "node/package.json"
|
glob = "node/package.json"
|
||||||
replace = "\"@lancedb/vectordb-win32-x64-msvc\": \"{new_version}\""
|
replace = "\"@lancedb/vectordb-win32-x64-msvc\": \"{new_version}\""
|
||||||
|
|||||||
@@ -31,6 +31,9 @@ rustflags = [
|
|||||||
[target.x86_64-unknown-linux-gnu]
|
[target.x86_64-unknown-linux-gnu]
|
||||||
rustflags = ["-C", "target-cpu=haswell", "-C", "target-feature=+avx2,+fma,+f16c"]
|
rustflags = ["-C", "target-cpu=haswell", "-C", "target-feature=+avx2,+fma,+f16c"]
|
||||||
|
|
||||||
|
[target.x86_64-unknown-linux-musl]
|
||||||
|
rustflags = ["-C", "target-cpu=haswell", "-C", "target-feature=-crt-static,+avx2,+fma,+f16c"]
|
||||||
|
|
||||||
[target.aarch64-apple-darwin]
|
[target.aarch64-apple-darwin]
|
||||||
rustflags = ["-C", "target-cpu=apple-m1", "-C", "target-feature=+neon,+fp16,+fhm,+dotprod"]
|
rustflags = ["-C", "target-cpu=apple-m1", "-C", "target-feature=+neon,+fp16,+fhm,+dotprod"]
|
||||||
|
|
||||||
|
|||||||
120
.github/workflows/npm-publish.yml
vendored
120
.github/workflows/npm-publish.yml
vendored
@@ -101,7 +101,7 @@ jobs:
|
|||||||
path: |
|
path: |
|
||||||
nodejs/dist/*.node
|
nodejs/dist/*.node
|
||||||
|
|
||||||
node-linux:
|
node-linux-gnu:
|
||||||
name: vectordb (${{ matrix.config.arch}}-unknown-linux-gnu)
|
name: vectordb (${{ matrix.config.arch}}-unknown-linux-gnu)
|
||||||
runs-on: ${{ matrix.config.runner }}
|
runs-on: ${{ matrix.config.runner }}
|
||||||
# Only runs on tags that matches the make-release action
|
# Only runs on tags that matches the make-release action
|
||||||
@@ -137,11 +137,63 @@ jobs:
|
|||||||
- name: Upload Linux Artifacts
|
- name: Upload Linux Artifacts
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: node-native-linux-${{ matrix.config.arch }}
|
name: node-native-linux-${{ matrix.config.arch }}-gnu
|
||||||
path: |
|
path: |
|
||||||
node/dist/lancedb-vectordb-linux*.tgz
|
node/dist/lancedb-vectordb-linux*.tgz
|
||||||
|
|
||||||
nodejs-linux:
|
node-linux-musl:
|
||||||
|
name: vectordb (${{ matrix.config.arch}}-unknown-linux-musl)
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
container: alpine:edge
|
||||||
|
# Only runs on tags that matches the make-release action
|
||||||
|
if: startsWith(github.ref, 'refs/tags/v')
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
config:
|
||||||
|
- arch: x86_64
|
||||||
|
- arch: aarch64
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
- name: Install common dependencies
|
||||||
|
run: |
|
||||||
|
apk add protobuf-dev curl clang mold grep npm bash
|
||||||
|
curl --proto '=https' --tlsv1.3 -sSf https://raw.githubusercontent.com/rust-lang/rustup/refs/heads/master/rustup-init.sh | sh -s -- -y --default-toolchain 1.80.0
|
||||||
|
echo "source $HOME/.cargo/env" >> saved_env
|
||||||
|
echo "export CC=clang" >> saved_env
|
||||||
|
echo "export RUSTFLAGS='-Ctarget-cpu=haswell -Ctarget-feature=-crt-static,+avx2,+fma,+f16c -Clinker=clang -Clink-arg=-fuse-ld=mold'" >> saved_env
|
||||||
|
- name: Configure aarch64 build
|
||||||
|
if: ${{ matrix.config.arch == 'aarch64' }}
|
||||||
|
run: |
|
||||||
|
source "$HOME/.cargo/env"
|
||||||
|
rustup target add aarch64-unknown-linux-musl --toolchain 1.80.0
|
||||||
|
crt=$(realpath $(dirname $(rustup which rustc))/../lib/rustlib/aarch64-unknown-linux-musl/lib/self-contained)
|
||||||
|
sysroot_lib=/usr/aarch64-unknown-linux-musl/usr/lib
|
||||||
|
apk_url=https://dl-cdn.alpinelinux.org/alpine/latest-stable/main/aarch64/
|
||||||
|
curl -sSf $apk_url > apk_list
|
||||||
|
for pkg in gcc libgcc musl; do curl -sSf $apk_url$(cat apk_list | grep -oP '(?<=")'$pkg'-\d.*?(?=")') | tar zxf -; done
|
||||||
|
mkdir -p $sysroot_lib
|
||||||
|
echo 'GROUP ( libgcc_s.so.1 -lgcc )' > $sysroot_lib/libgcc_s.so
|
||||||
|
cp usr/lib/libgcc_s.so.1 $sysroot_lib
|
||||||
|
cp usr/lib/gcc/aarch64-alpine-linux-musl/*/libgcc.a $sysroot_lib
|
||||||
|
cp lib/ld-musl-aarch64.so.1 $sysroot_lib/libc.so
|
||||||
|
echo '!<arch>' > $sysroot_lib/libdl.a
|
||||||
|
(cd $crt && cp crti.o crtbeginS.o crtendS.o crtn.o -t $sysroot_lib)
|
||||||
|
echo "export CARGO_BUILD_TARGET=aarch64-unknown-linux-musl" >> saved_env
|
||||||
|
echo "export RUSTFLAGS='-Ctarget-cpu=apple-m1 -Ctarget-feature=-crt-static,+neon,+fp16,+fhm,+dotprod -Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=--target=aarch64-unknown-linux-musl -Clink-arg=--sysroot=/usr/aarch64-unknown-linux-musl -Clink-arg=-lc'" >> saved_env
|
||||||
|
- name: Build Linux Artifacts
|
||||||
|
run: |
|
||||||
|
source ./saved_env
|
||||||
|
bash ci/manylinux_node/build_vectordb.sh ${{ matrix.config.arch }}
|
||||||
|
- name: Upload Linux Artifacts
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: node-native-linux-${{ matrix.config.arch }}-musl
|
||||||
|
path: |
|
||||||
|
node/dist/lancedb-vectordb-linux*.tgz
|
||||||
|
|
||||||
|
nodejs-linux-gnu:
|
||||||
name: lancedb (${{ matrix.config.arch}}-unknown-linux-gnu
|
name: lancedb (${{ matrix.config.arch}}-unknown-linux-gnu
|
||||||
runs-on: ${{ matrix.config.runner }}
|
runs-on: ${{ matrix.config.runner }}
|
||||||
# Only runs on tags that matches the make-release action
|
# Only runs on tags that matches the make-release action
|
||||||
@@ -178,7 +230,7 @@ jobs:
|
|||||||
- name: Upload Linux Artifacts
|
- name: Upload Linux Artifacts
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: nodejs-native-linux-${{ matrix.config.arch }}
|
name: nodejs-native-linux-${{ matrix.config.arch }}-gnu
|
||||||
path: |
|
path: |
|
||||||
nodejs/dist/*.node
|
nodejs/dist/*.node
|
||||||
# The generic files are the same in all distros so we just pick
|
# The generic files are the same in all distros so we just pick
|
||||||
@@ -192,6 +244,62 @@ jobs:
|
|||||||
nodejs/dist/*
|
nodejs/dist/*
|
||||||
!nodejs/dist/*.node
|
!nodejs/dist/*.node
|
||||||
|
|
||||||
|
nodejs-linux-musl:
|
||||||
|
name: lancedb (${{ matrix.config.arch}}-unknown-linux-musl
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
container: alpine:edge
|
||||||
|
# Only runs on tags that matches the make-release action
|
||||||
|
if: startsWith(github.ref, 'refs/tags/v')
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
config:
|
||||||
|
- arch: x86_64
|
||||||
|
- arch: aarch64
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
- name: Install common dependencies
|
||||||
|
run: |
|
||||||
|
apk add protobuf-dev curl clang mold grep npm bash openssl-dev openssl-libs-static
|
||||||
|
curl --proto '=https' --tlsv1.3 -sSf https://raw.githubusercontent.com/rust-lang/rustup/refs/heads/master/rustup-init.sh | sh -s -- -y --default-toolchain 1.80.0
|
||||||
|
echo "source $HOME/.cargo/env" >> saved_env
|
||||||
|
echo "export CC=clang" >> saved_env
|
||||||
|
echo "export RUSTFLAGS='-Ctarget-cpu=haswell -Ctarget-feature=-crt-static,+avx2,+fma,+f16c -Clinker=clang -Clink-arg=-fuse-ld=mold'" >> saved_env
|
||||||
|
echo "export X86_64_UNKNOWN_LINUX_MUSL_OPENSSL_INCLUDE_DIR=/usr/include" >> saved_env
|
||||||
|
echo "export X86_64_UNKNOWN_LINUX_MUSL_OPENSSL_LIB_DIR=/usr/lib" >> saved_env
|
||||||
|
- name: Configure aarch64 build
|
||||||
|
if: ${{ matrix.config.arch == 'aarch64' }}
|
||||||
|
run: |
|
||||||
|
source "$HOME/.cargo/env"
|
||||||
|
rustup target add aarch64-unknown-linux-musl --toolchain 1.80.0
|
||||||
|
crt=$(realpath $(dirname $(rustup which rustc))/../lib/rustlib/aarch64-unknown-linux-musl/lib/self-contained)
|
||||||
|
sysroot_lib=/usr/aarch64-unknown-linux-musl/usr/lib
|
||||||
|
apk_url=https://dl-cdn.alpinelinux.org/alpine/latest-stable/main/aarch64/
|
||||||
|
curl -sSf $apk_url > apk_list
|
||||||
|
for pkg in gcc libgcc musl openssl-dev openssl-libs-static; do curl -sSf $apk_url$(cat apk_list | grep -oP '(?<=")'$pkg'-\d.*?(?=")') | tar zxf -; done
|
||||||
|
mkdir -p $sysroot_lib
|
||||||
|
echo 'GROUP ( libgcc_s.so.1 -lgcc )' > $sysroot_lib/libgcc_s.so
|
||||||
|
cp usr/lib/libgcc_s.so.1 $sysroot_lib
|
||||||
|
cp usr/lib/gcc/aarch64-alpine-linux-musl/*/libgcc.a $sysroot_lib
|
||||||
|
cp lib/ld-musl-aarch64.so.1 $sysroot_lib/libc.so
|
||||||
|
echo '!<arch>' > $sysroot_lib/libdl.a
|
||||||
|
(cd $crt && cp crti.o crtbeginS.o crtendS.o crtn.o -t $sysroot_lib)
|
||||||
|
echo "export CARGO_BUILD_TARGET=aarch64-unknown-linux-musl" >> saved_env
|
||||||
|
echo "export RUSTFLAGS='-Ctarget-feature=-crt-static,+neon,+fp16,+fhm,+dotprod -Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=--target=aarch64-unknown-linux-musl -Clink-arg=--sysroot=/usr/aarch64-unknown-linux-musl -Clink-arg=-lc'" >> saved_env
|
||||||
|
echo "export AARCH64_UNKNOWN_LINUX_MUSL_OPENSSL_INCLUDE_DIR=$(realpath usr/include)" >> saved_env
|
||||||
|
echo "export AARCH64_UNKNOWN_LINUX_MUSL_OPENSSL_LIB_DIR=$(realpath usr/lib)" >> saved_env
|
||||||
|
- name: Build Linux Artifacts
|
||||||
|
run: |
|
||||||
|
source ./saved_env
|
||||||
|
bash ci/manylinux_node/build_lancedb.sh ${{ matrix.config.arch }}
|
||||||
|
- name: Upload Linux Artifacts
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: nodejs-native-linux-${{ matrix.config.arch }}-musl
|
||||||
|
path: |
|
||||||
|
nodejs/dist/*.node
|
||||||
|
|
||||||
node-windows:
|
node-windows:
|
||||||
name: vectordb ${{ matrix.target }}
|
name: vectordb ${{ matrix.target }}
|
||||||
runs-on: windows-2022
|
runs-on: windows-2022
|
||||||
@@ -460,7 +568,7 @@ jobs:
|
|||||||
|
|
||||||
release:
|
release:
|
||||||
name: vectordb NPM Publish
|
name: vectordb NPM Publish
|
||||||
needs: [node, node-macos, node-linux, node-windows]
|
needs: [node, node-macos, node-linux-gnu, node-linux-musl, node-windows]
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# Only runs on tags that matches the make-release action
|
# Only runs on tags that matches the make-release action
|
||||||
if: startsWith(github.ref, 'refs/tags/v')
|
if: startsWith(github.ref, 'refs/tags/v')
|
||||||
@@ -500,7 +608,7 @@ jobs:
|
|||||||
|
|
||||||
release-nodejs:
|
release-nodejs:
|
||||||
name: lancedb NPM Publish
|
name: lancedb NPM Publish
|
||||||
needs: [nodejs-macos, nodejs-linux, nodejs-windows]
|
needs: [nodejs-macos, nodejs-linux-gnu, nodejs-linux-musl, nodejs-windows]
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# Only runs on tags that matches the make-release action
|
# Only runs on tags that matches the make-release action
|
||||||
if: startsWith(github.ref, 'refs/tags/v')
|
if: startsWith(github.ref, 'refs/tags/v')
|
||||||
|
|||||||
16
Cargo.toml
16
Cargo.toml
@@ -21,15 +21,15 @@ categories = ["database-implementations"]
|
|||||||
rust-version = "1.80.0" # TODO: lower this once we upgrade Lance again.
|
rust-version = "1.80.0" # TODO: lower this once we upgrade Lance again.
|
||||||
|
|
||||||
[workspace.dependencies]
|
[workspace.dependencies]
|
||||||
lance = { "version" = "=0.19.3", "features" = [
|
lance = { "version" = "=0.20.0", "features" = [
|
||||||
"dynamodb",
|
"dynamodb",
|
||||||
], git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
], git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||||
lance-index = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
lance-index = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||||
lance-linalg = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
lance-linalg = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||||
lance-table = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
lance-table = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||||
lance-testing = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
lance-testing = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||||
lance-datafusion = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
lance-datafusion = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||||
lance-encoding = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
lance-encoding = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||||
# Note that this one does not include pyarrow
|
# Note that this one does not include pyarrow
|
||||||
arrow = { version = "52.2", optional = false }
|
arrow = { version = "52.2", optional = false }
|
||||||
arrow-array = "52.2"
|
arrow-array = "52.2"
|
||||||
|
|||||||
@@ -11,7 +11,8 @@ fi
|
|||||||
export OPENSSL_STATIC=1
|
export OPENSSL_STATIC=1
|
||||||
export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
|
export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
|
||||||
|
|
||||||
source $HOME/.bashrc
|
#Alpine doesn't have .bashrc
|
||||||
|
FILE=$HOME/.bashrc && test -f $FILE && source $FILE
|
||||||
|
|
||||||
cd nodejs
|
cd nodejs
|
||||||
npm ci
|
npm ci
|
||||||
|
|||||||
@@ -5,13 +5,14 @@ ARCH=${1:-x86_64}
|
|||||||
|
|
||||||
if [ "$ARCH" = "x86_64" ]; then
|
if [ "$ARCH" = "x86_64" ]; then
|
||||||
export OPENSSL_LIB_DIR=/usr/local/lib64/
|
export OPENSSL_LIB_DIR=/usr/local/lib64/
|
||||||
else
|
else
|
||||||
export OPENSSL_LIB_DIR=/usr/local/lib/
|
export OPENSSL_LIB_DIR=/usr/local/lib/
|
||||||
fi
|
fi
|
||||||
export OPENSSL_STATIC=1
|
export OPENSSL_STATIC=1
|
||||||
export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
|
export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
|
||||||
|
|
||||||
source $HOME/.bashrc
|
#Alpine doesn't have .bashrc
|
||||||
|
FILE=$HOME/.bashrc && test -f $FILE && source $FILE
|
||||||
|
|
||||||
cd node
|
cd node
|
||||||
npm ci
|
npm ci
|
||||||
|
|||||||
@@ -138,6 +138,7 @@ nav:
|
|||||||
- Jina Reranker: reranking/jina.md
|
- Jina Reranker: reranking/jina.md
|
||||||
- OpenAI Reranker: reranking/openai.md
|
- OpenAI Reranker: reranking/openai.md
|
||||||
- AnswerDotAi Rerankers: reranking/answerdotai.md
|
- AnswerDotAi Rerankers: reranking/answerdotai.md
|
||||||
|
- Voyage AI Rerankers: reranking/voyageai.md
|
||||||
- Building Custom Rerankers: reranking/custom_reranker.md
|
- Building Custom Rerankers: reranking/custom_reranker.md
|
||||||
- Example: notebooks/lancedb_reranking.ipynb
|
- Example: notebooks/lancedb_reranking.ipynb
|
||||||
- Filtering: sql.md
|
- Filtering: sql.md
|
||||||
@@ -165,6 +166,7 @@ nav:
|
|||||||
- Jina Embeddings: embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md
|
- Jina Embeddings: embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md
|
||||||
- AWS Bedrock Text Embedding Functions: embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md
|
- AWS Bedrock Text Embedding Functions: embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md
|
||||||
- IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md
|
- IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md
|
||||||
|
- Voyage AI Embeddings: embeddings/available_embedding_models/text_embedding_functions/voyageai_embedding.md
|
||||||
- Multimodal Embedding Functions:
|
- Multimodal Embedding Functions:
|
||||||
- OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
|
- OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
|
||||||
- Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
|
- Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
|
||||||
|
|||||||
@@ -277,7 +277,15 @@ Product quantization can lead to approximately `16 * sizeof(float32) / 1 = 64` t
|
|||||||
Higher number of partitions could lead to more efficient I/O during queries and better accuracy, but it takes much more time to train.
|
Higher number of partitions could lead to more efficient I/O during queries and better accuracy, but it takes much more time to train.
|
||||||
On the `SIFT-1M` dataset, our benchmark shows that keeping each partition at 1K-4K rows leads to a good latency / recall trade-off.
|
On the `SIFT-1M` dataset, our benchmark shows that keeping each partition at 1K-4K rows leads to a good latency / recall trade-off.
|
||||||
|
|
||||||
`num_sub_vectors` specifies how many Product Quantization (PQ) short codes to generate on each vector. Because
|
`num_sub_vectors` specifies how many Product Quantization (PQ) short codes to generate on each vector. The number should be a factor of the vector dimension. Because
|
||||||
PQ is a lossy compression of the original vector, a higher `num_sub_vectors` usually results in
|
PQ is a lossy compression of the original vector, a higher `num_sub_vectors` usually results in
|
||||||
less space distortion, and thus yields better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and
|
less space distortion, and thus yields better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and more PQ computation, and thus, higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency.
|
||||||
more PQ computation, and thus, higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency.
|
|
||||||
|
!!! note
|
||||||
|
If `num_sub_vectors` is set to be greater than the vector dimension, you will see errors like `attempt to divide by zero`.
|
||||||
|
|
||||||
|
### How to choose `m` and `ef_construction` for `IVF_HNSW_*` index?
|
||||||
|
|
||||||
|
`m` determines the number of connections a new node establishes with its closest neighbors upon entering the graph. Typically, `m` falls within the range of 5 to 48. Lower `m` values are suitable for low-dimensional data or scenarios where recall is less critical. Conversely, higher `m` values are beneficial for high-dimensional data or when high recall is required. In essence, a larger `m` results in a denser graph with increased connectivity, but at the expense of higher memory consumption.
|
||||||
|
|
||||||
|
`ef_construction` balances build speed and accuracy. Higher values increase accuracy but slow down the build process. A typical range is 150 to 300. For good search results, a minimum value of 100 is recommended. In most cases, setting this value above 500 offers no additional benefit. Ensure that `ef_construction` is always set to a value equal to or greater than `ef` in the search phase.
|
||||||
@@ -57,6 +57,13 @@ Then the greedy search routine operates as follows:
|
|||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
There are three key parameters to set when constructing an HNSW index:
|
||||||
|
|
||||||
|
* `metric`: Use an `L2` euclidean distance metric. We also support `dot` and `cosine` distance.
|
||||||
|
* `m`: The number of neighbors to select for each vector in the HNSW graph.
|
||||||
|
* `ef_construction`: The number of candidates to evaluate during the construction of the HNSW graph.
|
||||||
|
|
||||||
|
|
||||||
We can combine the above concepts to understand how to build and query an HNSW index in LanceDB.
|
We can combine the above concepts to understand how to build and query an HNSW index in LanceDB.
|
||||||
|
|
||||||
### Construct index
|
### Construct index
|
||||||
|
|||||||
@@ -58,8 +58,10 @@ In Python, the index can be created as follows:
|
|||||||
# Make sure you have enough data in the table for an effective training step
|
# Make sure you have enough data in the table for an effective training step
|
||||||
tbl.create_index(metric="L2", num_partitions=256, num_sub_vectors=96)
|
tbl.create_index(metric="L2", num_partitions=256, num_sub_vectors=96)
|
||||||
```
|
```
|
||||||
|
!!! note
|
||||||
|
`num_partitions`=256 and `num_sub_vectors`=96 do not work for every dataset. Those values need to be adjusted for your particular dataset.
|
||||||
|
|
||||||
The `num_partitions` is usually chosen to target a particular number of vectors per partition. `num_sub_vectors` is typically chosen based on the desired recall and the dimensionality of the vector. See the [FAQs](#faq) below for best practices on choosing these parameters.
|
The `num_partitions` is usually chosen to target a particular number of vectors per partition. `num_sub_vectors` is typically chosen based on the desired recall and the dimensionality of the vector. See [here](../ann_indexes.md/#how-to-choose-num_partitions-and-num_sub_vectors-for-ivf_pq-index) for best practices on choosing these parameters.
|
||||||
|
|
||||||
|
|
||||||
### Query the index
|
### Query the index
|
||||||
|
|||||||
@@ -114,12 +114,45 @@ table.create_fts_index("text",
|
|||||||
|
|
||||||
LanceDB full text search supports filtering the search results by a condition; both pre-filtering and post-filtering are supported.
|
LanceDB full text search supports filtering the search results by a condition; both pre-filtering and post-filtering are supported.
|
||||||
|
|
||||||
This can be invoked via the familiar `where` syntax:
|
This can be invoked via the familiar `where` syntax.
|
||||||
|
|
||||||
|
With pre-filtering:
|
||||||
=== "Python"
|
=== "Python"
|
||||||
|
|
||||||
```python
|
```python
|
||||||
table.search("puppy").limit(10).where("meta='foo'").to_list()
|
table.search("puppy").limit(10).where("meta='foo'", prefilter=True).to_list()
|
||||||
|
```
|
||||||
|
|
||||||
|
=== "TypeScript"
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
await tbl
|
||||||
|
.search("puppy")
|
||||||
|
.select(["id", "doc"])
|
||||||
|
.limit(10)
|
||||||
|
.where("meta='foo'")
|
||||||
|
.prefilter(true)
|
||||||
|
.toArray();
|
||||||
|
```
|
||||||
|
|
||||||
|
=== "Rust"
|
||||||
|
|
||||||
|
```rust
|
||||||
|
table
|
||||||
|
.query()
|
||||||
|
.full_text_search(FullTextSearchQuery::new("puppy".to_owned()))
|
||||||
|
.select(lancedb::query::Select::Columns(vec!["doc".to_owned()]))
|
||||||
|
.limit(10)
|
||||||
|
.only_if("meta='foo'")
|
||||||
|
.execute()
|
||||||
|
.await?;
|
||||||
|
```
|
||||||
|
|
||||||
|
With post-filtering:
|
||||||
|
=== "Python"
|
||||||
|
|
||||||
|
```python
|
||||||
|
table.search("puppy").limit(10).where("meta='foo'", prefilter=False).to_list()
|
||||||
```
|
```
|
||||||
|
|
||||||
=== "TypeScript"
|
=== "TypeScript"
|
||||||
@@ -130,6 +163,7 @@ This can be invoked via the familiar `where` syntax:
|
|||||||
.select(["id", "doc"])
|
.select(["id", "doc"])
|
||||||
.limit(10)
|
.limit(10)
|
||||||
.where("meta='foo'")
|
.where("meta='foo'")
|
||||||
|
.prefilter(false)
|
||||||
.toArray();
|
.toArray();
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -140,6 +174,7 @@ This can be invoked via the familiar `where` syntax:
|
|||||||
.query()
|
.query()
|
||||||
.full_text_search(FullTextSearchQuery::new(words[0].to_owned()))
|
.full_text_search(FullTextSearchQuery::new(words[0].to_owned()))
|
||||||
.select(lancedb::query::Select::Columns(vec!["doc".to_owned()]))
|
.select(lancedb::query::Select::Columns(vec!["doc".to_owned()]))
|
||||||
|
.postfilter()
|
||||||
.limit(10)
|
.limit(10)
|
||||||
.only_if("meta='foo'")
|
.only_if("meta='foo'")
|
||||||
.execute()
|
.execute()
|
||||||
@@ -189,3 +224,6 @@ This can make the query more efficient, especially when the table is large and t
|
|||||||
tbl.add(more_data).execute().await?;
|
tbl.add(more_data).execute().await?;
|
||||||
tbl.optimize(OptimizeAction::All).execute().await?;
|
tbl.optimize(OptimizeAction::All).execute().await?;
|
||||||
```
|
```
|
||||||
|
!!! note
|
||||||
|
|
||||||
|
New data added after creating the FTS index will appear in search results while incremental indexing is still in progress, but with increased latency due to a flat search on the unindexed portion. LanceDB Cloud automates this merging process, minimizing the impact on search speed.
|
||||||
@@ -153,9 +153,7 @@ table.create_fts_index(["title", "content"], use_tantivy=True, writer_heap_size=
|
|||||||
|
|
||||||
## Current limitations
|
## Current limitations
|
||||||
|
|
||||||
1. Currently we do not yet support incremental writes.
|
1. New data added after creating the FTS index will appear in search results, but with increased latency due to a flat search on the unindexed portion. Re-indexing with `create_fts_index` will reduce latency. LanceDB Cloud automates this merging process, minimizing the impact on search speed.
|
||||||
If you add data after FTS index creation, it won't be reflected
|
|
||||||
in search results until you do a full reindex.
|
|
||||||
|
|
||||||
2. We currently only support local filesystem paths for the FTS index.
|
2. We currently only support local filesystem paths for the FTS index.
|
||||||
This is a tantivy limitation. We've implemented an object store plugin
|
This is a tantivy limitation. We've implemented an object store plugin
|
||||||
|
|||||||
@@ -6,6 +6,9 @@ This re-ranker uses the [Cohere](https://cohere.ai/) API to rerank the search re
|
|||||||
!!! note
|
!!! note
|
||||||
Supported Query Types: Hybrid, Vector, FTS
|
Supported Query Types: Hybrid, Vector, FTS
|
||||||
|
|
||||||
|
```shell
|
||||||
|
pip install cohere
|
||||||
|
```
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import numpy
|
import numpy
|
||||||
|
|||||||
@@ -89,10 +89,12 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"optionalDependencies": {
|
"optionalDependencies": {
|
||||||
"@lancedb/vectordb-darwin-arm64": "0.13.0",
|
|
||||||
"@lancedb/vectordb-darwin-x64": "0.13.0",
|
"@lancedb/vectordb-darwin-x64": "0.13.0",
|
||||||
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0",
|
"@lancedb/vectordb-darwin-arm64": "0.13.0",
|
||||||
"@lancedb/vectordb-linux-x64-gnu": "0.13.0",
|
"@lancedb/vectordb-linux-x64-gnu": "0.13.0",
|
||||||
|
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0",
|
||||||
|
"@lancedb/vectordb-linux-x64-musl": "0.13.0",
|
||||||
|
"@lancedb/vectordb-linux-arm64-musl": "0.13.0",
|
||||||
"@lancedb/vectordb-win32-x64-msvc": "0.13.0",
|
"@lancedb/vectordb-win32-x64-msvc": "0.13.0",
|
||||||
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0"
|
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -87,6 +87,12 @@ export interface OptimizeOptions {
|
|||||||
deleteUnverified: boolean;
|
deleteUnverified: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export interface Version {
|
||||||
|
version: number;
|
||||||
|
timestamp: Date;
|
||||||
|
metadata: Record<string, string>;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A Table is a collection of Records in a LanceDB Database.
|
* A Table is a collection of Records in a LanceDB Database.
|
||||||
*
|
*
|
||||||
@@ -360,6 +366,11 @@ export abstract class Table {
|
|||||||
*/
|
*/
|
||||||
abstract checkoutLatest(): Promise<void>;
|
abstract checkoutLatest(): Promise<void>;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* List all the versions of the table
|
||||||
|
*/
|
||||||
|
abstract listVersions(): Promise<Version[]>;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Restore the table to the currently checked out version
|
* Restore the table to the currently checked out version
|
||||||
*
|
*
|
||||||
@@ -659,6 +670,14 @@ export class LocalTable extends Table {
|
|||||||
await this.inner.checkoutLatest();
|
await this.inner.checkoutLatest();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async listVersions(): Promise<Version[]> {
|
||||||
|
return (await this.inner.listVersions()).map((version) => ({
|
||||||
|
version: version.version,
|
||||||
|
timestamp: new Date(version.timestamp / 1000),
|
||||||
|
metadata: version.metadata,
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
async restore(): Promise<void> {
|
async restore(): Promise<void> {
|
||||||
await this.inner.restore();
|
await this.inner.restore();
|
||||||
}
|
}
|
||||||
|
|||||||
3
nodejs/npm/linux-arm64-musl/README.md
Normal file
3
nodejs/npm/linux-arm64-musl/README.md
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
# `@lancedb/lancedb-linux-arm64-musl`
|
||||||
|
|
||||||
|
This is the **aarch64-unknown-linux-musl** binary for `@lancedb/lancedb`
|
||||||
13
nodejs/npm/linux-arm64-musl/package.json
Normal file
13
nodejs/npm/linux-arm64-musl/package.json
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
{
|
||||||
|
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||||
|
"version": "0.13.0",
|
||||||
|
"os": ["linux"],
|
||||||
|
"cpu": ["arm64"],
|
||||||
|
"main": "lancedb.linux-arm64-musl.node",
|
||||||
|
"files": ["lancedb.linux-arm64-musl.node"],
|
||||||
|
"license": "Apache 2.0",
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 18"
|
||||||
|
},
|
||||||
|
"libc": ["musl"]
|
||||||
|
}
|
||||||
3
nodejs/npm/linux-x64-musl/README.md
Normal file
3
nodejs/npm/linux-x64-musl/README.md
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
# `@lancedb/lancedb-linux-x64-musl`
|
||||||
|
|
||||||
|
This is the **x86_64-unknown-linux-musl** binary for `@lancedb/lancedb`
|
||||||
13
nodejs/npm/linux-x64-musl/package.json
Normal file
13
nodejs/npm/linux-x64-musl/package.json
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
{
|
||||||
|
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||||
|
"version": "0.13.0",
|
||||||
|
"os": ["linux"],
|
||||||
|
"cpu": ["x64"],
|
||||||
|
"main": "lancedb.linux-x64-musl.node",
|
||||||
|
"files": ["lancedb.linux-x64-musl.node"],
|
||||||
|
"license": "Apache 2.0",
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 18"
|
||||||
|
},
|
||||||
|
"libc": ["musl"]
|
||||||
|
}
|
||||||
@@ -24,10 +24,12 @@
|
|||||||
"triples": {
|
"triples": {
|
||||||
"defaults": false,
|
"defaults": false,
|
||||||
"additional": [
|
"additional": [
|
||||||
"aarch64-apple-darwin",
|
|
||||||
"aarch64-unknown-linux-gnu",
|
|
||||||
"x86_64-apple-darwin",
|
"x86_64-apple-darwin",
|
||||||
|
"aarch64-apple-darwin",
|
||||||
"x86_64-unknown-linux-gnu",
|
"x86_64-unknown-linux-gnu",
|
||||||
|
"aarch64-unknown-linux-gnu",
|
||||||
|
"x86_64-unknown-linux-musl",
|
||||||
|
"aarch64-unknown-linux-musl",
|
||||||
"x86_64-pc-windows-msvc"
|
"x86_64-pc-windows-msvc"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,6 +12,8 @@
|
|||||||
// See the License for the specific language governing permissions and
|
// See the License for the specific language governing permissions and
|
||||||
// limitations under the License.
|
// limitations under the License.
|
||||||
|
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
use arrow_ipc::writer::FileWriter;
|
use arrow_ipc::writer::FileWriter;
|
||||||
use lancedb::ipc::ipc_file_to_batches;
|
use lancedb::ipc::ipc_file_to_batches;
|
||||||
use lancedb::table::{
|
use lancedb::table::{
|
||||||
@@ -226,6 +228,28 @@ impl Table {
|
|||||||
self.inner_ref()?.checkout_latest().await.default_error()
|
self.inner_ref()?.checkout_latest().await.default_error()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[napi(catch_unwind)]
|
||||||
|
pub async fn list_versions(&self) -> napi::Result<Vec<Version>> {
|
||||||
|
self.inner_ref()?
|
||||||
|
.list_versions()
|
||||||
|
.await
|
||||||
|
.map(|versions| {
|
||||||
|
versions
|
||||||
|
.iter()
|
||||||
|
.map(|version| Version {
|
||||||
|
version: version.version as i64,
|
||||||
|
timestamp: version.timestamp.timestamp_micros(),
|
||||||
|
metadata: version
|
||||||
|
.metadata
|
||||||
|
.iter()
|
||||||
|
.map(|(k, v)| (k.clone(), v.clone()))
|
||||||
|
.collect(),
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
})
|
||||||
|
.default_error()
|
||||||
|
}
|
||||||
|
|
||||||
#[napi(catch_unwind)]
|
#[napi(catch_unwind)]
|
||||||
pub async fn restore(&self) -> napi::Result<()> {
|
pub async fn restore(&self) -> napi::Result<()> {
|
||||||
self.inner_ref()?.restore().await.default_error()
|
self.inner_ref()?.restore().await.default_error()
|
||||||
@@ -466,3 +490,10 @@ impl From<lancedb::index::IndexStatistics> for IndexStatistics {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[napi(object)]
|
||||||
|
pub struct Version {
|
||||||
|
pub version: i64,
|
||||||
|
pub timestamp: i64,
|
||||||
|
pub metadata: HashMap<String, String>,
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.16.0"
|
current_version = "0.16.1-beta.0"
|
||||||
parse = """(?x)
|
parse = """(?x)
|
||||||
(?P<major>0|[1-9]\\d*)\\.
|
(?P<major>0|[1-9]\\d*)\\.
|
||||||
(?P<minor>0|[1-9]\\d*)\\.
|
(?P<minor>0|[1-9]\\d*)\\.
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-python"
|
name = "lancedb-python"
|
||||||
version = "0.16.0"
|
version = "0.16.1-beta.0"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
description = "Python bindings for LanceDB"
|
description = "Python bindings for LanceDB"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ name = "lancedb"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"deprecation",
|
"deprecation",
|
||||||
"nest-asyncio~=1.0",
|
"nest-asyncio~=1.0",
|
||||||
"pylance==0.19.3b1",
|
"pylance==0.20.0b2",
|
||||||
"tqdm>=4.27.0",
|
"tqdm>=4.27.0",
|
||||||
"pydantic>=1.10",
|
"pydantic>=1.10",
|
||||||
"packaging",
|
"packaging",
|
||||||
|
|||||||
@@ -370,11 +370,13 @@ class LanceQueryBuilder(ABC):
|
|||||||
----------
|
----------
|
||||||
limit: int
|
limit: int
|
||||||
The maximum number of results to return.
|
The maximum number of results to return.
|
||||||
By default the query is limited to the first 10.
|
The default query limit is 10 results.
|
||||||
Call this method and pass 0, a negative value,
|
For ANN/KNN queries, you must specify a limit.
|
||||||
or None to remove the limit.
|
Entering 0, a negative number, or None will reset
|
||||||
*WARNING* if you have a large dataset, removing
|
the limit to the default value of 10.
|
||||||
the limit can potentially result in reading a
|
*WARNING* if you have a large dataset, setting
|
||||||
|
the limit to a large number, e.g. the table size,
|
||||||
|
can potentially result in reading a
|
||||||
large amount of data into memory and cause
|
large amount of data into memory and cause
|
||||||
out of memory issues.
|
out of memory issues.
|
||||||
|
|
||||||
|
|||||||
@@ -78,6 +78,10 @@ class RemoteTable(Table):
|
|||||||
self.schema.metadata
|
self.schema.metadata
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def list_versions(self):
    """List all versions of the table

    Runs the wrapped table's ``list_versions()`` coroutine to completion on
    this table's event loop and returns its result.
    """
    pending = self._table.list_versions()
    return self._loop.run_until_complete(pending)
|
||||||
|
|
||||||
def to_arrow(self) -> pa.Table:
|
def to_arrow(self) -> pa.Table:
|
||||||
"""to_arrow() is not yet supported on LanceDB cloud."""
|
"""to_arrow() is not yet supported on LanceDB cloud."""
|
||||||
raise NotImplementedError("to_arrow() is not yet supported on LanceDB cloud.")
|
raise NotImplementedError("to_arrow() is not yet supported on LanceDB cloud.")
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ class CohereReranker(Reranker):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
model_name: str = "rerank-english-v2.0",
|
model_name: str = "rerank-english-v3.0",
|
||||||
column: str = "text",
|
column: str = "text",
|
||||||
top_n: Union[int, None] = None,
|
top_n: Union[int, None] = None,
|
||||||
return_score="relevance",
|
return_score="relevance",
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ import inspect
|
|||||||
import time
|
import time
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import timedelta
|
from datetime import datetime, timedelta
|
||||||
from functools import cached_property
|
from functools import cached_property
|
||||||
from typing import (
|
from typing import (
|
||||||
TYPE_CHECKING,
|
TYPE_CHECKING,
|
||||||
@@ -1015,15 +1015,36 @@ class Table(ABC):
|
|||||||
@abstractmethod
|
@abstractmethod
|
||||||
def checkout(self):
|
def checkout(self):
|
||||||
"""
|
"""
|
||||||
TODO comments
|
Checks out a specific version of the Table
|
||||||
|
|
||||||
|
Any read operation on the table will now access the data at the checked out
|
||||||
|
version. As a consequence, calling this method will disable any read consistency
|
||||||
|
interval that was previously set.
|
||||||
|
|
||||||
|
This is a read-only operation that turns the table into a sort of "view"
|
||||||
|
or "detached head". Other table instances will not be affected. To make the
|
||||||
|
change permanent you can use the `restore` method.
|
||||||
|
|
||||||
|
Any operation that modifies the table will fail while the table is in a checked
|
||||||
|
out state.
|
||||||
|
|
||||||
|
To return the table to a normal state use `checkout_latest`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def checkout_latest(self):
|
def checkout_latest(self):
|
||||||
"""
|
"""
|
||||||
TODO comments
|
Ensures the table is pointing at the latest version
|
||||||
|
|
||||||
|
This can be used to manually update a table when the read_consistency_interval
|
||||||
|
is None
|
||||||
|
It can also be used to undo a `checkout` operation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def list_versions(self):
|
||||||
|
"""List all versions of the table"""
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def _dataset_uri(self) -> str:
|
def _dataset_uri(self) -> str:
|
||||||
return _table_uri(self._conn.uri, self.name)
|
return _table_uri(self._conn.uri, self.name)
|
||||||
@@ -2914,6 +2935,19 @@ class AsyncTable:
|
|||||||
"""
|
"""
|
||||||
return await self._inner.version()
|
return await self._inner.version()
|
||||||
|
|
||||||
|
async def list_versions(self):
    """
    List all versions of the table

    Returns the entries produced by the inner table's ``list_versions()``.
    Each entry's ``"timestamp"`` value (integer nanoseconds) is replaced in
    place by a ``datetime`` built with ``datetime.fromtimestamp``, truncated
    to microsecond resolution.
    """
    versions = await self._inner.list_versions()
    for v in versions:
        # Split with integer arithmetic. Dividing by the float 1e9 first
        # converts the nanosecond count to a float, which cannot represent
        # current epoch values exactly and can round into the next second.
        seconds, nanos = divmod(v["timestamp"], 1_000_000_000)
        v["timestamp"] = datetime.fromtimestamp(seconds) + timedelta(
            microseconds=nanos // 1_000
        )

    return versions
|
||||||
|
|
||||||
async def checkout(self, version):
|
async def checkout(self, version):
|
||||||
"""
|
"""
|
||||||
Checks out a specific version of the Table
|
Checks out a specific version of the Table
|
||||||
|
|||||||
@@ -103,6 +103,47 @@ async def test_async_remote_db():
|
|||||||
assert table_names == []
|
assert table_names == []
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_async_checkout():
    """Row counts must follow the version selected by checkout().

    The mock server answers the describe request with a fixed table
    description and answers every other request with a row count chosen
    from the ``version`` field of the request body.
    """

    def handler(request):
        if request.path == "/v1/table/test/describe/":
            request.send_response(200)
            request.send_header("Content-Type", "application/json")
            request.end_headers()
            response = json.dumps({"version": 42, "schema": {"fields": []}})
            request.wfile.write(response.encode())
            return

        content_len = int(request.headers.get("Content-Length"))
        body = json.loads(request.rfile.read(content_len))

        # A ``None`` version is what the client sends when no version is
        # checked out, i.e. the latest version.
        count = 0
        if body["version"] == 1:
            count = 100
        elif body["version"] == 2:
            count = 200
        elif body["version"] is None:
            count = 300

        request.send_response(200)
        request.send_header("Content-Type", "application/json")
        request.end_headers()
        request.wfile.write(json.dumps(count).encode())

    async with mock_lancedb_connection_async(handler) as db:
        table = await db.open_table("test")
        assert await table.count_rows() == 300
        await table.checkout(1)
        assert await table.count_rows() == 100
        await table.checkout(2)
        assert await table.count_rows() == 200
        await table.checkout_latest()
        assert await table.count_rows() == 300
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_http_error():
|
async def test_http_error():
|
||||||
request_id_holder = {"request_id": None}
|
request_id_holder = {"request_id": None}
|
||||||
@@ -188,6 +229,7 @@ def test_query_sync_minimal():
|
|||||||
"ef": None,
|
"ef": None,
|
||||||
"vector": [1.0, 2.0, 3.0],
|
"vector": [1.0, 2.0, 3.0],
|
||||||
"nprobes": 20,
|
"nprobes": 20,
|
||||||
|
"version": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
return pa.table({"id": [1, 2, 3]})
|
return pa.table({"id": [1, 2, 3]})
|
||||||
@@ -205,6 +247,7 @@ def test_query_sync_empty_query():
|
|||||||
"filter": "true",
|
"filter": "true",
|
||||||
"vector": [],
|
"vector": [],
|
||||||
"columns": ["id"],
|
"columns": ["id"],
|
||||||
|
"version": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
return pa.table({"id": [1, 2, 3]})
|
return pa.table({"id": [1, 2, 3]})
|
||||||
@@ -230,6 +273,7 @@ def test_query_sync_maximal():
|
|||||||
"vector_column": "vector2",
|
"vector_column": "vector2",
|
||||||
"fast_search": True,
|
"fast_search": True,
|
||||||
"with_row_id": True,
|
"with_row_id": True,
|
||||||
|
"version": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
return pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})
|
return pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})
|
||||||
@@ -268,6 +312,7 @@ def test_query_sync_fts():
|
|||||||
},
|
},
|
||||||
"k": 10,
|
"k": 10,
|
||||||
"vector": [],
|
"vector": [],
|
||||||
|
"version": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
return pa.table({"id": [1, 2, 3]})
|
return pa.table({"id": [1, 2, 3]})
|
||||||
@@ -284,6 +329,7 @@ def test_query_sync_fts():
|
|||||||
"k": 42,
|
"k": 42,
|
||||||
"vector": [],
|
"vector": [],
|
||||||
"with_row_id": True,
|
"with_row_id": True,
|
||||||
|
"version": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
return pa.table({"id": [1, 2, 3]})
|
return pa.table({"id": [1, 2, 3]})
|
||||||
@@ -309,6 +355,7 @@ def test_query_sync_hybrid():
|
|||||||
"k": 42,
|
"k": 42,
|
||||||
"vector": [],
|
"vector": [],
|
||||||
"with_row_id": True,
|
"with_row_id": True,
|
||||||
|
"version": None,
|
||||||
}
|
}
|
||||||
return pa.table({"_rowid": [1, 2, 3], "_score": [0.1, 0.2, 0.3]})
|
return pa.table({"_rowid": [1, 2, 3], "_score": [0.1, 0.2, 0.3]})
|
||||||
else:
|
else:
|
||||||
@@ -322,6 +369,7 @@ def test_query_sync_hybrid():
|
|||||||
"nprobes": 20,
|
"nprobes": 20,
|
||||||
"ef": None,
|
"ef": None,
|
||||||
"with_row_id": True,
|
"with_row_id": True,
|
||||||
|
"version": None,
|
||||||
}
|
}
|
||||||
return pa.table({"_rowid": [1, 2, 3], "_distance": [0.1, 0.2, 0.3]})
|
return pa.table({"_rowid": [1, 2, 3], "_distance": [0.1, 0.2, 0.3]})
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ use lancedb::table::{
|
|||||||
use pyo3::{
|
use pyo3::{
|
||||||
exceptions::{PyRuntimeError, PyValueError},
|
exceptions::{PyRuntimeError, PyValueError},
|
||||||
pyclass, pymethods,
|
pyclass, pymethods,
|
||||||
types::{PyDict, PyDictMethods, PyString},
|
types::{IntoPyDict, PyDict, PyDictMethods, PyString},
|
||||||
Bound, FromPyObject, PyAny, PyRef, PyResult, Python, ToPyObject,
|
Bound, FromPyObject, PyAny, PyRef, PyResult, Python, ToPyObject,
|
||||||
};
|
};
|
||||||
use pyo3_asyncio_0_21::tokio::future_into_py;
|
use pyo3_asyncio_0_21::tokio::future_into_py;
|
||||||
@@ -246,6 +246,33 @@ impl Table {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn list_versions(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||||
|
let inner = self_.inner_ref()?.clone();
|
||||||
|
future_into_py(self_.py(), async move {
|
||||||
|
let versions = inner.list_versions().await.infer_error()?;
|
||||||
|
let versions_as_dict = Python::with_gil(|py| {
|
||||||
|
versions
|
||||||
|
.iter()
|
||||||
|
.map(|v| {
|
||||||
|
let dict = PyDict::new_bound(py);
|
||||||
|
dict.set_item("version", v.version).unwrap();
|
||||||
|
dict.set_item(
|
||||||
|
"timestamp",
|
||||||
|
v.timestamp.timestamp_nanos_opt().unwrap_or_default(),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let tup: Vec<(&String, &String)> = v.metadata.iter().collect();
|
||||||
|
dict.set_item("metadata", tup.into_py_dict(py)).unwrap();
|
||||||
|
dict.to_object(py)
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok(versions_as_dict)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
pub fn checkout(self_: PyRef<'_, Self>, version: u64) -> PyResult<Bound<'_, PyAny>> {
|
pub fn checkout(self_: PyRef<'_, Self>, version: u64) -> PyResult<Bound<'_, PyAny>> {
|
||||||
let inner = self_.inner_ref()?.clone();
|
let inner = self_.inner_ref()?.clone();
|
||||||
future_into_py(self_.py(), async move {
|
future_into_py(self_.py(), async move {
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ use http::header::CONTENT_TYPE;
|
|||||||
use http::StatusCode;
|
use http::StatusCode;
|
||||||
use lance::arrow::json::JsonSchema;
|
use lance::arrow::json::JsonSchema;
|
||||||
use lance::dataset::scanner::DatasetRecordBatchStream;
|
use lance::dataset::scanner::DatasetRecordBatchStream;
|
||||||
use lance::dataset::{ColumnAlteration, NewColumnTransform};
|
use lance::dataset::{ColumnAlteration, NewColumnTransform, Version};
|
||||||
use lance_datafusion::exec::OneShotExec;
|
use lance_datafusion::exec::OneShotExec;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use tokio::sync::RwLock;
|
use tokio::sync::RwLock;
|
||||||
@@ -363,6 +363,34 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
|
|||||||
message: "restore is not supported on LanceDB cloud.".into(),
|
message: "restore is not supported on LanceDB cloud.".into(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn list_versions(&self) -> Result<Vec<Version>> {
|
||||||
|
let request = self
|
||||||
|
.client
|
||||||
|
.post(&format!("/v1/table/{}/version/list/", self.name));
|
||||||
|
let (request_id, response) = self.client.send(request, true).await?;
|
||||||
|
let response = self.check_table_response(&request_id, response).await?;
|
||||||
|
|
||||||
|
#[derive(Deserialize)]
|
||||||
|
struct ListVersionsResponse {
|
||||||
|
versions: Vec<Version>,
|
||||||
|
}
|
||||||
|
|
||||||
|
let body = response.text().await.err_to_http(request_id.clone())?;
|
||||||
|
let body: ListVersionsResponse =
|
||||||
|
serde_json::from_str(&body).map_err(|err| Error::Http {
|
||||||
|
source: format!(
|
||||||
|
"Failed to parse list_versions response: {}, body: {}",
|
||||||
|
err, body
|
||||||
|
)
|
||||||
|
.into(),
|
||||||
|
request_id,
|
||||||
|
status_code: None,
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(body.versions)
|
||||||
|
}
|
||||||
|
|
||||||
async fn schema(&self) -> Result<SchemaRef> {
|
async fn schema(&self) -> Result<SchemaRef> {
|
||||||
let schema = self.describe().await?.schema;
|
let schema = self.describe().await?.schema;
|
||||||
Ok(Arc::new(schema.try_into()?))
|
Ok(Arc::new(schema.try_into()?))
|
||||||
@@ -775,6 +803,7 @@ mod tests {
|
|||||||
use arrow::{array::AsArray, compute::concat_batches, datatypes::Int32Type};
|
use arrow::{array::AsArray, compute::concat_batches, datatypes::Int32Type};
|
||||||
use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator};
|
use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator};
|
||||||
use arrow_schema::{DataType, Field, Schema};
|
use arrow_schema::{DataType, Field, Schema};
|
||||||
|
use chrono::{DateTime, Utc};
|
||||||
use futures::{future::BoxFuture, StreamExt, TryFutureExt};
|
use futures::{future::BoxFuture, StreamExt, TryFutureExt};
|
||||||
use lance_index::scalar::FullTextSearchQuery;
|
use lance_index::scalar::FullTextSearchQuery;
|
||||||
use reqwest::Body;
|
use reqwest::Body;
|
||||||
@@ -1489,6 +1518,51 @@ mod tests {
|
|||||||
assert_eq!(indices, expected);
|
assert_eq!(indices, expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_list_versions() {
|
||||||
|
let table = Table::new_with_handler("my_table", |request| {
|
||||||
|
assert_eq!(request.method(), "POST");
|
||||||
|
assert_eq!(request.url().path(), "/v1/table/my_table/version/list/");
|
||||||
|
|
||||||
|
let version1 = lance::dataset::Version {
|
||||||
|
version: 1,
|
||||||
|
timestamp: "2024-01-01T00:00:00Z".parse().unwrap(),
|
||||||
|
metadata: Default::default(),
|
||||||
|
};
|
||||||
|
let version2 = lance::dataset::Version {
|
||||||
|
version: 2,
|
||||||
|
timestamp: "2024-02-01T00:00:00Z".parse().unwrap(),
|
||||||
|
metadata: Default::default(),
|
||||||
|
};
|
||||||
|
let response_body = serde_json::json!({
|
||||||
|
"versions": [
|
||||||
|
version1,
|
||||||
|
version2,
|
||||||
|
]
|
||||||
|
});
|
||||||
|
let response_body = serde_json::to_string(&response_body).unwrap();
|
||||||
|
|
||||||
|
http::Response::builder()
|
||||||
|
.status(200)
|
||||||
|
.body(response_body)
|
||||||
|
.unwrap()
|
||||||
|
});
|
||||||
|
|
||||||
|
let versions = table.list_versions().await.unwrap();
|
||||||
|
assert_eq!(versions.len(), 2);
|
||||||
|
assert_eq!(versions[0].version, 1);
|
||||||
|
assert_eq!(
|
||||||
|
versions[0].timestamp,
|
||||||
|
"2024-01-01T00:00:00Z".parse::<DateTime<Utc>>().unwrap()
|
||||||
|
);
|
||||||
|
assert_eq!(versions[1].version, 2);
|
||||||
|
assert_eq!(
|
||||||
|
versions[1].timestamp,
|
||||||
|
"2024-02-01T00:00:00Z".parse::<DateTime<Utc>>().unwrap()
|
||||||
|
);
|
||||||
|
// assert_eq!(versions, expected);
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_index_stats() {
|
async fn test_index_stats() {
|
||||||
let table = Table::new_with_handler("my_table", |request| {
|
let table = Table::new_with_handler("my_table", |request| {
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ pub use lance::dataset::ColumnAlteration;
|
|||||||
pub use lance::dataset::NewColumnTransform;
|
pub use lance::dataset::NewColumnTransform;
|
||||||
pub use lance::dataset::ReadParams;
|
pub use lance::dataset::ReadParams;
|
||||||
use lance::dataset::{
|
use lance::dataset::{
|
||||||
Dataset, UpdateBuilder as LanceUpdateBuilder, WhenMatched, WriteMode, WriteParams,
|
Dataset, UpdateBuilder as LanceUpdateBuilder, Version, WhenMatched, WriteMode, WriteParams,
|
||||||
};
|
};
|
||||||
use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatchedBySource};
|
use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatchedBySource};
|
||||||
use lance::io::WrappingObjectStore;
|
use lance::io::WrappingObjectStore;
|
||||||
@@ -426,6 +426,7 @@ pub(crate) trait TableInternal: std::fmt::Display + std::fmt::Debug + Send + Syn
|
|||||||
async fn checkout(&self, version: u64) -> Result<()>;
|
async fn checkout(&self, version: u64) -> Result<()>;
|
||||||
async fn checkout_latest(&self) -> Result<()>;
|
async fn checkout_latest(&self) -> Result<()>;
|
||||||
async fn restore(&self) -> Result<()>;
|
async fn restore(&self) -> Result<()>;
|
||||||
|
async fn list_versions(&self) -> Result<Vec<Version>>;
|
||||||
async fn table_definition(&self) -> Result<TableDefinition>;
|
async fn table_definition(&self) -> Result<TableDefinition>;
|
||||||
fn dataset_uri(&self) -> &str;
|
fn dataset_uri(&self) -> &str;
|
||||||
}
|
}
|
||||||
@@ -955,6 +956,11 @@ impl Table {
|
|||||||
self.inner.restore().await
|
self.inner.restore().await
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// List all the versions of the table
|
||||||
|
pub async fn list_versions(&self) -> Result<Vec<Version>> {
|
||||||
|
self.inner.list_versions().await
|
||||||
|
}
|
||||||
|
|
||||||
/// List all indices that have been created with [`Self::create_index`]
|
/// List all indices that have been created with [`Self::create_index`]
|
||||||
pub async fn list_indices(&self) -> Result<Vec<IndexConfig>> {
|
pub async fn list_indices(&self) -> Result<Vec<IndexConfig>> {
|
||||||
self.inner.list_indices().await
|
self.inner.list_indices().await
|
||||||
@@ -1319,7 +1325,7 @@ impl NativeTable {
|
|||||||
let (indices, mf) = futures::try_join!(dataset.load_indices(), dataset.latest_manifest())?;
|
let (indices, mf) = futures::try_join!(dataset.load_indices(), dataset.latest_manifest())?;
|
||||||
Ok(indices
|
Ok(indices
|
||||||
.iter()
|
.iter()
|
||||||
.map(|i| VectorIndex::new_from_format(&mf, i))
|
.map(|i| VectorIndex::new_from_format(&(mf.0), i))
|
||||||
.collect())
|
.collect())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1707,6 +1713,10 @@ impl TableInternal for NativeTable {
|
|||||||
self.dataset.reload().await
|
self.dataset.reload().await
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the versions reported by the underlying dataset's `versions()`.
async fn list_versions(&self) -> Result<Vec<Version>> {
    let dataset = self.dataset.get().await?;
    let versions = dataset.versions().await?;
    Ok(versions)
}
|
||||||
|
|
||||||
async fn restore(&self) -> Result<()> {
|
async fn restore(&self) -> Result<()> {
|
||||||
let version =
|
let version =
|
||||||
self.dataset
|
self.dataset
|
||||||
|
|||||||
Reference in New Issue
Block a user