mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-13 07:12:57 +00:00
Compare commits
15 Commits
remote-ver
...
rmeng/pool
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cc7a503faa | ||
|
|
2ded17452b | ||
|
|
dfd9d2ac99 | ||
|
|
162880140e | ||
|
|
99d9ced6d5 | ||
|
|
96933d7df8 | ||
|
|
d369233b3d | ||
|
|
43a670ed4b | ||
|
|
cb9a00a28d | ||
|
|
72af977a73 | ||
|
|
7cecb71df0 | ||
|
|
285071e5c8 | ||
|
|
114866fbcf | ||
|
|
5387c0e243 | ||
|
|
53d1535de1 |
@@ -1,5 +1,5 @@
|
|||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.13.0"
|
current_version = "0.13.1-beta.0"
|
||||||
parse = """(?x)
|
parse = """(?x)
|
||||||
(?P<major>0|[1-9]\\d*)\\.
|
(?P<major>0|[1-9]\\d*)\\.
|
||||||
(?P<minor>0|[1-9]\\d*)\\.
|
(?P<minor>0|[1-9]\\d*)\\.
|
||||||
@@ -87,6 +87,16 @@ glob = "node/package.json"
|
|||||||
replace = "\"@lancedb/vectordb-linux-x64-gnu\": \"{new_version}\""
|
replace = "\"@lancedb/vectordb-linux-x64-gnu\": \"{new_version}\""
|
||||||
search = "\"@lancedb/vectordb-linux-x64-gnu\": \"{current_version}\""
|
search = "\"@lancedb/vectordb-linux-x64-gnu\": \"{current_version}\""
|
||||||
|
|
||||||
|
[[tool.bumpversion.files]]
|
||||||
|
glob = "node/package.json"
|
||||||
|
replace = "\"@lancedb/vectordb-linux-arm64-musl\": \"{new_version}\""
|
||||||
|
search = "\"@lancedb/vectordb-linux-arm64-musl\": \"{current_version}\""
|
||||||
|
|
||||||
|
[[tool.bumpversion.files]]
|
||||||
|
glob = "node/package.json"
|
||||||
|
replace = "\"@lancedb/vectordb-linux-x64-musl\": \"{new_version}\""
|
||||||
|
search = "\"@lancedb/vectordb-linux-x64-musl\": \"{current_version}\""
|
||||||
|
|
||||||
[[tool.bumpversion.files]]
|
[[tool.bumpversion.files]]
|
||||||
glob = "node/package.json"
|
glob = "node/package.json"
|
||||||
replace = "\"@lancedb/vectordb-win32-x64-msvc\": \"{new_version}\""
|
replace = "\"@lancedb/vectordb-win32-x64-msvc\": \"{new_version}\""
|
||||||
|
|||||||
@@ -31,6 +31,9 @@ rustflags = [
|
|||||||
[target.x86_64-unknown-linux-gnu]
|
[target.x86_64-unknown-linux-gnu]
|
||||||
rustflags = ["-C", "target-cpu=haswell", "-C", "target-feature=+avx2,+fma,+f16c"]
|
rustflags = ["-C", "target-cpu=haswell", "-C", "target-feature=+avx2,+fma,+f16c"]
|
||||||
|
|
||||||
|
[target.x86_64-unknown-linux-musl]
|
||||||
|
rustflags = ["-C", "target-cpu=haswell", "-C", "target-feature=-crt-static,+avx2,+fma,+f16c"]
|
||||||
|
|
||||||
[target.aarch64-apple-darwin]
|
[target.aarch64-apple-darwin]
|
||||||
rustflags = ["-C", "target-cpu=apple-m1", "-C", "target-feature=+neon,+fp16,+fhm,+dotprod"]
|
rustflags = ["-C", "target-cpu=apple-m1", "-C", "target-feature=+neon,+fp16,+fhm,+dotprod"]
|
||||||
|
|
||||||
|
|||||||
120
.github/workflows/npm-publish.yml
vendored
120
.github/workflows/npm-publish.yml
vendored
@@ -101,7 +101,7 @@ jobs:
|
|||||||
path: |
|
path: |
|
||||||
nodejs/dist/*.node
|
nodejs/dist/*.node
|
||||||
|
|
||||||
node-linux:
|
node-linux-gnu:
|
||||||
name: vectordb (${{ matrix.config.arch}}-unknown-linux-gnu)
|
name: vectordb (${{ matrix.config.arch}}-unknown-linux-gnu)
|
||||||
runs-on: ${{ matrix.config.runner }}
|
runs-on: ${{ matrix.config.runner }}
|
||||||
# Only runs on tags that matches the make-release action
|
# Only runs on tags that matches the make-release action
|
||||||
@@ -137,11 +137,63 @@ jobs:
|
|||||||
- name: Upload Linux Artifacts
|
- name: Upload Linux Artifacts
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: node-native-linux-${{ matrix.config.arch }}
|
name: node-native-linux-${{ matrix.config.arch }}-gnu
|
||||||
path: |
|
path: |
|
||||||
node/dist/lancedb-vectordb-linux*.tgz
|
node/dist/lancedb-vectordb-linux*.tgz
|
||||||
|
|
||||||
nodejs-linux:
|
node-linux-musl:
|
||||||
|
name: vectordb (${{ matrix.config.arch}}-unknown-linux-musl)
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
container: alpine:edge
|
||||||
|
# Only runs on tags that matches the make-release action
|
||||||
|
if: startsWith(github.ref, 'refs/tags/v')
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
config:
|
||||||
|
- arch: x86_64
|
||||||
|
- arch: aarch64
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
- name: Install common dependencies
|
||||||
|
run: |
|
||||||
|
apk add protobuf-dev curl clang mold grep npm bash
|
||||||
|
curl --proto '=https' --tlsv1.3 -sSf https://raw.githubusercontent.com/rust-lang/rustup/refs/heads/master/rustup-init.sh | sh -s -- -y --default-toolchain 1.80.0
|
||||||
|
echo "source $HOME/.cargo/env" >> saved_env
|
||||||
|
echo "export CC=clang" >> saved_env
|
||||||
|
echo "export RUSTFLAGS='-Ctarget-cpu=haswell -Ctarget-feature=-crt-static,+avx2,+fma,+f16c -Clinker=clang -Clink-arg=-fuse-ld=mold'" >> saved_env
|
||||||
|
- name: Configure aarch64 build
|
||||||
|
if: ${{ matrix.config.arch == 'aarch64' }}
|
||||||
|
run: |
|
||||||
|
source "$HOME/.cargo/env"
|
||||||
|
rustup target add aarch64-unknown-linux-musl --toolchain 1.80.0
|
||||||
|
crt=$(realpath $(dirname $(rustup which rustc))/../lib/rustlib/aarch64-unknown-linux-musl/lib/self-contained)
|
||||||
|
sysroot_lib=/usr/aarch64-unknown-linux-musl/usr/lib
|
||||||
|
apk_url=https://dl-cdn.alpinelinux.org/alpine/latest-stable/main/aarch64/
|
||||||
|
curl -sSf $apk_url > apk_list
|
||||||
|
for pkg in gcc libgcc musl; do curl -sSf $apk_url$(cat apk_list | grep -oP '(?<=")'$pkg'-\d.*?(?=")') | tar zxf -; done
|
||||||
|
mkdir -p $sysroot_lib
|
||||||
|
echo 'GROUP ( libgcc_s.so.1 -lgcc )' > $sysroot_lib/libgcc_s.so
|
||||||
|
cp usr/lib/libgcc_s.so.1 $sysroot_lib
|
||||||
|
cp usr/lib/gcc/aarch64-alpine-linux-musl/*/libgcc.a $sysroot_lib
|
||||||
|
cp lib/ld-musl-aarch64.so.1 $sysroot_lib/libc.so
|
||||||
|
echo '!<arch>' > $sysroot_lib/libdl.a
|
||||||
|
(cd $crt && cp crti.o crtbeginS.o crtendS.o crtn.o -t $sysroot_lib)
|
||||||
|
echo "export CARGO_BUILD_TARGET=aarch64-unknown-linux-musl" >> saved_env
|
||||||
|
echo "export RUSTFLAGS='-Ctarget-cpu=apple-m1 -Ctarget-feature=-crt-static,+neon,+fp16,+fhm,+dotprod -Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=--target=aarch64-unknown-linux-musl -Clink-arg=--sysroot=/usr/aarch64-unknown-linux-musl -Clink-arg=-lc'" >> saved_env
|
||||||
|
- name: Build Linux Artifacts
|
||||||
|
run: |
|
||||||
|
source ./saved_env
|
||||||
|
bash ci/manylinux_node/build_vectordb.sh ${{ matrix.config.arch }}
|
||||||
|
- name: Upload Linux Artifacts
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: node-native-linux-${{ matrix.config.arch }}-musl
|
||||||
|
path: |
|
||||||
|
node/dist/lancedb-vectordb-linux*.tgz
|
||||||
|
|
||||||
|
nodejs-linux-gnu:
|
||||||
name: lancedb (${{ matrix.config.arch}}-unknown-linux-gnu
|
name: lancedb (${{ matrix.config.arch}}-unknown-linux-gnu
|
||||||
runs-on: ${{ matrix.config.runner }}
|
runs-on: ${{ matrix.config.runner }}
|
||||||
# Only runs on tags that matches the make-release action
|
# Only runs on tags that matches the make-release action
|
||||||
@@ -178,7 +230,7 @@ jobs:
|
|||||||
- name: Upload Linux Artifacts
|
- name: Upload Linux Artifacts
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: nodejs-native-linux-${{ matrix.config.arch }}
|
name: nodejs-native-linux-${{ matrix.config.arch }}-gnu
|
||||||
path: |
|
path: |
|
||||||
nodejs/dist/*.node
|
nodejs/dist/*.node
|
||||||
# The generic files are the same in all distros so we just pick
|
# The generic files are the same in all distros so we just pick
|
||||||
@@ -192,6 +244,62 @@ jobs:
|
|||||||
nodejs/dist/*
|
nodejs/dist/*
|
||||||
!nodejs/dist/*.node
|
!nodejs/dist/*.node
|
||||||
|
|
||||||
|
nodejs-linux-musl:
|
||||||
|
name: lancedb (${{ matrix.config.arch}}-unknown-linux-musl
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
container: alpine:edge
|
||||||
|
# Only runs on tags that matches the make-release action
|
||||||
|
if: startsWith(github.ref, 'refs/tags/v')
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
config:
|
||||||
|
- arch: x86_64
|
||||||
|
- arch: aarch64
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
- name: Install common dependencies
|
||||||
|
run: |
|
||||||
|
apk add protobuf-dev curl clang mold grep npm bash openssl-dev openssl-libs-static
|
||||||
|
curl --proto '=https' --tlsv1.3 -sSf https://raw.githubusercontent.com/rust-lang/rustup/refs/heads/master/rustup-init.sh | sh -s -- -y --default-toolchain 1.80.0
|
||||||
|
echo "source $HOME/.cargo/env" >> saved_env
|
||||||
|
echo "export CC=clang" >> saved_env
|
||||||
|
echo "export RUSTFLAGS='-Ctarget-cpu=haswell -Ctarget-feature=-crt-static,+avx2,+fma,+f16c -Clinker=clang -Clink-arg=-fuse-ld=mold'" >> saved_env
|
||||||
|
echo "export X86_64_UNKNOWN_LINUX_MUSL_OPENSSL_INCLUDE_DIR=/usr/include" >> saved_env
|
||||||
|
echo "export X86_64_UNKNOWN_LINUX_MUSL_OPENSSL_LIB_DIR=/usr/lib" >> saved_env
|
||||||
|
- name: Configure aarch64 build
|
||||||
|
if: ${{ matrix.config.arch == 'aarch64' }}
|
||||||
|
run: |
|
||||||
|
source "$HOME/.cargo/env"
|
||||||
|
rustup target add aarch64-unknown-linux-musl --toolchain 1.80.0
|
||||||
|
crt=$(realpath $(dirname $(rustup which rustc))/../lib/rustlib/aarch64-unknown-linux-musl/lib/self-contained)
|
||||||
|
sysroot_lib=/usr/aarch64-unknown-linux-musl/usr/lib
|
||||||
|
apk_url=https://dl-cdn.alpinelinux.org/alpine/latest-stable/main/aarch64/
|
||||||
|
curl -sSf $apk_url > apk_list
|
||||||
|
for pkg in gcc libgcc musl openssl-dev openssl-libs-static; do curl -sSf $apk_url$(cat apk_list | grep -oP '(?<=")'$pkg'-\d.*?(?=")') | tar zxf -; done
|
||||||
|
mkdir -p $sysroot_lib
|
||||||
|
echo 'GROUP ( libgcc_s.so.1 -lgcc )' > $sysroot_lib/libgcc_s.so
|
||||||
|
cp usr/lib/libgcc_s.so.1 $sysroot_lib
|
||||||
|
cp usr/lib/gcc/aarch64-alpine-linux-musl/*/libgcc.a $sysroot_lib
|
||||||
|
cp lib/ld-musl-aarch64.so.1 $sysroot_lib/libc.so
|
||||||
|
echo '!<arch>' > $sysroot_lib/libdl.a
|
||||||
|
(cd $crt && cp crti.o crtbeginS.o crtendS.o crtn.o -t $sysroot_lib)
|
||||||
|
echo "export CARGO_BUILD_TARGET=aarch64-unknown-linux-musl" >> saved_env
|
||||||
|
echo "export RUSTFLAGS='-Ctarget-feature=-crt-static,+neon,+fp16,+fhm,+dotprod -Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=--target=aarch64-unknown-linux-musl -Clink-arg=--sysroot=/usr/aarch64-unknown-linux-musl -Clink-arg=-lc'" >> saved_env
|
||||||
|
echo "export AARCH64_UNKNOWN_LINUX_MUSL_OPENSSL_INCLUDE_DIR=$(realpath usr/include)" >> saved_env
|
||||||
|
echo "export AARCH64_UNKNOWN_LINUX_MUSL_OPENSSL_LIB_DIR=$(realpath usr/lib)" >> saved_env
|
||||||
|
- name: Build Linux Artifacts
|
||||||
|
run: |
|
||||||
|
source ./saved_env
|
||||||
|
bash ci/manylinux_node/build_lancedb.sh ${{ matrix.config.arch }}
|
||||||
|
- name: Upload Linux Artifacts
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: nodejs-native-linux-${{ matrix.config.arch }}-musl
|
||||||
|
path: |
|
||||||
|
nodejs/dist/*.node
|
||||||
|
|
||||||
node-windows:
|
node-windows:
|
||||||
name: vectordb ${{ matrix.target }}
|
name: vectordb ${{ matrix.target }}
|
||||||
runs-on: windows-2022
|
runs-on: windows-2022
|
||||||
@@ -460,7 +568,7 @@ jobs:
|
|||||||
|
|
||||||
release:
|
release:
|
||||||
name: vectordb NPM Publish
|
name: vectordb NPM Publish
|
||||||
needs: [node, node-macos, node-linux, node-windows]
|
needs: [node, node-macos, node-linux-gnu, node-linux-musl, node-windows]
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# Only runs on tags that matches the make-release action
|
# Only runs on tags that matches the make-release action
|
||||||
if: startsWith(github.ref, 'refs/tags/v')
|
if: startsWith(github.ref, 'refs/tags/v')
|
||||||
@@ -500,7 +608,7 @@ jobs:
|
|||||||
|
|
||||||
release-nodejs:
|
release-nodejs:
|
||||||
name: lancedb NPM Publish
|
name: lancedb NPM Publish
|
||||||
needs: [nodejs-macos, nodejs-linux, nodejs-windows]
|
needs: [nodejs-macos, nodejs-linux-gnu, nodejs-linux-musl, nodejs-windows]
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# Only runs on tags that matches the make-release action
|
# Only runs on tags that matches the make-release action
|
||||||
if: startsWith(github.ref, 'refs/tags/v')
|
if: startsWith(github.ref, 'refs/tags/v')
|
||||||
|
|||||||
16
Cargo.toml
16
Cargo.toml
@@ -21,15 +21,15 @@ categories = ["database-implementations"]
|
|||||||
rust-version = "1.80.0" # TODO: lower this once we upgrade Lance again.
|
rust-version = "1.80.0" # TODO: lower this once we upgrade Lance again.
|
||||||
|
|
||||||
[workspace.dependencies]
|
[workspace.dependencies]
|
||||||
lance = { "version" = "=0.19.3", "features" = [
|
lance = { "version" = "=0.20.0", "features" = [
|
||||||
"dynamodb",
|
"dynamodb",
|
||||||
], git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
], git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||||
lance-index = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
lance-index = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||||
lance-linalg = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
lance-linalg = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||||
lance-table = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
lance-table = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||||
lance-testing = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
lance-testing = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||||
lance-datafusion = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
lance-datafusion = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||||
lance-encoding = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
lance-encoding = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||||
# Note that this one does not include pyarrow
|
# Note that this one does not include pyarrow
|
||||||
arrow = { version = "52.2", optional = false }
|
arrow = { version = "52.2", optional = false }
|
||||||
arrow-array = "52.2"
|
arrow-array = "52.2"
|
||||||
|
|||||||
@@ -11,7 +11,8 @@ fi
|
|||||||
export OPENSSL_STATIC=1
|
export OPENSSL_STATIC=1
|
||||||
export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
|
export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
|
||||||
|
|
||||||
source $HOME/.bashrc
|
#Alpine doesn't have .bashrc
|
||||||
|
FILE=$HOME/.bashrc && test -f $FILE && source $FILE
|
||||||
|
|
||||||
cd nodejs
|
cd nodejs
|
||||||
npm ci
|
npm ci
|
||||||
|
|||||||
@@ -5,13 +5,14 @@ ARCH=${1:-x86_64}
|
|||||||
|
|
||||||
if [ "$ARCH" = "x86_64" ]; then
|
if [ "$ARCH" = "x86_64" ]; then
|
||||||
export OPENSSL_LIB_DIR=/usr/local/lib64/
|
export OPENSSL_LIB_DIR=/usr/local/lib64/
|
||||||
else
|
else
|
||||||
export OPENSSL_LIB_DIR=/usr/local/lib/
|
export OPENSSL_LIB_DIR=/usr/local/lib/
|
||||||
fi
|
fi
|
||||||
export OPENSSL_STATIC=1
|
export OPENSSL_STATIC=1
|
||||||
export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
|
export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
|
||||||
|
|
||||||
source $HOME/.bashrc
|
#Alpine doesn't have .bashrc
|
||||||
|
FILE=$HOME/.bashrc && test -f $FILE && source $FILE
|
||||||
|
|
||||||
cd node
|
cd node
|
||||||
npm ci
|
npm ci
|
||||||
|
|||||||
@@ -138,6 +138,7 @@ nav:
|
|||||||
- Jina Reranker: reranking/jina.md
|
- Jina Reranker: reranking/jina.md
|
||||||
- OpenAI Reranker: reranking/openai.md
|
- OpenAI Reranker: reranking/openai.md
|
||||||
- AnswerDotAi Rerankers: reranking/answerdotai.md
|
- AnswerDotAi Rerankers: reranking/answerdotai.md
|
||||||
|
- Voyage AI Rerankers: reranking/voyageai.md
|
||||||
- Building Custom Rerankers: reranking/custom_reranker.md
|
- Building Custom Rerankers: reranking/custom_reranker.md
|
||||||
- Example: notebooks/lancedb_reranking.ipynb
|
- Example: notebooks/lancedb_reranking.ipynb
|
||||||
- Filtering: sql.md
|
- Filtering: sql.md
|
||||||
@@ -165,6 +166,7 @@ nav:
|
|||||||
- Jina Embeddings: embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md
|
- Jina Embeddings: embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md
|
||||||
- AWS Bedrock Text Embedding Functions: embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md
|
- AWS Bedrock Text Embedding Functions: embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md
|
||||||
- IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md
|
- IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md
|
||||||
|
- Voyage AI Embeddings: embeddings/available_embedding_models/text_embedding_functions/voyageai_embedding.md
|
||||||
- Multimodal Embedding Functions:
|
- Multimodal Embedding Functions:
|
||||||
- OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
|
- OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
|
||||||
- Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
|
- Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
|
||||||
|
|||||||
@@ -277,7 +277,15 @@ Product quantization can lead to approximately `16 * sizeof(float32) / 1 = 64` t
|
|||||||
Higher number of partitions could lead to more efficient I/O during queries and better accuracy, but it takes much more time to train.
|
Higher number of partitions could lead to more efficient I/O during queries and better accuracy, but it takes much more time to train.
|
||||||
On `SIFT-1M` dataset, our benchmark shows that keeping each partition 1K-4K rows lead to a good latency / recall.
|
On `SIFT-1M` dataset, our benchmark shows that keeping each partition 1K-4K rows lead to a good latency / recall.
|
||||||
|
|
||||||
`num_sub_vectors` specifies how many Product Quantization (PQ) short codes to generate on each vector. Because
|
`num_sub_vectors` specifies how many Product Quantization (PQ) short codes to generate on each vector. The number should be a factor of the vector dimension. Because
|
||||||
PQ is a lossy compression of the original vector, a higher `num_sub_vectors` usually results in
|
PQ is a lossy compression of the original vector, a higher `num_sub_vectors` usually results in
|
||||||
less space distortion, and thus yields better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and
|
less space distortion, and thus yields better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and more PQ computation, and thus, higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency.
|
||||||
more PQ computation, and thus, higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency.
|
|
||||||
|
!!! note
|
||||||
|
If `num_sub_vectors` is set to be greater than the vector dimension, you will see errors like `attempt to divide by zero`.
|
||||||
|
|
||||||
|
### How to choose `m` and `ef_construction` for `IVF_HNSW_*` index?
|
||||||
|
|
||||||
|
`m` determines the number of connections a new node establishes with its closest neighbors upon entering the graph. Typically, `m` falls within the range of 5 to 48. Lower `m` values are suitable for low-dimensional data or scenarios where recall is less critical. Conversely, higher `m` values are beneficial for high-dimensional data or when high recall is required. In essence, a larger `m` results in a denser graph with increased connectivity, but at the expense of higher memory consumption.
|
||||||
|
|
||||||
|
`ef_construction` balances build speed and accuracy. Higher values increase accuracy but slow down the build process. A typical range is 150 to 300. For good search results, a minimum value of 100 is recommended. In most cases, setting this value above 500 offers no additional benefit. Ensure that `ef_construction` is always set to a value equal to or greater than `ef` in the search phase.
|
||||||
@@ -57,6 +57,13 @@ Then the greedy search routine operates as follows:
|
|||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
There are three key parameters to set when constructing an HNSW index:
|
||||||
|
|
||||||
|
* `metric`: Use an `L2` Euclidean distance metric. We also support `dot` and `cosine` distance.
|
||||||
|
* `m`: The number of neighbors to select for each vector in the HNSW graph.
|
||||||
|
* `ef_construction`: The number of candidates to evaluate during the construction of the HNSW graph.
|
||||||
|
|
||||||
|
|
||||||
We can combine the above concepts to understand how to build and query an HNSW index in LanceDB.
|
We can combine the above concepts to understand how to build and query an HNSW index in LanceDB.
|
||||||
|
|
||||||
### Construct index
|
### Construct index
|
||||||
|
|||||||
@@ -58,8 +58,10 @@ In Python, the index can be created as follows:
|
|||||||
# Make sure you have enough data in the table for an effective training step
|
# Make sure you have enough data in the table for an effective training step
|
||||||
tbl.create_index(metric="L2", num_partitions=256, num_sub_vectors=96)
|
tbl.create_index(metric="L2", num_partitions=256, num_sub_vectors=96)
|
||||||
```
|
```
|
||||||
|
!!! note
|
||||||
|
`num_partitions`=256 and `num_sub_vectors`=96 do not work for every dataset. These values need to be adjusted for your particular dataset.
|
||||||
|
|
||||||
The `num_partitions` is usually chosen to target a particular number of vectors per partition. `num_sub_vectors` is typically chosen based on the desired recall and the dimensionality of the vector. See the [FAQs](#faq) below for best practices on choosing these parameters.
|
The `num_partitions` is usually chosen to target a particular number of vectors per partition. `num_sub_vectors` is typically chosen based on the desired recall and the dimensionality of the vector. See [here](../ann_indexes.md/#how-to-choose-num_partitions-and-num_sub_vectors-for-ivf_pq-index) for best practices on choosing these parameters.
|
||||||
|
|
||||||
|
|
||||||
### Query the index
|
### Query the index
|
||||||
|
|||||||
@@ -114,12 +114,45 @@ table.create_fts_index("text",
|
|||||||
|
|
||||||
LanceDB full text search supports to filter the search results by a condition, both pre-filtering and post-filtering are supported.
|
LanceDB full text search supports to filter the search results by a condition, both pre-filtering and post-filtering are supported.
|
||||||
|
|
||||||
This can be invoked via the familiar `where` syntax:
|
This can be invoked via the familiar `where` syntax.
|
||||||
|
|
||||||
|
With pre-filtering:
|
||||||
=== "Python"
|
=== "Python"
|
||||||
|
|
||||||
```python
|
```python
|
||||||
table.search("puppy").limit(10).where("meta='foo'").to_list()
|
table.search("puppy").limit(10).where("meta='foo'", prefilter=True).to_list()
|
||||||
|
```
|
||||||
|
|
||||||
|
=== "TypeScript"
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
await tbl
|
||||||
|
.search("puppy")
|
||||||
|
.select(["id", "doc"])
|
||||||
|
.limit(10)
|
||||||
|
.where("meta='foo'")
|
||||||
|
.prefilter(true)
|
||||||
|
.toArray();
|
||||||
|
```
|
||||||
|
|
||||||
|
=== "Rust"
|
||||||
|
|
||||||
|
```rust
|
||||||
|
table
|
||||||
|
.query()
|
||||||
|
.full_text_search(FullTextSearchQuery::new("puppy".to_owned()))
|
||||||
|
.select(lancedb::query::Select::Columns(vec!["doc".to_owned()]))
|
||||||
|
.limit(10)
|
||||||
|
.only_if("meta='foo'")
|
||||||
|
.execute()
|
||||||
|
.await?;
|
||||||
|
```
|
||||||
|
|
||||||
|
With post-filtering:
|
||||||
|
=== "Python"
|
||||||
|
|
||||||
|
```python
|
||||||
|
table.search("puppy").limit(10).where("meta='foo'", prefilter=False).to_list()
|
||||||
```
|
```
|
||||||
|
|
||||||
=== "TypeScript"
|
=== "TypeScript"
|
||||||
@@ -130,6 +163,7 @@ This can be invoked via the familiar `where` syntax:
|
|||||||
.select(["id", "doc"])
|
.select(["id", "doc"])
|
||||||
.limit(10)
|
.limit(10)
|
||||||
.where("meta='foo'")
|
.where("meta='foo'")
|
||||||
|
.prefilter(false)
|
||||||
.toArray();
|
.toArray();
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -140,6 +174,7 @@ This can be invoked via the familiar `where` syntax:
|
|||||||
.query()
|
.query()
|
||||||
.full_text_search(FullTextSearchQuery::new(words[0].to_owned()))
|
.full_text_search(FullTextSearchQuery::new(words[0].to_owned()))
|
||||||
.select(lancedb::query::Select::Columns(vec!["doc".to_owned()]))
|
.select(lancedb::query::Select::Columns(vec!["doc".to_owned()]))
|
||||||
|
.postfilter()
|
||||||
.limit(10)
|
.limit(10)
|
||||||
.only_if("meta='foo'")
|
.only_if("meta='foo'")
|
||||||
.execute()
|
.execute()
|
||||||
@@ -189,3 +224,6 @@ This can make the query more efficient, especially when the table is large and t
|
|||||||
tbl.add(more_data).execute().await?;
|
tbl.add(more_data).execute().await?;
|
||||||
tbl.optimize(OptimizeAction::All).execute().await?;
|
tbl.optimize(OptimizeAction::All).execute().await?;
|
||||||
```
|
```
|
||||||
|
!!! note
|
||||||
|
|
||||||
|
New data added after creating the FTS index will appear in search results while incremental indexing is still in progress, but with increased latency due to a flat search on the unindexed portion. LanceDB Cloud automates this merging process, minimizing the impact on search speed.
|
||||||
@@ -153,9 +153,7 @@ table.create_fts_index(["title", "content"], use_tantivy=True, writer_heap_size=
|
|||||||
|
|
||||||
## Current limitations
|
## Current limitations
|
||||||
|
|
||||||
1. Currently we do not yet support incremental writes.
|
1. New data added after creating the FTS index will appear in search results, but with increased latency due to a flat search on the unindexed portion. Re-indexing with `create_fts_index` will reduce latency. LanceDB Cloud automates this merging process, minimizing the impact on search speed.
|
||||||
If you add data after FTS index creation, it won't be reflected
|
|
||||||
in search results until you do a full reindex.
|
|
||||||
|
|
||||||
2. We currently only support local filesystem paths for the FTS index.
|
2. We currently only support local filesystem paths for the FTS index.
|
||||||
This is a tantivy limitation. We've implemented an object store plugin
|
This is a tantivy limitation. We've implemented an object store plugin
|
||||||
|
|||||||
@@ -6,6 +6,9 @@ This re-ranker uses the [Cohere](https://cohere.ai/) API to rerank the search re
|
|||||||
!!! note
|
!!! note
|
||||||
Supported Query Types: Hybrid, Vector, FTS
|
Supported Query Types: Hybrid, Vector, FTS
|
||||||
|
|
||||||
|
```shell
|
||||||
|
pip install cohere
|
||||||
|
```
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import numpy
|
import numpy
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-parent</artifactId>
|
<artifactId>lancedb-parent</artifactId>
|
||||||
<version>0.13.0-final.0</version>
|
<version>0.13.1-beta.0</version>
|
||||||
<relativePath>../pom.xml</relativePath>
|
<relativePath>../pom.xml</relativePath>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
<groupId>com.lancedb</groupId>
|
<groupId>com.lancedb</groupId>
|
||||||
<artifactId>lancedb-parent</artifactId>
|
<artifactId>lancedb-parent</artifactId>
|
||||||
<version>0.13.0-final.0</version>
|
<version>0.13.1-beta.0</version>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
|
|
||||||
<name>LanceDB Parent</name>
|
<name>LanceDB Parent</name>
|
||||||
|
|||||||
78
node/package-lock.json
generated
78
node/package-lock.json
generated
@@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
"name": "vectordb",
|
"name": "vectordb",
|
||||||
"version": "0.13.0",
|
"version": "0.13.1-beta.0",
|
||||||
"lockfileVersion": 3,
|
"lockfileVersion": 3,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"packages": {
|
"packages": {
|
||||||
"": {
|
"": {
|
||||||
"name": "vectordb",
|
"name": "vectordb",
|
||||||
"version": "0.13.0",
|
"version": "0.13.1-beta.0",
|
||||||
"cpu": [
|
"cpu": [
|
||||||
"x64",
|
"x64",
|
||||||
"arm64"
|
"arm64"
|
||||||
@@ -52,12 +52,14 @@
|
|||||||
"uuid": "^9.0.0"
|
"uuid": "^9.0.0"
|
||||||
},
|
},
|
||||||
"optionalDependencies": {
|
"optionalDependencies": {
|
||||||
"@lancedb/vectordb-darwin-arm64": "0.13.0",
|
"@lancedb/vectordb-darwin-arm64": "0.13.1-beta.0",
|
||||||
"@lancedb/vectordb-darwin-x64": "0.13.0",
|
"@lancedb/vectordb-darwin-x64": "0.13.1-beta.0",
|
||||||
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0",
|
"@lancedb/vectordb-linux-arm64-gnu": "0.13.1-beta.0",
|
||||||
"@lancedb/vectordb-linux-x64-gnu": "0.13.0",
|
"@lancedb/vectordb-linux-arm64-musl": "0.13.1-beta.0",
|
||||||
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0",
|
"@lancedb/vectordb-linux-x64-gnu": "0.13.1-beta.0",
|
||||||
"@lancedb/vectordb-win32-x64-msvc": "0.13.0"
|
"@lancedb/vectordb-linux-x64-musl": "0.13.1-beta.0",
|
||||||
|
"@lancedb/vectordb-win32-arm64-msvc": "0.13.1-beta.0",
|
||||||
|
"@lancedb/vectordb-win32-x64-msvc": "0.13.1-beta.0"
|
||||||
},
|
},
|
||||||
"peerDependencies": {
|
"peerDependencies": {
|
||||||
"@apache-arrow/ts": "^14.0.2",
|
"@apache-arrow/ts": "^14.0.2",
|
||||||
@@ -327,66 +329,6 @@
|
|||||||
"@jridgewell/sourcemap-codec": "^1.4.10"
|
"@jridgewell/sourcemap-codec": "^1.4.10"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
|
||||||
"version": "0.13.0",
|
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.13.0.tgz",
|
|
||||||
"integrity": "sha512-8hdcjkRmgrdQYf1jN+DyZae40LIv8UUfnWy70Uid5qy63sSvRW/+MvIdqIPFr9QlLUXmpyyQuX0y3bZhUR99cQ==",
|
|
||||||
"cpu": [
|
|
||||||
"arm64"
|
|
||||||
],
|
|
||||||
"optional": true,
|
|
||||||
"os": [
|
|
||||||
"darwin"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"node_modules/@lancedb/vectordb-darwin-x64": {
|
|
||||||
"version": "0.13.0",
|
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.13.0.tgz",
|
|
||||||
"integrity": "sha512-fWzAY4l5SQtNfMYh80v+M66ugZHhdxbkpk5mNEv6Zsug3DL6kRj3Uv31/i0wgzY6F5G3LUlbjZerN+eTnDLwOw==",
|
|
||||||
"cpu": [
|
|
||||||
"x64"
|
|
||||||
],
|
|
||||||
"optional": true,
|
|
||||||
"os": [
|
|
||||||
"darwin"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
|
||||||
"version": "0.13.0",
|
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.13.0.tgz",
|
|
||||||
"integrity": "sha512-ltwAT9baOSuR5YiGykQXPC8/HGYF13vpI47qxhP9yfgiz9pA8EUn8p8YrBRzq7J4DIZ4b8JSVDXQnMIqEtB4Kg==",
|
|
||||||
"cpu": [
|
|
||||||
"arm64"
|
|
||||||
],
|
|
||||||
"optional": true,
|
|
||||||
"os": [
|
|
||||||
"linux"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
|
||||||
"version": "0.13.0",
|
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.13.0.tgz",
|
|
||||||
"integrity": "sha512-MiT/RBlMPGGRh7BX+MXwRuNiiUnKmuDcHH8nm88IH28T7TQxXIbA9w6UpSg5m9f3DgKQI2K8oLi29oKIB8ZwDQ==",
|
|
||||||
"cpu": [
|
|
||||||
"x64"
|
|
||||||
],
|
|
||||||
"optional": true,
|
|
||||||
"os": [
|
|
||||||
"linux"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
|
||||||
"version": "0.13.0",
|
|
||||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.13.0.tgz",
|
|
||||||
"integrity": "sha512-SovP/hwWYLJIy65DKbVuXlBPTb/nwvVpTO6dh9zRch+L5ek6JmVAkwsfeTS2p5bMa8VPujsCXYUAVuCDEJU8wg==",
|
|
||||||
"cpu": [
|
|
||||||
"x64"
|
|
||||||
],
|
|
||||||
"optional": true,
|
|
||||||
"os": [
|
|
||||||
"win32"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"node_modules/@neon-rs/cli": {
|
"node_modules/@neon-rs/cli": {
|
||||||
"version": "0.0.160",
|
"version": "0.0.160",
|
||||||
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz",
|
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "vectordb",
|
"name": "vectordb",
|
||||||
"version": "0.13.0",
|
"version": "0.13.1-beta.0",
|
||||||
"description": " Serverless, low-latency vector database for AI applications",
|
"description": " Serverless, low-latency vector database for AI applications",
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
"types": "dist/index.d.ts",
|
"types": "dist/index.d.ts",
|
||||||
@@ -84,16 +84,20 @@
|
|||||||
"aarch64-apple-darwin": "@lancedb/vectordb-darwin-arm64",
|
"aarch64-apple-darwin": "@lancedb/vectordb-darwin-arm64",
|
||||||
"x86_64-unknown-linux-gnu": "@lancedb/vectordb-linux-x64-gnu",
|
"x86_64-unknown-linux-gnu": "@lancedb/vectordb-linux-x64-gnu",
|
||||||
"aarch64-unknown-linux-gnu": "@lancedb/vectordb-linux-arm64-gnu",
|
"aarch64-unknown-linux-gnu": "@lancedb/vectordb-linux-arm64-gnu",
|
||||||
|
"x86_64-unknown-linux-musl": "@lancedb/vectordb-linux-x64-musl",
|
||||||
|
"aarch64-unknown-linux-musl": "@lancedb/vectordb-linux-arm64-musl",
|
||||||
"x86_64-pc-windows-msvc": "@lancedb/vectordb-win32-x64-msvc",
|
"x86_64-pc-windows-msvc": "@lancedb/vectordb-win32-x64-msvc",
|
||||||
"aarch64-pc-windows-msvc": "@lancedb/vectordb-win32-arm64-msvc"
|
"aarch64-pc-windows-msvc": "@lancedb/vectordb-win32-arm64-msvc"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"optionalDependencies": {
|
"optionalDependencies": {
|
||||||
"@lancedb/vectordb-darwin-arm64": "0.13.0",
|
"@lancedb/vectordb-darwin-x64": "0.13.1-beta.0",
|
||||||
"@lancedb/vectordb-darwin-x64": "0.13.0",
|
"@lancedb/vectordb-darwin-arm64": "0.13.1-beta.0",
|
||||||
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0",
|
"@lancedb/vectordb-linux-x64-gnu": "0.13.1-beta.0",
|
||||||
"@lancedb/vectordb-linux-x64-gnu": "0.13.0",
|
"@lancedb/vectordb-linux-arm64-gnu": "0.13.1-beta.0",
|
||||||
"@lancedb/vectordb-win32-x64-msvc": "0.13.0",
|
"@lancedb/vectordb-linux-x64-musl": "0.13.1-beta.0",
|
||||||
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0"
|
"@lancedb/vectordb-linux-arm64-musl": "0.13.1-beta.0",
|
||||||
|
"@lancedb/vectordb-win32-x64-msvc": "0.13.1-beta.0",
|
||||||
|
"@lancedb/vectordb-win32-arm64-msvc": "0.13.1-beta.0"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-nodejs"
|
name = "lancedb-nodejs"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
version = "0.13.0"
|
version = "0.13.1-beta.0"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
description.workspace = true
|
description.workspace = true
|
||||||
repository.workspace = true
|
repository.workspace = true
|
||||||
|
|||||||
@@ -87,6 +87,12 @@ export interface OptimizeOptions {
|
|||||||
deleteUnverified: boolean;
|
deleteUnverified: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export interface Version {
|
||||||
|
version: number;
|
||||||
|
timestamp: Date;
|
||||||
|
metadata: Record<string, string>;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A Table is a collection of Records in a LanceDB Database.
|
* A Table is a collection of Records in a LanceDB Database.
|
||||||
*
|
*
|
||||||
@@ -360,6 +366,11 @@ export abstract class Table {
|
|||||||
*/
|
*/
|
||||||
abstract checkoutLatest(): Promise<void>;
|
abstract checkoutLatest(): Promise<void>;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* List all the versions of the table
|
||||||
|
*/
|
||||||
|
abstract listVersions(): Promise<Version[]>;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Restore the table to the currently checked out version
|
* Restore the table to the currently checked out version
|
||||||
*
|
*
|
||||||
@@ -659,6 +670,14 @@ export class LocalTable extends Table {
|
|||||||
await this.inner.checkoutLatest();
|
await this.inner.checkoutLatest();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async listVersions(): Promise<Version[]> {
|
||||||
|
return (await this.inner.listVersions()).map((version) => ({
|
||||||
|
version: version.version,
|
||||||
|
timestamp: new Date(version.timestamp / 1000),
|
||||||
|
metadata: version.metadata,
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
async restore(): Promise<void> {
|
async restore(): Promise<void> {
|
||||||
await this.inner.restore();
|
await this.inner.restore();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-darwin-arm64",
|
"name": "@lancedb/lancedb-darwin-arm64",
|
||||||
"version": "0.13.0",
|
"version": "0.13.1-beta.0",
|
||||||
"os": ["darwin"],
|
"os": ["darwin"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.darwin-arm64.node",
|
"main": "lancedb.darwin-arm64.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-darwin-x64",
|
"name": "@lancedb/lancedb-darwin-x64",
|
||||||
"version": "0.13.0",
|
"version": "0.13.1-beta.0",
|
||||||
"os": ["darwin"],
|
"os": ["darwin"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.darwin-x64.node",
|
"main": "lancedb.darwin-x64.node",
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||||
"version": "0.13.0",
|
"version": "0.13.1-beta.0",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["arm64"],
|
"cpu": ["arm64"],
|
||||||
"main": "lancedb.linux-arm64-gnu.node",
|
"main": "lancedb.linux-arm64-gnu.node",
|
||||||
|
|||||||
3
nodejs/npm/linux-arm64-musl/README.md
Normal file
3
nodejs/npm/linux-arm64-musl/README.md
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
# `@lancedb/lancedb-linux-arm64-musl`
|
||||||
|
|
||||||
|
This is the **aarch64-unknown-linux-musl** binary for `@lancedb/lancedb`
|
||||||
13
nodejs/npm/linux-arm64-musl/package.json
Normal file
13
nodejs/npm/linux-arm64-musl/package.json
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
{
|
||||||
|
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||||
|
"version": "0.13.1-beta.0",
|
||||||
|
"os": ["linux"],
|
||||||
|
"cpu": ["arm64"],
|
||||||
|
"main": "lancedb.linux-arm64-musl.node",
|
||||||
|
"files": ["lancedb.linux-arm64-musl.node"],
|
||||||
|
"license": "Apache 2.0",
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 18"
|
||||||
|
},
|
||||||
|
"libc": ["musl"]
|
||||||
|
}
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||||
"version": "0.13.0",
|
"version": "0.13.1-beta.0",
|
||||||
"os": ["linux"],
|
"os": ["linux"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.linux-x64-gnu.node",
|
"main": "lancedb.linux-x64-gnu.node",
|
||||||
|
|||||||
3
nodejs/npm/linux-x64-musl/README.md
Normal file
3
nodejs/npm/linux-x64-musl/README.md
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
# `@lancedb/lancedb-linux-x64-musl`
|
||||||
|
|
||||||
|
This is the **x86_64-unknown-linux-musl** binary for `@lancedb/lancedb`
|
||||||
13
nodejs/npm/linux-x64-musl/package.json
Normal file
13
nodejs/npm/linux-x64-musl/package.json
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
{
|
||||||
|
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||||
|
"version": "0.13.1-beta.0",
|
||||||
|
"os": ["linux"],
|
||||||
|
"cpu": ["x64"],
|
||||||
|
"main": "lancedb.linux-x64-musl.node",
|
||||||
|
"files": ["lancedb.linux-x64-musl.node"],
|
||||||
|
"license": "Apache 2.0",
|
||||||
|
"engines": {
|
||||||
|
"node": ">= 18"
|
||||||
|
},
|
||||||
|
"libc": ["musl"]
|
||||||
|
}
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||||
"version": "0.13.0",
|
"version": "0.13.1-beta.0",
|
||||||
"os": [
|
"os": [
|
||||||
"win32"
|
"win32"
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||||
"version": "0.13.0",
|
"version": "0.13.1-beta.0",
|
||||||
"os": ["win32"],
|
"os": ["win32"],
|
||||||
"cpu": ["x64"],
|
"cpu": ["x64"],
|
||||||
"main": "lancedb.win32-x64-msvc.node",
|
"main": "lancedb.win32-x64-msvc.node",
|
||||||
|
|||||||
@@ -10,7 +10,7 @@
|
|||||||
"vector database",
|
"vector database",
|
||||||
"ann"
|
"ann"
|
||||||
],
|
],
|
||||||
"version": "0.13.0",
|
"version": "0.13.1-beta.0",
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
"exports": {
|
"exports": {
|
||||||
".": "./dist/index.js",
|
".": "./dist/index.js",
|
||||||
@@ -24,10 +24,12 @@
|
|||||||
"triples": {
|
"triples": {
|
||||||
"defaults": false,
|
"defaults": false,
|
||||||
"additional": [
|
"additional": [
|
||||||
"aarch64-apple-darwin",
|
|
||||||
"aarch64-unknown-linux-gnu",
|
|
||||||
"x86_64-apple-darwin",
|
"x86_64-apple-darwin",
|
||||||
|
"aarch64-apple-darwin",
|
||||||
"x86_64-unknown-linux-gnu",
|
"x86_64-unknown-linux-gnu",
|
||||||
|
"aarch64-unknown-linux-gnu",
|
||||||
|
"x86_64-unknown-linux-musl",
|
||||||
|
"aarch64-unknown-linux-musl",
|
||||||
"x86_64-pc-windows-msvc"
|
"x86_64-pc-windows-msvc"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,6 +12,8 @@
|
|||||||
// See the License for the specific language governing permissions and
|
// See the License for the specific language governing permissions and
|
||||||
// limitations under the License.
|
// limitations under the License.
|
||||||
|
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
use arrow_ipc::writer::FileWriter;
|
use arrow_ipc::writer::FileWriter;
|
||||||
use lancedb::ipc::ipc_file_to_batches;
|
use lancedb::ipc::ipc_file_to_batches;
|
||||||
use lancedb::table::{
|
use lancedb::table::{
|
||||||
@@ -226,6 +228,28 @@ impl Table {
|
|||||||
self.inner_ref()?.checkout_latest().await.default_error()
|
self.inner_ref()?.checkout_latest().await.default_error()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[napi(catch_unwind)]
|
||||||
|
pub async fn list_versions(&self) -> napi::Result<Vec<Version>> {
|
||||||
|
self.inner_ref()?
|
||||||
|
.list_versions()
|
||||||
|
.await
|
||||||
|
.map(|versions| {
|
||||||
|
versions
|
||||||
|
.iter()
|
||||||
|
.map(|version| Version {
|
||||||
|
version: version.version as i64,
|
||||||
|
timestamp: version.timestamp.timestamp_micros(),
|
||||||
|
metadata: version
|
||||||
|
.metadata
|
||||||
|
.iter()
|
||||||
|
.map(|(k, v)| (k.clone(), v.clone()))
|
||||||
|
.collect(),
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
})
|
||||||
|
.default_error()
|
||||||
|
}
|
||||||
|
|
||||||
#[napi(catch_unwind)]
|
#[napi(catch_unwind)]
|
||||||
pub async fn restore(&self) -> napi::Result<()> {
|
pub async fn restore(&self) -> napi::Result<()> {
|
||||||
self.inner_ref()?.restore().await.default_error()
|
self.inner_ref()?.restore().await.default_error()
|
||||||
@@ -466,3 +490,10 @@ impl From<lancedb::index::IndexStatistics> for IndexStatistics {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[napi(object)]
|
||||||
|
pub struct Version {
|
||||||
|
pub version: i64,
|
||||||
|
pub timestamp: i64,
|
||||||
|
pub metadata: HashMap<String, String>,
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.16.0"
|
current_version = "0.16.1-beta.0"
|
||||||
parse = """(?x)
|
parse = """(?x)
|
||||||
(?P<major>0|[1-9]\\d*)\\.
|
(?P<major>0|[1-9]\\d*)\\.
|
||||||
(?P<minor>0|[1-9]\\d*)\\.
|
(?P<minor>0|[1-9]\\d*)\\.
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-python"
|
name = "lancedb-python"
|
||||||
version = "0.16.0"
|
version = "0.16.1-beta.0"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
description = "Python bindings for LanceDB"
|
description = "Python bindings for LanceDB"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
@@ -17,11 +17,17 @@ crate-type = ["cdylib"]
|
|||||||
arrow = { version = "52.1", features = ["pyarrow"] }
|
arrow = { version = "52.1", features = ["pyarrow"] }
|
||||||
lancedb = { path = "../rust/lancedb", default-features = false }
|
lancedb = { path = "../rust/lancedb", default-features = false }
|
||||||
env_logger.workspace = true
|
env_logger.workspace = true
|
||||||
pyo3 = { version = "0.21", features = ["extension-module", "abi3-py38", "gil-refs"] }
|
pyo3 = { version = "0.21", features = [
|
||||||
|
"extension-module",
|
||||||
|
"abi3-py39",
|
||||||
|
"gil-refs"
|
||||||
|
] }
|
||||||
# Using this fork for now: https://github.com/awestlake87/pyo3-asyncio/issues/119
|
# Using this fork for now: https://github.com/awestlake87/pyo3-asyncio/issues/119
|
||||||
# pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
|
# pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
|
||||||
pyo3-asyncio-0-21 = { version = "0.21.0", features = ["attributes", "tokio-runtime"] }
|
pyo3-asyncio-0-21 = { version = "0.21.0", features = [
|
||||||
|
"attributes",
|
||||||
|
"tokio-runtime"
|
||||||
|
] }
|
||||||
pin-project = "1.1.5"
|
pin-project = "1.1.5"
|
||||||
futures.workspace = true
|
futures.workspace = true
|
||||||
tokio = { version = "1.36.0", features = ["sync"] }
|
tokio = { version = "1.36.0", features = ["sync"] }
|
||||||
@@ -29,14 +35,13 @@ tokio = { version = "1.36.0", features = ["sync"] }
|
|||||||
[build-dependencies]
|
[build-dependencies]
|
||||||
pyo3-build-config = { version = "0.20.3", features = [
|
pyo3-build-config = { version = "0.20.3", features = [
|
||||||
"extension-module",
|
"extension-module",
|
||||||
"abi3-py38",
|
"abi3-py39",
|
||||||
] }
|
] }
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = ["default-tls", "remote"]
|
default = ["default-tls", "remote"]
|
||||||
fp16kernels = ["lancedb/fp16kernels"]
|
fp16kernels = ["lancedb/fp16kernels"]
|
||||||
remote = ["lancedb/remote"]
|
remote = ["lancedb/remote"]
|
||||||
|
|
||||||
# TLS
|
# TLS
|
||||||
default-tls = ["lancedb/default-tls"]
|
default-tls = ["lancedb/default-tls"]
|
||||||
native-tls = ["lancedb/native-tls"]
|
native-tls = ["lancedb/native-tls"]
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ name = "lancedb"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"deprecation",
|
"deprecation",
|
||||||
"nest-asyncio~=1.0",
|
"nest-asyncio~=1.0",
|
||||||
"pylance==0.19.3b1",
|
"pylance==0.20.0b2",
|
||||||
"tqdm>=4.27.0",
|
"tqdm>=4.27.0",
|
||||||
"pydantic>=1.10",
|
"pydantic>=1.10",
|
||||||
"packaging",
|
"packaging",
|
||||||
@@ -31,7 +31,6 @@ classifiers = [
|
|||||||
"Programming Language :: Python",
|
"Programming Language :: Python",
|
||||||
"Programming Language :: Python :: 3",
|
"Programming Language :: Python :: 3",
|
||||||
"Programming Language :: Python :: 3 :: Only",
|
"Programming Language :: Python :: 3 :: Only",
|
||||||
"Programming Language :: Python :: 3.8",
|
|
||||||
"Programming Language :: Python :: 3.9",
|
"Programming Language :: Python :: 3.9",
|
||||||
"Programming Language :: Python :: 3.10",
|
"Programming Language :: Python :: 3.10",
|
||||||
"Programming Language :: Python :: 3.11",
|
"Programming Language :: Python :: 3.11",
|
||||||
|
|||||||
@@ -83,25 +83,33 @@ class OpenAIEmbeddings(TextEmbeddingFunction):
|
|||||||
"""
|
"""
|
||||||
openai = attempt_import_or_raise("openai")
|
openai = attempt_import_or_raise("openai")
|
||||||
|
|
||||||
|
valid_texts = []
|
||||||
|
valid_indices = []
|
||||||
|
for idx, text in enumerate(texts):
|
||||||
|
if text:
|
||||||
|
valid_texts.append(text)
|
||||||
|
valid_indices.append(idx)
|
||||||
|
|
||||||
# TODO retry, rate limit, token limit
|
# TODO retry, rate limit, token limit
|
||||||
try:
|
try:
|
||||||
if self.name == "text-embedding-ada-002":
|
kwargs = {
|
||||||
rs = self._openai_client.embeddings.create(input=texts, model=self.name)
|
"input": valid_texts,
|
||||||
else:
|
"model": self.name,
|
||||||
kwargs = {
|
}
|
||||||
"input": texts,
|
if self.name != "text-embedding-ada-002":
|
||||||
"model": self.name,
|
kwargs["dimensions"] = self.dim
|
||||||
}
|
|
||||||
if self.dim:
|
rs = self._openai_client.embeddings.create(**kwargs)
|
||||||
kwargs["dimensions"] = self.dim
|
valid_embeddings = {
|
||||||
rs = self._openai_client.embeddings.create(**kwargs)
|
idx: v.embedding for v, idx in zip(rs.data, valid_indices)
|
||||||
|
}
|
||||||
except openai.BadRequestError:
|
except openai.BadRequestError:
|
||||||
logging.exception("Bad request: %s", texts)
|
logging.exception("Bad request: %s", texts)
|
||||||
return [None] * len(texts)
|
return [None] * len(texts)
|
||||||
except Exception:
|
except Exception:
|
||||||
logging.exception("OpenAI embeddings error")
|
logging.exception("OpenAI embeddings error")
|
||||||
raise
|
raise
|
||||||
return [v.embedding for v in rs.data]
|
return [valid_embeddings.get(idx, None) for idx in range(len(texts))]
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def _openai_client(self):
|
def _openai_client(self):
|
||||||
|
|||||||
@@ -1,15 +1,5 @@
|
|||||||
# Copyright 2023 LanceDB Developers
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
#
|
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
"""Pydantic (v1 / v2) adapter for LanceDB"""
|
"""Pydantic (v1 / v2) adapter for LanceDB"""
|
||||||
|
|
||||||
@@ -30,6 +20,7 @@ from typing import (
|
|||||||
Type,
|
Type,
|
||||||
Union,
|
Union,
|
||||||
_GenericAlias,
|
_GenericAlias,
|
||||||
|
GenericAlias,
|
||||||
)
|
)
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -75,7 +66,7 @@ def vector(dim: int, value_type: pa.DataType = pa.float32()):
|
|||||||
|
|
||||||
|
|
||||||
def Vector(
|
def Vector(
|
||||||
dim: int, value_type: pa.DataType = pa.float32()
|
dim: int, value_type: pa.DataType = pa.float32(), nullable: bool = True
|
||||||
) -> Type[FixedSizeListMixin]:
|
) -> Type[FixedSizeListMixin]:
|
||||||
"""Pydantic Vector Type.
|
"""Pydantic Vector Type.
|
||||||
|
|
||||||
@@ -88,6 +79,8 @@ def Vector(
|
|||||||
The dimension of the vector.
|
The dimension of the vector.
|
||||||
value_type : pyarrow.DataType, optional
|
value_type : pyarrow.DataType, optional
|
||||||
The value type of the vector, by default pa.float32()
|
The value type of the vector, by default pa.float32()
|
||||||
|
nullable : bool, optional
|
||||||
|
Whether the vector is nullable, by default it is True.
|
||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
@@ -103,7 +96,7 @@ def Vector(
|
|||||||
>>> assert schema == pa.schema([
|
>>> assert schema == pa.schema([
|
||||||
... pa.field("id", pa.int64(), False),
|
... pa.field("id", pa.int64(), False),
|
||||||
... pa.field("url", pa.utf8(), False),
|
... pa.field("url", pa.utf8(), False),
|
||||||
... pa.field("embeddings", pa.list_(pa.float32(), 768), False)
|
... pa.field("embeddings", pa.list_(pa.float32(), 768))
|
||||||
... ])
|
... ])
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -112,6 +105,10 @@ def Vector(
|
|||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return f"FixedSizeList(dim={dim})"
|
return f"FixedSizeList(dim={dim})"
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def nullable() -> bool:
|
||||||
|
return nullable
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def dim() -> int:
|
def dim() -> int:
|
||||||
return dim
|
return dim
|
||||||
@@ -205,9 +202,7 @@ else:
|
|||||||
def _pydantic_to_arrow_type(field: FieldInfo) -> pa.DataType:
|
def _pydantic_to_arrow_type(field: FieldInfo) -> pa.DataType:
|
||||||
"""Convert a Pydantic FieldInfo to Arrow DataType"""
|
"""Convert a Pydantic FieldInfo to Arrow DataType"""
|
||||||
|
|
||||||
if isinstance(field.annotation, _GenericAlias) or (
|
if isinstance(field.annotation, (_GenericAlias, GenericAlias)):
|
||||||
sys.version_info > (3, 9) and isinstance(field.annotation, types.GenericAlias)
|
|
||||||
):
|
|
||||||
origin = field.annotation.__origin__
|
origin = field.annotation.__origin__
|
||||||
args = field.annotation.__args__
|
args = field.annotation.__args__
|
||||||
if origin is list:
|
if origin is list:
|
||||||
@@ -235,7 +230,7 @@ def _pydantic_to_arrow_type(field: FieldInfo) -> pa.DataType:
|
|||||||
|
|
||||||
def is_nullable(field: FieldInfo) -> bool:
|
def is_nullable(field: FieldInfo) -> bool:
|
||||||
"""Check if a Pydantic FieldInfo is nullable."""
|
"""Check if a Pydantic FieldInfo is nullable."""
|
||||||
if isinstance(field.annotation, _GenericAlias):
|
if isinstance(field.annotation, (_GenericAlias, GenericAlias)):
|
||||||
origin = field.annotation.__origin__
|
origin = field.annotation.__origin__
|
||||||
args = field.annotation.__args__
|
args = field.annotation.__args__
|
||||||
if origin == Union:
|
if origin == Union:
|
||||||
@@ -246,6 +241,10 @@ def is_nullable(field: FieldInfo) -> bool:
|
|||||||
for typ in args:
|
for typ in args:
|
||||||
if typ is type(None):
|
if typ is type(None):
|
||||||
return True
|
return True
|
||||||
|
elif inspect.isclass(field.annotation) and issubclass(
|
||||||
|
field.annotation, FixedSizeListMixin
|
||||||
|
):
|
||||||
|
return field.annotation.nullable()
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -370,11 +370,13 @@ class LanceQueryBuilder(ABC):
|
|||||||
----------
|
----------
|
||||||
limit: int
|
limit: int
|
||||||
The maximum number of results to return.
|
The maximum number of results to return.
|
||||||
By default the query is limited to the first 10.
|
The default query limit is 10 results.
|
||||||
Call this method and pass 0, a negative value,
|
For ANN/KNN queries, you must specify a limit.
|
||||||
or None to remove the limit.
|
Entering 0, a negative number, or None will reset
|
||||||
*WARNING* if you have a large dataset, removing
|
the limit to the default value of 10.
|
||||||
the limit can potentially result in reading a
|
*WARNING* if you have a large dataset, setting
|
||||||
|
the limit to a large number, e.g. the table size,
|
||||||
|
can potentially result in reading a
|
||||||
large amount of data into memory and cause
|
large amount of data into memory and cause
|
||||||
out of memory issues.
|
out of memory issues.
|
||||||
|
|
||||||
|
|||||||
@@ -78,6 +78,10 @@ class RemoteTable(Table):
|
|||||||
self.schema.metadata
|
self.schema.metadata
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def list_versions(self):
|
||||||
|
"""List all versions of the table"""
|
||||||
|
return self._loop.run_until_complete(self._table.list_versions())
|
||||||
|
|
||||||
def to_arrow(self) -> pa.Table:
|
def to_arrow(self) -> pa.Table:
|
||||||
"""to_arrow() is not yet supported on LanceDB cloud."""
|
"""to_arrow() is not yet supported on LanceDB cloud."""
|
||||||
raise NotImplementedError("to_arrow() is not yet supported on LanceDB cloud.")
|
raise NotImplementedError("to_arrow() is not yet supported on LanceDB cloud.")
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ class CohereReranker(Reranker):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
model_name: str = "rerank-english-v2.0",
|
model_name: str = "rerank-english-v3.0",
|
||||||
column: str = "text",
|
column: str = "text",
|
||||||
top_n: Union[int, None] = None,
|
top_n: Union[int, None] = None,
|
||||||
return_score="relevance",
|
return_score="relevance",
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ import inspect
|
|||||||
import time
|
import time
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import timedelta
|
from datetime import datetime, timedelta
|
||||||
from functools import cached_property
|
from functools import cached_property
|
||||||
from typing import (
|
from typing import (
|
||||||
TYPE_CHECKING,
|
TYPE_CHECKING,
|
||||||
@@ -1015,15 +1015,36 @@ class Table(ABC):
|
|||||||
@abstractmethod
|
@abstractmethod
|
||||||
def checkout(self):
|
def checkout(self):
|
||||||
"""
|
"""
|
||||||
TODO comments
|
Checks out a specific version of the Table
|
||||||
|
|
||||||
|
Any read operation on the table will now access the data at the checked out
|
||||||
|
version. As a consequence, calling this method will disable any read consistency
|
||||||
|
interval that was previously set.
|
||||||
|
|
||||||
|
This is a read-only operation that turns the table into a sort of "view"
|
||||||
|
or "detached head". Other table instances will not be affected. To make the
|
||||||
|
change permanent you can use the `[Self::restore]` method.
|
||||||
|
|
||||||
|
Any operation that modifies the table will fail while the table is in a checked
|
||||||
|
out state.
|
||||||
|
|
||||||
|
To return the table to a normal state use `[Self::checkout_latest]`
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def checkout_latest(self):
|
def checkout_latest(self):
|
||||||
"""
|
"""
|
||||||
TODO comments
|
Ensures the table is pointing at the latest version
|
||||||
|
|
||||||
|
This can be used to manually update a table when the read_consistency_interval
|
||||||
|
is None
|
||||||
|
It can also be used to undo a `[Self::checkout]` operation
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def list_versions(self):
|
||||||
|
"""List all versions of the table"""
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def _dataset_uri(self) -> str:
|
def _dataset_uri(self) -> str:
|
||||||
return _table_uri(self._conn.uri, self.name)
|
return _table_uri(self._conn.uri, self.name)
|
||||||
@@ -2914,6 +2935,19 @@ class AsyncTable:
|
|||||||
"""
|
"""
|
||||||
return await self._inner.version()
|
return await self._inner.version()
|
||||||
|
|
||||||
|
async def list_versions(self):
|
||||||
|
"""
|
||||||
|
List all versions of the table
|
||||||
|
"""
|
||||||
|
versions = await self._inner.list_versions()
|
||||||
|
for v in versions:
|
||||||
|
ts_nanos = v["timestamp"]
|
||||||
|
v["timestamp"] = datetime.fromtimestamp(ts_nanos // 1e9) + timedelta(
|
||||||
|
microseconds=(ts_nanos % 1e9) // 1e3
|
||||||
|
)
|
||||||
|
|
||||||
|
return versions
|
||||||
|
|
||||||
async def checkout(self, version):
|
async def checkout(self, version):
|
||||||
"""
|
"""
|
||||||
Checks out a specific version of the Table
|
Checks out a specific version of the Table
|
||||||
|
|||||||
@@ -11,6 +11,8 @@ from datetime import date, datetime
|
|||||||
from functools import singledispatch
|
from functools import singledispatch
|
||||||
from typing import Tuple, Union, Optional, Any
|
from typing import Tuple, Union, Optional, Any
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
from threading import Lock
|
||||||
|
from contextlib import contextmanager
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
@@ -314,3 +316,27 @@ def deprecated(func):
|
|||||||
def validate_table_name(name: str):
|
def validate_table_name(name: str):
|
||||||
"""Verify the table name is valid."""
|
"""Verify the table name is valid."""
|
||||||
native_validate_table_name(name)
|
native_validate_table_name(name)
|
||||||
|
|
||||||
|
|
||||||
|
class ConnectionPool:
|
||||||
|
def __init__(self, connection_factory, *, max_size: Optional[int] = None):
|
||||||
|
self.max_size = max_size
|
||||||
|
self._connection_factory = connection_factory
|
||||||
|
self._pool = []
|
||||||
|
self._lock = Lock()
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def connection(self):
|
||||||
|
with self._lock:
|
||||||
|
if self._pool:
|
||||||
|
conn = self._pool.pop()
|
||||||
|
else:
|
||||||
|
conn = self._connection_factory()
|
||||||
|
|
||||||
|
# release the lock before yielding
|
||||||
|
try:
|
||||||
|
yield conn
|
||||||
|
finally:
|
||||||
|
with self._lock:
|
||||||
|
if self.max_size is None or len(self._pool) < self.max_size:
|
||||||
|
self._pool.append(conn)
|
||||||
|
|||||||
@@ -90,10 +90,13 @@ def test_embedding_with_bad_results(tmp_path):
|
|||||||
self, texts: Union[List[str], np.ndarray]
|
self, texts: Union[List[str], np.ndarray]
|
||||||
) -> list[Union[np.array, None]]:
|
) -> list[Union[np.array, None]]:
|
||||||
# Return None, which is bad if field is non-nullable
|
# Return None, which is bad if field is non-nullable
|
||||||
return [
|
a = [
|
||||||
None if i % 2 == 0 else np.random.randn(self.ndims())
|
np.full(self.ndims(), np.nan)
|
||||||
|
if i % 2 == 0
|
||||||
|
else np.random.randn(self.ndims())
|
||||||
for i in range(len(texts))
|
for i in range(len(texts))
|
||||||
]
|
]
|
||||||
|
return a
|
||||||
|
|
||||||
db = lancedb.connect(tmp_path)
|
db = lancedb.connect(tmp_path)
|
||||||
registry = EmbeddingFunctionRegistry.get_instance()
|
registry = EmbeddingFunctionRegistry.get_instance()
|
||||||
|
|||||||
@@ -1,15 +1,6 @@
|
|||||||
# Copyright (c) 2023. LanceDB Developers
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
#
|
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
import importlib
|
import importlib
|
||||||
import io
|
import io
|
||||||
import os
|
import os
|
||||||
@@ -17,6 +8,7 @@ import os
|
|||||||
import lancedb
|
import lancedb
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import pyarrow as pa
|
||||||
import pytest
|
import pytest
|
||||||
from lancedb.embeddings import get_registry
|
from lancedb.embeddings import get_registry
|
||||||
from lancedb.pydantic import LanceModel, Vector
|
from lancedb.pydantic import LanceModel, Vector
|
||||||
@@ -444,6 +436,30 @@ def test_watsonx_embedding(tmp_path):
|
|||||||
assert tbl.search("hello").limit(1).to_pandas()["text"][0] == "hello world"
|
assert tbl.search("hello").limit(1).to_pandas()["text"][0] == "hello world"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.slow
|
||||||
|
@pytest.mark.skipif(
|
||||||
|
os.environ.get("OPENAI_API_KEY") is None, reason="OPENAI_API_KEY not set"
|
||||||
|
)
|
||||||
|
def test_openai_with_empty_strs(tmp_path):
|
||||||
|
model = get_registry().get("openai").create(max_retries=0)
|
||||||
|
|
||||||
|
class TextModel(LanceModel):
|
||||||
|
text: str = model.SourceField()
|
||||||
|
vector: Vector(model.ndims()) = model.VectorField()
|
||||||
|
|
||||||
|
df = pd.DataFrame({"text": ["hello world", ""]})
|
||||||
|
db = lancedb.connect(tmp_path)
|
||||||
|
tbl = db.create_table("test", schema=TextModel, mode="overwrite")
|
||||||
|
|
||||||
|
tbl.add(df, on_bad_vectors="skip")
|
||||||
|
tb = tbl.to_arrow()
|
||||||
|
assert tb.schema.field_by_name("vector").type == pa.list_(
|
||||||
|
pa.float32(), model.ndims()
|
||||||
|
)
|
||||||
|
assert len(tb) == 2
|
||||||
|
assert tb["vector"].is_null().to_pylist() == [False, True]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.slow
|
@pytest.mark.slow
|
||||||
@pytest.mark.skipif(
|
@pytest.mark.skipif(
|
||||||
importlib.util.find_spec("ollama") is None, reason="Ollama not installed"
|
importlib.util.find_spec("ollama") is None, reason="Ollama not installed"
|
||||||
|
|||||||
@@ -1,16 +1,5 @@
|
|||||||
# Copyright 2023 LanceDB Developers
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
#
|
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
@@ -172,6 +161,26 @@ def test_pydantic_to_arrow_py38():
|
|||||||
assert schema == expect_schema
|
assert schema == expect_schema
|
||||||
|
|
||||||
|
|
||||||
|
def test_nullable_vector():
|
||||||
|
class NullableModel(pydantic.BaseModel):
|
||||||
|
vec: Vector(16, nullable=False)
|
||||||
|
|
||||||
|
schema = pydantic_to_schema(NullableModel)
|
||||||
|
assert schema == pa.schema([pa.field("vec", pa.list_(pa.float32(), 16), False)])
|
||||||
|
|
||||||
|
class DefaultModel(pydantic.BaseModel):
|
||||||
|
vec: Vector(16)
|
||||||
|
|
||||||
|
schema = pydantic_to_schema(DefaultModel)
|
||||||
|
assert schema == pa.schema([pa.field("vec", pa.list_(pa.float32(), 16), True)])
|
||||||
|
|
||||||
|
class NotNullableModel(pydantic.BaseModel):
|
||||||
|
vec: Vector(16)
|
||||||
|
|
||||||
|
schema = pydantic_to_schema(NotNullableModel)
|
||||||
|
assert schema == pa.schema([pa.field("vec", pa.list_(pa.float32(), 16), True)])
|
||||||
|
|
||||||
|
|
||||||
def test_fixed_size_list_field():
|
def test_fixed_size_list_field():
|
||||||
class TestModel(pydantic.BaseModel):
|
class TestModel(pydantic.BaseModel):
|
||||||
vec: Vector(16)
|
vec: Vector(16)
|
||||||
@@ -192,7 +201,7 @@ def test_fixed_size_list_field():
|
|||||||
schema = pydantic_to_schema(TestModel)
|
schema = pydantic_to_schema(TestModel)
|
||||||
assert schema == pa.schema(
|
assert schema == pa.schema(
|
||||||
[
|
[
|
||||||
pa.field("vec", pa.list_(pa.float32(), 16), False),
|
pa.field("vec", pa.list_(pa.float32(), 16)),
|
||||||
pa.field("li", pa.list_(pa.int64()), False),
|
pa.field("li", pa.list_(pa.int64()), False),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -6,13 +6,16 @@ from datetime import timedelta
|
|||||||
import http.server
|
import http.server
|
||||||
import json
|
import json
|
||||||
import threading
|
import threading
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from unittest.mock import MagicMock
|
from unittest.mock import MagicMock
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
import lancedb
|
import lancedb
|
||||||
from lancedb.conftest import MockTextEmbeddingFunction
|
from lancedb.conftest import MockTextEmbeddingFunction
|
||||||
from lancedb.remote import ClientConfig
|
from lancedb.remote import ClientConfig
|
||||||
|
from lancedb.util import ConnectionPool
|
||||||
from lancedb.remote.errors import HttpError, RetryError
|
from lancedb.remote.errors import HttpError, RetryError
|
||||||
|
import lancedb.util
|
||||||
import pytest
|
import pytest
|
||||||
import pyarrow as pa
|
import pyarrow as pa
|
||||||
|
|
||||||
@@ -55,6 +58,34 @@ def mock_lancedb_connection(handler):
|
|||||||
handle.join()
|
handle.join()
|
||||||
|
|
||||||
|
|
||||||
|
@contextlib.contextmanager
|
||||||
|
def mock_lancedb_connection_pool(handler):
|
||||||
|
with http.server.HTTPServer(
|
||||||
|
("localhost", 8080), make_mock_http_handler(handler)
|
||||||
|
) as server:
|
||||||
|
handle = threading.Thread(target=server.serve_forever)
|
||||||
|
handle.start()
|
||||||
|
|
||||||
|
def conn_factory():
|
||||||
|
lancedb.connect(
|
||||||
|
"db://dev",
|
||||||
|
api_key="fake",
|
||||||
|
host_override="http://localhost:8080",
|
||||||
|
client_config={
|
||||||
|
"retry_config": {"retries": 2},
|
||||||
|
"timeout_config": {
|
||||||
|
"connect_timeout": 1,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
yield ConnectionPool(conn_factory)
|
||||||
|
finally:
|
||||||
|
server.shutdown()
|
||||||
|
handle.join()
|
||||||
|
|
||||||
|
|
||||||
@contextlib.asynccontextmanager
|
@contextlib.asynccontextmanager
|
||||||
async def mock_lancedb_connection_async(handler):
|
async def mock_lancedb_connection_async(handler):
|
||||||
with http.server.HTTPServer(
|
with http.server.HTTPServer(
|
||||||
@@ -103,6 +134,47 @@ async def test_async_remote_db():
|
|||||||
assert table_names == []
|
assert table_names == []
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_async_checkout():
|
||||||
|
def handler(request):
|
||||||
|
if request.path == "/v1/table/test/describe/":
|
||||||
|
request.send_response(200)
|
||||||
|
request.send_header("Content-Type", "application/json")
|
||||||
|
request.end_headers()
|
||||||
|
response = json.dumps({"version": 42, "schema": {"fields": []}})
|
||||||
|
request.wfile.write(response.encode())
|
||||||
|
return
|
||||||
|
|
||||||
|
content_len = int(request.headers.get("Content-Length"))
|
||||||
|
body = request.rfile.read(content_len)
|
||||||
|
body = json.loads(body)
|
||||||
|
|
||||||
|
print("body is", body)
|
||||||
|
|
||||||
|
count = 0
|
||||||
|
if body["version"] == 1:
|
||||||
|
count = 100
|
||||||
|
elif body["version"] == 2:
|
||||||
|
count = 200
|
||||||
|
elif body["version"] is None:
|
||||||
|
count = 300
|
||||||
|
|
||||||
|
request.send_response(200)
|
||||||
|
request.send_header("Content-Type", "application/json")
|
||||||
|
request.end_headers()
|
||||||
|
request.wfile.write(json.dumps(count).encode())
|
||||||
|
|
||||||
|
async with mock_lancedb_connection_async(handler) as db:
|
||||||
|
table = await db.open_table("test")
|
||||||
|
assert await table.count_rows() == 300
|
||||||
|
await table.checkout(1)
|
||||||
|
assert await table.count_rows() == 100
|
||||||
|
await table.checkout(2)
|
||||||
|
assert await table.count_rows() == 200
|
||||||
|
await table.checkout_latest()
|
||||||
|
assert await table.count_rows() == 300
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_http_error():
|
async def test_http_error():
|
||||||
request_id_holder = {"request_id": None}
|
request_id_holder = {"request_id": None}
|
||||||
@@ -146,8 +218,7 @@ async def test_retry_error():
|
|||||||
assert cause.status_code == 429
|
assert cause.status_code == 429
|
||||||
|
|
||||||
|
|
||||||
@contextlib.contextmanager
|
def http_handler(query_handler):
|
||||||
def query_test_table(query_handler):
|
|
||||||
def handler(request):
|
def handler(request):
|
||||||
if request.path == "/v1/table/test/describe/":
|
if request.path == "/v1/table/test/describe/":
|
||||||
request.send_response(200)
|
request.send_response(200)
|
||||||
@@ -171,7 +242,12 @@ def query_test_table(query_handler):
|
|||||||
request.send_response(404)
|
request.send_response(404)
|
||||||
request.end_headers()
|
request.end_headers()
|
||||||
|
|
||||||
with mock_lancedb_connection(handler) as db:
|
return handler
|
||||||
|
|
||||||
|
|
||||||
|
@contextlib.contextmanager
|
||||||
|
def query_test_table(connection_ctx_mgr):
|
||||||
|
with connection_ctx_mgr as db:
|
||||||
assert repr(db) == "RemoteConnect(name=dev)"
|
assert repr(db) == "RemoteConnect(name=dev)"
|
||||||
table = db.open_table("test")
|
table = db.open_table("test")
|
||||||
assert repr(table) == "RemoteTable(dev.test)"
|
assert repr(table) == "RemoteTable(dev.test)"
|
||||||
@@ -179,6 +255,7 @@ def query_test_table(query_handler):
|
|||||||
|
|
||||||
|
|
||||||
def test_query_sync_minimal():
|
def test_query_sync_minimal():
|
||||||
|
@http_handler
|
||||||
def handler(body):
|
def handler(body):
|
||||||
assert body == {
|
assert body == {
|
||||||
"distance_type": "l2",
|
"distance_type": "l2",
|
||||||
@@ -188,28 +265,75 @@ def test_query_sync_minimal():
|
|||||||
"ef": None,
|
"ef": None,
|
||||||
"vector": [1.0, 2.0, 3.0],
|
"vector": [1.0, 2.0, 3.0],
|
||||||
"nprobes": 20,
|
"nprobes": 20,
|
||||||
|
"version": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
return pa.table({"id": [1, 2, 3]})
|
return pa.table({"id": [1, 2, 3]})
|
||||||
|
|
||||||
with query_test_table(handler) as table:
|
with query_test_table(mock_lancedb_connection(handler)) as table:
|
||||||
|
data = table.search([1, 2, 3]).to_list()
|
||||||
|
expected = [{"id": 1}, {"id": 2}, {"id": 3}]
|
||||||
|
assert data == expected
|
||||||
|
|
||||||
|
with query_test_table(mock_lancedb_connection_pool(handler).connection()) as table:
|
||||||
data = table.search([1, 2, 3]).to_list()
|
data = table.search([1, 2, 3]).to_list()
|
||||||
expected = [{"id": 1}, {"id": 2}, {"id": 3}]
|
expected = [{"id": 1}, {"id": 2}, {"id": 3}]
|
||||||
assert data == expected
|
assert data == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_sync_minimal_threaded():
|
||||||
|
num_query = 0
|
||||||
|
|
||||||
|
@http_handler
|
||||||
|
def handler(body):
|
||||||
|
assert body == {
|
||||||
|
"distance_type": "l2",
|
||||||
|
"k": 10,
|
||||||
|
"prefilter": False,
|
||||||
|
"refine_factor": None,
|
||||||
|
"ef": None,
|
||||||
|
"vector": [1.0, 2.0, 3.0],
|
||||||
|
"nprobes": 20,
|
||||||
|
"version": None,
|
||||||
|
}
|
||||||
|
nonlocal num_query
|
||||||
|
num_query += 1
|
||||||
|
|
||||||
|
return pa.table({"id": [1, 2, 3]})
|
||||||
|
|
||||||
|
pool = mock_lancedb_connection_pool(handler)
|
||||||
|
|
||||||
|
def _query(i):
|
||||||
|
with query_test_table(pool.connection()) as table:
|
||||||
|
data = table.search([1, 2, 3]).to_list()
|
||||||
|
expected = [{"id": 1}, {"id": 2}, {"id": 3}]
|
||||||
|
assert data == expected
|
||||||
|
|
||||||
|
with ThreadPoolExecutor as exec:
|
||||||
|
exec.map(_query, range(1000))
|
||||||
|
|
||||||
|
assert num_query == 1000
|
||||||
|
|
||||||
|
|
||||||
def test_query_sync_empty_query():
|
def test_query_sync_empty_query():
|
||||||
|
@http_handler
|
||||||
def handler(body):
|
def handler(body):
|
||||||
assert body == {
|
assert body == {
|
||||||
"k": 10,
|
"k": 10,
|
||||||
"filter": "true",
|
"filter": "true",
|
||||||
"vector": [],
|
"vector": [],
|
||||||
"columns": ["id"],
|
"columns": ["id"],
|
||||||
|
"version": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
return pa.table({"id": [1, 2, 3]})
|
return pa.table({"id": [1, 2, 3]})
|
||||||
|
|
||||||
with query_test_table(handler) as table:
|
with query_test_table(mock_lancedb_connection(handler)) as table:
|
||||||
|
data = table.search(None).where("true").select(["id"]).limit(10).to_list()
|
||||||
|
expected = [{"id": 1}, {"id": 2}, {"id": 3}]
|
||||||
|
assert data == expected
|
||||||
|
|
||||||
|
with query_test_table(mock_lancedb_connection_pool(handler).connection()) as table:
|
||||||
data = table.search(None).where("true").select(["id"]).limit(10).to_list()
|
data = table.search(None).where("true").select(["id"]).limit(10).to_list()
|
||||||
expected = [{"id": 1}, {"id": 2}, {"id": 3}]
|
expected = [{"id": 1}, {"id": 2}, {"id": 3}]
|
||||||
assert data == expected
|
assert data == expected
|
||||||
@@ -230,6 +354,7 @@ def test_query_sync_maximal():
|
|||||||
"vector_column": "vector2",
|
"vector_column": "vector2",
|
||||||
"fast_search": True,
|
"fast_search": True,
|
||||||
"with_row_id": True,
|
"with_row_id": True,
|
||||||
|
"version": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
return pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})
|
return pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})
|
||||||
@@ -268,6 +393,7 @@ def test_query_sync_fts():
|
|||||||
},
|
},
|
||||||
"k": 10,
|
"k": 10,
|
||||||
"vector": [],
|
"vector": [],
|
||||||
|
"version": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
return pa.table({"id": [1, 2, 3]})
|
return pa.table({"id": [1, 2, 3]})
|
||||||
@@ -284,6 +410,7 @@ def test_query_sync_fts():
|
|||||||
"k": 42,
|
"k": 42,
|
||||||
"vector": [],
|
"vector": [],
|
||||||
"with_row_id": True,
|
"with_row_id": True,
|
||||||
|
"version": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
return pa.table({"id": [1, 2, 3]})
|
return pa.table({"id": [1, 2, 3]})
|
||||||
@@ -309,6 +436,7 @@ def test_query_sync_hybrid():
|
|||||||
"k": 42,
|
"k": 42,
|
||||||
"vector": [],
|
"vector": [],
|
||||||
"with_row_id": True,
|
"with_row_id": True,
|
||||||
|
"version": None,
|
||||||
}
|
}
|
||||||
return pa.table({"_rowid": [1, 2, 3], "_score": [0.1, 0.2, 0.3]})
|
return pa.table({"_rowid": [1, 2, 3], "_score": [0.1, 0.2, 0.3]})
|
||||||
else:
|
else:
|
||||||
@@ -322,6 +450,7 @@ def test_query_sync_hybrid():
|
|||||||
"nprobes": 20,
|
"nprobes": 20,
|
||||||
"ef": None,
|
"ef": None,
|
||||||
"with_row_id": True,
|
"with_row_id": True,
|
||||||
|
"version": None,
|
||||||
}
|
}
|
||||||
return pa.table({"_rowid": [1, 2, 3], "_distance": [0.1, 0.2, 0.3]})
|
return pa.table({"_rowid": [1, 2, 3], "_distance": [0.1, 0.2, 0.3]})
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ use lancedb::table::{
|
|||||||
use pyo3::{
|
use pyo3::{
|
||||||
exceptions::{PyRuntimeError, PyValueError},
|
exceptions::{PyRuntimeError, PyValueError},
|
||||||
pyclass, pymethods,
|
pyclass, pymethods,
|
||||||
types::{PyDict, PyDictMethods, PyString},
|
types::{IntoPyDict, PyDict, PyDictMethods, PyString},
|
||||||
Bound, FromPyObject, PyAny, PyRef, PyResult, Python, ToPyObject,
|
Bound, FromPyObject, PyAny, PyRef, PyResult, Python, ToPyObject,
|
||||||
};
|
};
|
||||||
use pyo3_asyncio_0_21::tokio::future_into_py;
|
use pyo3_asyncio_0_21::tokio::future_into_py;
|
||||||
@@ -246,6 +246,33 @@ impl Table {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn list_versions(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||||
|
let inner = self_.inner_ref()?.clone();
|
||||||
|
future_into_py(self_.py(), async move {
|
||||||
|
let versions = inner.list_versions().await.infer_error()?;
|
||||||
|
let versions_as_dict = Python::with_gil(|py| {
|
||||||
|
versions
|
||||||
|
.iter()
|
||||||
|
.map(|v| {
|
||||||
|
let dict = PyDict::new_bound(py);
|
||||||
|
dict.set_item("version", v.version).unwrap();
|
||||||
|
dict.set_item(
|
||||||
|
"timestamp",
|
||||||
|
v.timestamp.timestamp_nanos_opt().unwrap_or_default(),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let tup: Vec<(&String, &String)> = v.metadata.iter().collect();
|
||||||
|
dict.set_item("metadata", tup.into_py_dict(py)).unwrap();
|
||||||
|
dict.to_object(py)
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok(versions_as_dict)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
pub fn checkout(self_: PyRef<'_, Self>, version: u64) -> PyResult<Bound<'_, PyAny>> {
|
pub fn checkout(self_: PyRef<'_, Self>, version: u64) -> PyResult<Bound<'_, PyAny>> {
|
||||||
let inner = self_.inner_ref()?.clone();
|
let inner = self_.inner_ref()?.clone();
|
||||||
future_into_py(self_.py(), async move {
|
future_into_py(self_.py(), async move {
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb-node"
|
name = "lancedb-node"
|
||||||
version = "0.13.0"
|
version = "0.13.1-beta.0"
|
||||||
description = "Serverless, low-latency vector database for AI applications"
|
description = "Serverless, low-latency vector database for AI applications"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "lancedb"
|
name = "lancedb"
|
||||||
version = "0.13.0"
|
version = "0.13.1-beta.0"
|
||||||
edition.workspace = true
|
edition.workspace = true
|
||||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||||
license.workspace = true
|
license.workspace = true
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ use http::header::CONTENT_TYPE;
|
|||||||
use http::StatusCode;
|
use http::StatusCode;
|
||||||
use lance::arrow::json::JsonSchema;
|
use lance::arrow::json::JsonSchema;
|
||||||
use lance::dataset::scanner::DatasetRecordBatchStream;
|
use lance::dataset::scanner::DatasetRecordBatchStream;
|
||||||
use lance::dataset::{ColumnAlteration, NewColumnTransform};
|
use lance::dataset::{ColumnAlteration, NewColumnTransform, Version};
|
||||||
use lance_datafusion::exec::OneShotExec;
|
use lance_datafusion::exec::OneShotExec;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use tokio::sync::RwLock;
|
use tokio::sync::RwLock;
|
||||||
@@ -363,6 +363,34 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
|
|||||||
message: "restore is not supported on LanceDB cloud.".into(),
|
message: "restore is not supported on LanceDB cloud.".into(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn list_versions(&self) -> Result<Vec<Version>> {
|
||||||
|
let request = self
|
||||||
|
.client
|
||||||
|
.post(&format!("/v1/table/{}/version/list/", self.name));
|
||||||
|
let (request_id, response) = self.client.send(request, true).await?;
|
||||||
|
let response = self.check_table_response(&request_id, response).await?;
|
||||||
|
|
||||||
|
#[derive(Deserialize)]
|
||||||
|
struct ListVersionsResponse {
|
||||||
|
versions: Vec<Version>,
|
||||||
|
}
|
||||||
|
|
||||||
|
let body = response.text().await.err_to_http(request_id.clone())?;
|
||||||
|
let body: ListVersionsResponse =
|
||||||
|
serde_json::from_str(&body).map_err(|err| Error::Http {
|
||||||
|
source: format!(
|
||||||
|
"Failed to parse list_versions response: {}, body: {}",
|
||||||
|
err, body
|
||||||
|
)
|
||||||
|
.into(),
|
||||||
|
request_id,
|
||||||
|
status_code: None,
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(body.versions)
|
||||||
|
}
|
||||||
|
|
||||||
async fn schema(&self) -> Result<SchemaRef> {
|
async fn schema(&self) -> Result<SchemaRef> {
|
||||||
let schema = self.describe().await?.schema;
|
let schema = self.describe().await?.schema;
|
||||||
Ok(Arc::new(schema.try_into()?))
|
Ok(Arc::new(schema.try_into()?))
|
||||||
@@ -775,6 +803,7 @@ mod tests {
|
|||||||
use arrow::{array::AsArray, compute::concat_batches, datatypes::Int32Type};
|
use arrow::{array::AsArray, compute::concat_batches, datatypes::Int32Type};
|
||||||
use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator};
|
use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator};
|
||||||
use arrow_schema::{DataType, Field, Schema};
|
use arrow_schema::{DataType, Field, Schema};
|
||||||
|
use chrono::{DateTime, Utc};
|
||||||
use futures::{future::BoxFuture, StreamExt, TryFutureExt};
|
use futures::{future::BoxFuture, StreamExt, TryFutureExt};
|
||||||
use lance_index::scalar::FullTextSearchQuery;
|
use lance_index::scalar::FullTextSearchQuery;
|
||||||
use reqwest::Body;
|
use reqwest::Body;
|
||||||
@@ -1489,6 +1518,51 @@ mod tests {
|
|||||||
assert_eq!(indices, expected);
|
assert_eq!(indices, expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_list_versions() {
|
||||||
|
let table = Table::new_with_handler("my_table", |request| {
|
||||||
|
assert_eq!(request.method(), "POST");
|
||||||
|
assert_eq!(request.url().path(), "/v1/table/my_table/version/list/");
|
||||||
|
|
||||||
|
let version1 = lance::dataset::Version {
|
||||||
|
version: 1,
|
||||||
|
timestamp: "2024-01-01T00:00:00Z".parse().unwrap(),
|
||||||
|
metadata: Default::default(),
|
||||||
|
};
|
||||||
|
let version2 = lance::dataset::Version {
|
||||||
|
version: 2,
|
||||||
|
timestamp: "2024-02-01T00:00:00Z".parse().unwrap(),
|
||||||
|
metadata: Default::default(),
|
||||||
|
};
|
||||||
|
let response_body = serde_json::json!({
|
||||||
|
"versions": [
|
||||||
|
version1,
|
||||||
|
version2,
|
||||||
|
]
|
||||||
|
});
|
||||||
|
let response_body = serde_json::to_string(&response_body).unwrap();
|
||||||
|
|
||||||
|
http::Response::builder()
|
||||||
|
.status(200)
|
||||||
|
.body(response_body)
|
||||||
|
.unwrap()
|
||||||
|
});
|
||||||
|
|
||||||
|
let versions = table.list_versions().await.unwrap();
|
||||||
|
assert_eq!(versions.len(), 2);
|
||||||
|
assert_eq!(versions[0].version, 1);
|
||||||
|
assert_eq!(
|
||||||
|
versions[0].timestamp,
|
||||||
|
"2024-01-01T00:00:00Z".parse::<DateTime<Utc>>().unwrap()
|
||||||
|
);
|
||||||
|
assert_eq!(versions[1].version, 2);
|
||||||
|
assert_eq!(
|
||||||
|
versions[1].timestamp,
|
||||||
|
"2024-02-01T00:00:00Z".parse::<DateTime<Utc>>().unwrap()
|
||||||
|
);
|
||||||
|
// assert_eq!(versions, expected);
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_index_stats() {
|
async fn test_index_stats() {
|
||||||
let table = Table::new_with_handler("my_table", |request| {
|
let table = Table::new_with_handler("my_table", |request| {
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ pub use lance::dataset::ColumnAlteration;
|
|||||||
pub use lance::dataset::NewColumnTransform;
|
pub use lance::dataset::NewColumnTransform;
|
||||||
pub use lance::dataset::ReadParams;
|
pub use lance::dataset::ReadParams;
|
||||||
use lance::dataset::{
|
use lance::dataset::{
|
||||||
Dataset, UpdateBuilder as LanceUpdateBuilder, WhenMatched, WriteMode, WriteParams,
|
Dataset, UpdateBuilder as LanceUpdateBuilder, Version, WhenMatched, WriteMode, WriteParams,
|
||||||
};
|
};
|
||||||
use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatchedBySource};
|
use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatchedBySource};
|
||||||
use lance::io::WrappingObjectStore;
|
use lance::io::WrappingObjectStore;
|
||||||
@@ -426,6 +426,7 @@ pub(crate) trait TableInternal: std::fmt::Display + std::fmt::Debug + Send + Syn
|
|||||||
async fn checkout(&self, version: u64) -> Result<()>;
|
async fn checkout(&self, version: u64) -> Result<()>;
|
||||||
async fn checkout_latest(&self) -> Result<()>;
|
async fn checkout_latest(&self) -> Result<()>;
|
||||||
async fn restore(&self) -> Result<()>;
|
async fn restore(&self) -> Result<()>;
|
||||||
|
async fn list_versions(&self) -> Result<Vec<Version>>;
|
||||||
async fn table_definition(&self) -> Result<TableDefinition>;
|
async fn table_definition(&self) -> Result<TableDefinition>;
|
||||||
fn dataset_uri(&self) -> &str;
|
fn dataset_uri(&self) -> &str;
|
||||||
}
|
}
|
||||||
@@ -955,6 +956,11 @@ impl Table {
|
|||||||
self.inner.restore().await
|
self.inner.restore().await
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// List all the versions of the table
|
||||||
|
pub async fn list_versions(&self) -> Result<Vec<Version>> {
|
||||||
|
self.inner.list_versions().await
|
||||||
|
}
|
||||||
|
|
||||||
/// List all indices that have been created with [`Self::create_index`]
|
/// List all indices that have been created with [`Self::create_index`]
|
||||||
pub async fn list_indices(&self) -> Result<Vec<IndexConfig>> {
|
pub async fn list_indices(&self) -> Result<Vec<IndexConfig>> {
|
||||||
self.inner.list_indices().await
|
self.inner.list_indices().await
|
||||||
@@ -1319,7 +1325,7 @@ impl NativeTable {
|
|||||||
let (indices, mf) = futures::try_join!(dataset.load_indices(), dataset.latest_manifest())?;
|
let (indices, mf) = futures::try_join!(dataset.load_indices(), dataset.latest_manifest())?;
|
||||||
Ok(indices
|
Ok(indices
|
||||||
.iter()
|
.iter()
|
||||||
.map(|i| VectorIndex::new_from_format(&mf, i))
|
.map(|i| VectorIndex::new_from_format(&(mf.0), i))
|
||||||
.collect())
|
.collect())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1707,6 +1713,10 @@ impl TableInternal for NativeTable {
|
|||||||
self.dataset.reload().await
|
self.dataset.reload().await
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn list_versions(&self) -> Result<Vec<Version>> {
|
||||||
|
Ok(self.dataset.get().await?.versions().await?)
|
||||||
|
}
|
||||||
|
|
||||||
async fn restore(&self) -> Result<()> {
|
async fn restore(&self) -> Result<()> {
|
||||||
let version =
|
let version =
|
||||||
self.dataset
|
self.dataset
|
||||||
|
|||||||
Reference in New Issue
Block a user