Compare commits


5 Commits

Author SHA1 Message Date
albertlockett b7fed59278 linter and clippy 2024-11-21 07:11:35 -05:00
albertlockett 60ad82b6ad add tests for rust 2024-11-21 06:58:51 -05:00
albertlockett 134258308c it passes version for all read calls 2024-11-20 11:46:04 -05:00
albertlockett d36334d565 fixed for describe 2024-11-20 10:14:39 -05:00
albertlockett 131c01d702 feat: support for checkout and checkout_latest in remote rust and python sdks 2024-11-19 17:24:28 -05:00
50 changed files with 201 additions and 757 deletions

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.13.1-beta.0"
current_version = "0.13.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.
@@ -87,16 +87,6 @@ glob = "node/package.json"
replace = "\"@lancedb/vectordb-linux-x64-gnu\": \"{new_version}\""
search = "\"@lancedb/vectordb-linux-x64-gnu\": \"{current_version}\""
[[tool.bumpversion.files]]
glob = "node/package.json"
replace = "\"@lancedb/vectordb-linux-arm64-musl\": \"{new_version}\""
search = "\"@lancedb/vectordb-linux-arm64-musl\": \"{current_version}\""
[[tool.bumpversion.files]]
glob = "node/package.json"
replace = "\"@lancedb/vectordb-linux-x64-musl\": \"{new_version}\""
search = "\"@lancedb/vectordb-linux-x64-musl\": \"{current_version}\""
[[tool.bumpversion.files]]
glob = "node/package.json"
replace = "\"@lancedb/vectordb-win32-x64-msvc\": \"{new_version}\""

View File

@@ -31,9 +31,6 @@ rustflags = [
[target.x86_64-unknown-linux-gnu]
rustflags = ["-C", "target-cpu=haswell", "-C", "target-feature=+avx2,+fma,+f16c"]
[target.x86_64-unknown-linux-musl]
rustflags = ["-C", "target-cpu=haswell", "-C", "target-feature=-crt-static,+avx2,+fma,+f16c"]
[target.aarch64-apple-darwin]
rustflags = ["-C", "target-cpu=apple-m1", "-C", "target-feature=+neon,+fp16,+fhm,+dotprod"]

View File

@@ -101,7 +101,7 @@ jobs:
path: |
nodejs/dist/*.node
node-linux-gnu:
node-linux:
name: vectordb (${{ matrix.config.arch}}-unknown-linux-gnu)
runs-on: ${{ matrix.config.runner }}
# Only runs on tags that match the make-release action
@@ -137,63 +137,11 @@ jobs:
- name: Upload Linux Artifacts
uses: actions/upload-artifact@v4
with:
name: node-native-linux-${{ matrix.config.arch }}-gnu
name: node-native-linux-${{ matrix.config.arch }}
path: |
node/dist/lancedb-vectordb-linux*.tgz
node-linux-musl:
name: vectordb (${{ matrix.config.arch}}-unknown-linux-musl)
runs-on: ubuntu-latest
container: alpine:edge
# Only runs on tags that match the make-release action
if: startsWith(github.ref, 'refs/tags/v')
strategy:
fail-fast: false
matrix:
config:
- arch: x86_64
- arch: aarch64
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install common dependencies
run: |
apk add protobuf-dev curl clang mold grep npm bash
curl --proto '=https' --tlsv1.3 -sSf https://raw.githubusercontent.com/rust-lang/rustup/refs/heads/master/rustup-init.sh | sh -s -- -y --default-toolchain 1.80.0
echo "source $HOME/.cargo/env" >> saved_env
echo "export CC=clang" >> saved_env
echo "export RUSTFLAGS='-Ctarget-cpu=haswell -Ctarget-feature=-crt-static,+avx2,+fma,+f16c -Clinker=clang -Clink-arg=-fuse-ld=mold'" >> saved_env
- name: Configure aarch64 build
if: ${{ matrix.config.arch == 'aarch64' }}
run: |
source "$HOME/.cargo/env"
rustup target add aarch64-unknown-linux-musl --toolchain 1.80.0
crt=$(realpath $(dirname $(rustup which rustc))/../lib/rustlib/aarch64-unknown-linux-musl/lib/self-contained)
sysroot_lib=/usr/aarch64-unknown-linux-musl/usr/lib
apk_url=https://dl-cdn.alpinelinux.org/alpine/latest-stable/main/aarch64/
curl -sSf $apk_url > apk_list
for pkg in gcc libgcc musl; do curl -sSf $apk_url$(cat apk_list | grep -oP '(?<=")'$pkg'-\d.*?(?=")') | tar zxf -; done
mkdir -p $sysroot_lib
echo 'GROUP ( libgcc_s.so.1 -lgcc )' > $sysroot_lib/libgcc_s.so
cp usr/lib/libgcc_s.so.1 $sysroot_lib
cp usr/lib/gcc/aarch64-alpine-linux-musl/*/libgcc.a $sysroot_lib
cp lib/ld-musl-aarch64.so.1 $sysroot_lib/libc.so
echo '!<arch>' > $sysroot_lib/libdl.a
(cd $crt && cp crti.o crtbeginS.o crtendS.o crtn.o -t $sysroot_lib)
echo "export CARGO_BUILD_TARGET=aarch64-unknown-linux-musl" >> saved_env
echo "export RUSTFLAGS='-Ctarget-cpu=apple-m1 -Ctarget-feature=-crt-static,+neon,+fp16,+fhm,+dotprod -Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=--target=aarch64-unknown-linux-musl -Clink-arg=--sysroot=/usr/aarch64-unknown-linux-musl -Clink-arg=-lc'" >> saved_env
- name: Build Linux Artifacts
run: |
source ./saved_env
bash ci/manylinux_node/build_vectordb.sh ${{ matrix.config.arch }}
- name: Upload Linux Artifacts
uses: actions/upload-artifact@v4
with:
name: node-native-linux-${{ matrix.config.arch }}-musl
path: |
node/dist/lancedb-vectordb-linux*.tgz
nodejs-linux-gnu:
nodejs-linux:
name: lancedb (${{ matrix.config.arch}}-unknown-linux-gnu)
runs-on: ${{ matrix.config.runner }}
# Only runs on tags that match the make-release action
@@ -230,7 +178,7 @@ jobs:
- name: Upload Linux Artifacts
uses: actions/upload-artifact@v4
with:
name: nodejs-native-linux-${{ matrix.config.arch }}-gnu
name: nodejs-native-linux-${{ matrix.config.arch }}
path: |
nodejs/dist/*.node
# The generic files are the same in all distros so we just pick
@@ -244,62 +192,6 @@ jobs:
nodejs/dist/*
!nodejs/dist/*.node
nodejs-linux-musl:
name: lancedb (${{ matrix.config.arch}}-unknown-linux-musl)
runs-on: ubuntu-latest
container: alpine:edge
# Only runs on tags that match the make-release action
if: startsWith(github.ref, 'refs/tags/v')
strategy:
fail-fast: false
matrix:
config:
- arch: x86_64
- arch: aarch64
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install common dependencies
run: |
apk add protobuf-dev curl clang mold grep npm bash openssl-dev openssl-libs-static
curl --proto '=https' --tlsv1.3 -sSf https://raw.githubusercontent.com/rust-lang/rustup/refs/heads/master/rustup-init.sh | sh -s -- -y --default-toolchain 1.80.0
echo "source $HOME/.cargo/env" >> saved_env
echo "export CC=clang" >> saved_env
echo "export RUSTFLAGS='-Ctarget-cpu=haswell -Ctarget-feature=-crt-static,+avx2,+fma,+f16c -Clinker=clang -Clink-arg=-fuse-ld=mold'" >> saved_env
echo "export X86_64_UNKNOWN_LINUX_MUSL_OPENSSL_INCLUDE_DIR=/usr/include" >> saved_env
echo "export X86_64_UNKNOWN_LINUX_MUSL_OPENSSL_LIB_DIR=/usr/lib" >> saved_env
- name: Configure aarch64 build
if: ${{ matrix.config.arch == 'aarch64' }}
run: |
source "$HOME/.cargo/env"
rustup target add aarch64-unknown-linux-musl --toolchain 1.80.0
crt=$(realpath $(dirname $(rustup which rustc))/../lib/rustlib/aarch64-unknown-linux-musl/lib/self-contained)
sysroot_lib=/usr/aarch64-unknown-linux-musl/usr/lib
apk_url=https://dl-cdn.alpinelinux.org/alpine/latest-stable/main/aarch64/
curl -sSf $apk_url > apk_list
for pkg in gcc libgcc musl openssl-dev openssl-libs-static; do curl -sSf $apk_url$(cat apk_list | grep -oP '(?<=")'$pkg'-\d.*?(?=")') | tar zxf -; done
mkdir -p $sysroot_lib
echo 'GROUP ( libgcc_s.so.1 -lgcc )' > $sysroot_lib/libgcc_s.so
cp usr/lib/libgcc_s.so.1 $sysroot_lib
cp usr/lib/gcc/aarch64-alpine-linux-musl/*/libgcc.a $sysroot_lib
cp lib/ld-musl-aarch64.so.1 $sysroot_lib/libc.so
echo '!<arch>' > $sysroot_lib/libdl.a
(cd $crt && cp crti.o crtbeginS.o crtendS.o crtn.o -t $sysroot_lib)
echo "export CARGO_BUILD_TARGET=aarch64-unknown-linux-musl" >> saved_env
echo "export RUSTFLAGS='-Ctarget-feature=-crt-static,+neon,+fp16,+fhm,+dotprod -Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=--target=aarch64-unknown-linux-musl -Clink-arg=--sysroot=/usr/aarch64-unknown-linux-musl -Clink-arg=-lc'" >> saved_env
echo "export AARCH64_UNKNOWN_LINUX_MUSL_OPENSSL_INCLUDE_DIR=$(realpath usr/include)" >> saved_env
echo "export AARCH64_UNKNOWN_LINUX_MUSL_OPENSSL_LIB_DIR=$(realpath usr/lib)" >> saved_env
- name: Build Linux Artifacts
run: |
source ./saved_env
bash ci/manylinux_node/build_lancedb.sh ${{ matrix.config.arch }}
- name: Upload Linux Artifacts
uses: actions/upload-artifact@v4
with:
name: nodejs-native-linux-${{ matrix.config.arch }}-musl
path: |
nodejs/dist/*.node
node-windows:
name: vectordb ${{ matrix.target }}
runs-on: windows-2022
@@ -568,7 +460,7 @@ jobs:
release:
name: vectordb NPM Publish
needs: [node, node-macos, node-linux-gnu, node-linux-musl, node-windows]
needs: [node, node-macos, node-linux, node-windows]
runs-on: ubuntu-latest
# Only runs on tags that match the make-release action
if: startsWith(github.ref, 'refs/tags/v')
@@ -608,7 +500,7 @@ jobs:
release-nodejs:
name: lancedb NPM Publish
needs: [nodejs-macos, nodejs-linux-gnu, nodejs-linux-musl, nodejs-windows]
needs: [nodejs-macos, nodejs-linux, nodejs-windows]
runs-on: ubuntu-latest
# Only runs on tags that match the make-release action
if: startsWith(github.ref, 'refs/tags/v')

View File

@@ -21,15 +21,15 @@ categories = ["database-implementations"]
rust-version = "1.80.0" # TODO: lower this once we upgrade Lance again.
[workspace.dependencies]
lance = { "version" = "=0.20.0", "features" = [
lance = { "version" = "=0.19.3", "features" = [
"dynamodb",
], git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
lance-index = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
lance-linalg = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
lance-table = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
lance-testing = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
lance-datafusion = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
lance-encoding = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
], git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
lance-index = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
lance-linalg = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
lance-table = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
lance-testing = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
lance-datafusion = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
lance-encoding = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
# Note that this one does not include pyarrow
arrow = { version = "52.2", optional = false }
arrow-array = "52.2"

View File

@@ -11,8 +11,7 @@ fi
export OPENSSL_STATIC=1
export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
#Alpine doesn't have .bashrc
FILE=$HOME/.bashrc && test -f $FILE && source $FILE
source $HOME/.bashrc
cd nodejs
npm ci

View File

@@ -5,14 +5,13 @@ ARCH=${1:-x86_64}
if [ "$ARCH" = "x86_64" ]; then
export OPENSSL_LIB_DIR=/usr/local/lib64/
else
else
export OPENSSL_LIB_DIR=/usr/local/lib/
fi
export OPENSSL_STATIC=1
export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
#Alpine doesn't have .bashrc
FILE=$HOME/.bashrc && test -f $FILE && source $FILE
source $HOME/.bashrc
cd node
npm ci

View File

@@ -138,7 +138,6 @@ nav:
- Jina Reranker: reranking/jina.md
- OpenAI Reranker: reranking/openai.md
- AnswerDotAi Rerankers: reranking/answerdotai.md
- Voyage AI Rerankers: reranking/voyageai.md
- Building Custom Rerankers: reranking/custom_reranker.md
- Example: notebooks/lancedb_reranking.ipynb
- Filtering: sql.md
@@ -166,7 +165,6 @@ nav:
- Jina Embeddings: embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md
- AWS Bedrock Text Embedding Functions: embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md
- IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md
- Voyage AI Embeddings: embeddings/available_embedding_models/text_embedding_functions/voyageai_embedding.md
- Multimodal Embedding Functions:
- OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
- Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md

View File

@@ -277,15 +277,7 @@ Product quantization can lead to approximately `16 * sizeof(float32) / 1 = 64` t
A higher number of partitions can lead to more efficient I/O during queries and better accuracy, but it takes much more time to train.
On the `SIFT-1M` dataset, our benchmark shows that keeping each partition at 1K-4K rows leads to a good latency/recall trade-off.
`num_sub_vectors` specifies how many Product Quantization (PQ) short codes to generate on each vector. The number should be a factor of the vector dimension. Because
`num_sub_vectors` specifies how many Product Quantization (PQ) short codes to generate on each vector. Because
PQ is a lossy compression of the original vector, a higher `num_sub_vectors` usually results in
less space distortion, and thus yields better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and more PQ computation, and thus higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency.
!!! note
If `num_sub_vectors` is set to be greater than the vector dimension, you will see errors like `attempt to divide by zero`.
### How to choose `m` and `ef_construction` for `IVF_HNSW_*` index?
`m` determines the number of connections a new node establishes with its closest neighbors upon entering the graph. Typically, `m` falls within the range of 5 to 48. Lower `m` values are suitable for low-dimensional data or scenarios where recall is less critical. Conversely, higher `m` values are beneficial for high-dimensional data or when high recall is required. In essence, a larger `m` results in a denser graph with increased connectivity, but at the expense of higher memory consumption.
`ef_construction` balances build speed and accuracy. Higher values increase accuracy but slow down the build process. A typical range is 150 to 300. For good search results, a minimum value of 100 is recommended. In most cases, setting this value above 500 offers no additional benefit. Ensure that `ef_construction` is always set to a value equal to or greater than `ef` in the search phase.
less space distortion, and thus yields better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and
more PQ computation, and thus, higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency.
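To make this guidance concrete, here is a minimal sketch of creating both index types with the Python SDK (a sketch, assuming the `index_type`, `m`, and `ef_construction` keyword arguments of `create_index`; the path, table, and 768-dim column are illustrative):

```python
import lancedb

db = lancedb.connect("/tmp/lancedb")  # illustrative path
tbl = db.open_table("my_vectors")     # illustrative table with a 768-dim vector column

# IVF_PQ: target 1K-4K rows per partition; dimension / num_sub_vectors
# should be a multiple of 8 (here 768 / 96 = 8).
tbl.create_index(metric="L2", num_partitions=256, num_sub_vectors=96)

# IVF_HNSW_*: m typically 5-48, ef_construction typically 150-300,
# and ef_construction >= the ef used at search time.
tbl.create_index(
    metric="L2",
    index_type="IVF_HNSW_SQ",
    m=20,
    ef_construction=300,
)
```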

View File

@@ -57,13 +57,6 @@ Then the greedy search routine operates as follows:
## Usage
There are three key parameters to set when constructing an HNSW index:
* `metric`: Use an `L2` Euclidean distance metric. We also support `dot` and `cosine` distances.
* `m`: The number of neighbors to select for each vector in the HNSW graph.
* `ef_construction`: The number of candidates to evaluate during the construction of the HNSW graph.
We can combine the above concepts to understand how to build and query an HNSW index in LanceDB.
### Construct index

View File

@@ -58,10 +58,8 @@ In Python, the index can be created as follows:
# Make sure you have enough data in the table for an effective training step
tbl.create_index(metric="L2", num_partitions=256, num_sub_vectors=96)
```
!!! note
`num_partitions`=256 and `num_sub_vectors`=96 does not work for every dataset. Those values needs to be adjusted for your particular dataset.
The `num_partitions` is usually chosen to target a particular number of vectors per partition. `num_sub_vectors` is typically chosen based on the desired recall and the dimensionality of the vector. See [here](../ann_indexes.md/#how-to-choose-num_partitions-and-num_sub_vectors-for-ivf_pq-index) for best practices on choosing these parameters.
The `num_partitions` is usually chosen to target a particular number of vectors per partition. `num_sub_vectors` is typically chosen based on the desired recall and the dimensionality of the vector. See the [FAQs](#faq) below for best practices on choosing these parameters.
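As a back-of-the-envelope sketch of these heuristics (all concrete numbers below are illustrative):

```python
num_rows = 1_000_000  # illustrative table size
dim = 768             # illustrative vector dimension

# Target roughly 1K-4K rows per partition.
target_rows_per_partition = 2048
num_partitions = max(1, num_rows // target_rows_per_partition)  # ~488

# Pick num_sub_vectors so that dim / num_sub_vectors is a multiple of 8.
num_sub_vectors = dim // 8  # 96 sub-vectors of 8 floats each
```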
### Query the index

View File

@@ -114,45 +114,12 @@ table.create_fts_index("text",
LanceDB full-text search supports filtering the search results by a condition; both pre-filtering and post-filtering are supported.
This can be invoked via the familiar `where` syntax.
With pre-filtering:
This can be invoked via the familiar `where` syntax:
=== "Python"
```python
table.search("puppy").limit(10).where("meta='foo'", prefilte=True).to_list()
```
=== "TypeScript"
```typescript
await tbl
.search("puppy")
.select(["id", "doc"])
.limit(10)
.where("meta='foo'")
.prefilter(true)
.toArray();
```
=== "Rust"
```rust
table
.query()
.full_text_search(FullTextSearchQuery::new("puppy".to_owned()))
.select(lancedb::query::Select::Columns(vec!["doc".to_owned()]))
.limit(10)
.only_if("meta='foo'")
.execute()
.await?;
```
With post-filtering:
=== "Python"
```python
table.search("puppy").limit(10).where("meta='foo'", prefilte=False).to_list()
table.search("puppy").limit(10).where("meta='foo'").to_list()
```
=== "TypeScript"
@@ -163,7 +130,6 @@ With post-filtering:
.select(["id", "doc"])
.limit(10)
.where("meta='foo'")
.prefilter(false)
.toArray();
```
@@ -174,7 +140,6 @@ With post-filtering:
.query()
.full_text_search(FullTextSearchQuery::new(words[0].to_owned()))
.select(lancedb::query::Select::Columns(vec!["doc".to_owned()]))
.postfilter()
.limit(10)
.only_if("meta='foo'")
.execute()
@@ -224,6 +189,3 @@ This can make the query more efficient, especially when the table is large and t
tbl.add(more_data).execute().await?;
tbl.optimize(OptimizeAction::All).execute().await?;
```
!!! note
New data added after creating the FTS index will appear in search results while the incremental index is still in progress, but with increased latency due to a flat search on the unindexed portion. LanceDB Cloud automates this merging process, minimizing the impact on search speed.
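A Python counterpart to the Rust snippet above, as a sketch assuming the sync SDK's `add` and `optimize` methods:

```python
tbl.add(more_data)  # new rows are searchable immediately via a flat scan
tbl.optimize()      # merge the unindexed rows into the FTS index
```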

View File

@@ -153,7 +153,9 @@ table.create_fts_index(["title", "content"], use_tantivy=True, writer_heap_size=
## Current limitations
1. New data added after creating the FTS index will appear in search results, but with increased latency due to a flat search on the unindexed portion. Re-indexing with `create_fts_index` will reduce latency. LanceDB Cloud automates this merging process, minimizing the impact on search speed.
1. We do not yet support incremental writes.
If you add data after FTS index creation, it won't be reflected
in search results until you do a full reindex.
2. We currently only support local filesystem paths for the FTS index.
This is a tantivy limitation. We've implemented an object store plugin

View File

@@ -6,9 +6,6 @@ This re-ranker uses the [Cohere](https://cohere.ai/) API to rerank the search re
!!! note
Supported Query Types: Hybrid, Vector, FTS
```shell
pip install cohere
```
```python
import numpy

View File

@@ -8,7 +8,7 @@
<parent>
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.13.1-beta.0</version>
<version>0.13.0-final.0</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<groupId>com.lancedb</groupId>
<artifactId>lancedb-parent</artifactId>
<version>0.13.1-beta.0</version>
<version>0.13.0-final.0</version>
<packaging>pom</packaging>
<name>LanceDB Parent</name>

node/package-lock.json (generated)
View File

@@ -1,12 +1,12 @@
{
"name": "vectordb",
"version": "0.13.1-beta.0",
"version": "0.13.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "vectordb",
"version": "0.13.1-beta.0",
"version": "0.13.0",
"cpu": [
"x64",
"arm64"
@@ -52,14 +52,12 @@
"uuid": "^9.0.0"
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.13.1-beta.0",
"@lancedb/vectordb-darwin-x64": "0.13.1-beta.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.13.1-beta.0",
"@lancedb/vectordb-linux-arm64-musl": "0.13.1-beta.0",
"@lancedb/vectordb-linux-x64-gnu": "0.13.1-beta.0",
"@lancedb/vectordb-linux-x64-musl": "0.13.1-beta.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.13.1-beta.0",
"@lancedb/vectordb-win32-x64-msvc": "0.13.1-beta.0"
"@lancedb/vectordb-darwin-arm64": "0.13.0",
"@lancedb/vectordb-darwin-x64": "0.13.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0",
"@lancedb/vectordb-linux-x64-gnu": "0.13.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0",
"@lancedb/vectordb-win32-x64-msvc": "0.13.0"
},
"peerDependencies": {
"@apache-arrow/ts": "^14.0.2",
@@ -329,6 +327,66 @@
"@jridgewell/sourcemap-codec": "^1.4.10"
}
},
"node_modules/@lancedb/vectordb-darwin-arm64": {
"version": "0.13.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.13.0.tgz",
"integrity": "sha512-8hdcjkRmgrdQYf1jN+DyZae40LIv8UUfnWy70Uid5qy63sSvRW/+MvIdqIPFr9QlLUXmpyyQuX0y3bZhUR99cQ==",
"cpu": [
"arm64"
],
"optional": true,
"os": [
"darwin"
]
},
"node_modules/@lancedb/vectordb-darwin-x64": {
"version": "0.13.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.13.0.tgz",
"integrity": "sha512-fWzAY4l5SQtNfMYh80v+M66ugZHhdxbkpk5mNEv6Zsug3DL6kRj3Uv31/i0wgzY6F5G3LUlbjZerN+eTnDLwOw==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"darwin"
]
},
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
"version": "0.13.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.13.0.tgz",
"integrity": "sha512-ltwAT9baOSuR5YiGykQXPC8/HGYF13vpI47qxhP9yfgiz9pA8EUn8p8YrBRzq7J4DIZ4b8JSVDXQnMIqEtB4Kg==",
"cpu": [
"arm64"
],
"optional": true,
"os": [
"linux"
]
},
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
"version": "0.13.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.13.0.tgz",
"integrity": "sha512-MiT/RBlMPGGRh7BX+MXwRuNiiUnKmuDcHH8nm88IH28T7TQxXIbA9w6UpSg5m9f3DgKQI2K8oLi29oKIB8ZwDQ==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"linux"
]
},
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
"version": "0.13.0",
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.13.0.tgz",
"integrity": "sha512-SovP/hwWYLJIy65DKbVuXlBPTb/nwvVpTO6dh9zRch+L5ek6JmVAkwsfeTS2p5bMa8VPujsCXYUAVuCDEJU8wg==",
"cpu": [
"x64"
],
"optional": true,
"os": [
"win32"
]
},
"node_modules/@neon-rs/cli": {
"version": "0.0.160",
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz",

View File

@@ -1,6 +1,6 @@
{
"name": "vectordb",
"version": "0.13.1-beta.0",
"version": "0.13.0",
"description": " Serverless, low-latency vector database for AI applications",
"main": "dist/index.js",
"types": "dist/index.d.ts",
@@ -84,20 +84,16 @@
"aarch64-apple-darwin": "@lancedb/vectordb-darwin-arm64",
"x86_64-unknown-linux-gnu": "@lancedb/vectordb-linux-x64-gnu",
"aarch64-unknown-linux-gnu": "@lancedb/vectordb-linux-arm64-gnu",
"x86_64-unknown-linux-musl": "@lancedb/vectordb-linux-x64-musl",
"aarch64-unknown-linux-musl": "@lancedb/vectordb-linux-arm64-musl",
"x86_64-pc-windows-msvc": "@lancedb/vectordb-win32-x64-msvc",
"aarch64-pc-windows-msvc": "@lancedb/vectordb-win32-arm64-msvc"
}
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-x64": "0.13.1-beta.0",
"@lancedb/vectordb-darwin-arm64": "0.13.1-beta.0",
"@lancedb/vectordb-linux-x64-gnu": "0.13.1-beta.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.13.1-beta.0",
"@lancedb/vectordb-linux-x64-musl": "0.13.1-beta.0",
"@lancedb/vectordb-linux-arm64-musl": "0.13.1-beta.0",
"@lancedb/vectordb-win32-x64-msvc": "0.13.1-beta.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.13.1-beta.0"
"@lancedb/vectordb-darwin-arm64": "0.13.0",
"@lancedb/vectordb-darwin-x64": "0.13.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0",
"@lancedb/vectordb-linux-x64-gnu": "0.13.0",
"@lancedb/vectordb-win32-x64-msvc": "0.13.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0"
}
}

View File

@@ -1,7 +1,7 @@
[package]
name = "lancedb-nodejs"
edition.workspace = true
version = "0.13.1-beta.0"
version = "0.13.0"
license.workspace = true
description.workspace = true
repository.workspace = true

View File

@@ -87,12 +87,6 @@ export interface OptimizeOptions {
deleteUnverified: boolean;
}
export interface Version {
version: number;
timestamp: Date;
metadata: Record<string, string>;
}
/**
* A Table is a collection of Records in a LanceDB Database.
*
@@ -366,11 +360,6 @@ export abstract class Table {
*/
abstract checkoutLatest(): Promise<void>;
/**
* List all the versions of the table
*/
abstract listVersions(): Promise<Version[]>;
/**
* Restore the table to the currently checked out version
*
@@ -670,14 +659,6 @@ export class LocalTable extends Table {
await this.inner.checkoutLatest();
}
async listVersions(): Promise<Version[]> {
return (await this.inner.listVersions()).map((version) => ({
version: version.version,
timestamp: new Date(version.timestamp / 1000),
metadata: version.metadata,
}));
}
async restore(): Promise<void> {
await this.inner.restore();
}

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-arm64",
"version": "0.13.1-beta.0",
"version": "0.13.0",
"os": ["darwin"],
"cpu": ["arm64"],
"main": "lancedb.darwin-arm64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-darwin-x64",
"version": "0.13.1-beta.0",
"version": "0.13.0",
"os": ["darwin"],
"cpu": ["x64"],
"main": "lancedb.darwin-x64.node",

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-arm64-gnu",
"version": "0.13.1-beta.0",
"version": "0.13.0",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-gnu.node",

View File

@@ -1,3 +0,0 @@
# `@lancedb/lancedb-linux-arm64-musl`
This is the **aarch64-unknown-linux-musl** binary for `@lancedb/lancedb`

View File

@@ -1,13 +0,0 @@
{
"name": "@lancedb/lancedb-linux-arm64-musl",
"version": "0.13.1-beta.0",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-musl.node",
"files": ["lancedb.linux-arm64-musl.node"],
"license": "Apache 2.0",
"engines": {
"node": ">= 18"
},
"libc": ["musl"]
}

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-linux-x64-gnu",
"version": "0.13.1-beta.0",
"version": "0.13.0",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-gnu.node",

View File

@@ -1,3 +0,0 @@
# `@lancedb/lancedb-linux-x64-musl`
This is the **x86_64-unknown-linux-musl** binary for `@lancedb/lancedb`

View File

@@ -1,13 +0,0 @@
{
"name": "@lancedb/lancedb-linux-x64-musl",
"version": "0.13.1-beta.0",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-musl.node",
"files": ["lancedb.linux-x64-musl.node"],
"license": "Apache 2.0",
"engines": {
"node": ">= 18"
},
"libc": ["musl"]
}

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-arm64-msvc",
"version": "0.13.1-beta.0",
"version": "0.13.0",
"os": [
"win32"
],

View File

@@ -1,6 +1,6 @@
{
"name": "@lancedb/lancedb-win32-x64-msvc",
"version": "0.13.1-beta.0",
"version": "0.13.0",
"os": ["win32"],
"cpu": ["x64"],
"main": "lancedb.win32-x64-msvc.node",

View File

@@ -10,7 +10,7 @@
"vector database",
"ann"
],
"version": "0.13.1-beta.0",
"version": "0.13.0",
"main": "dist/index.js",
"exports": {
".": "./dist/index.js",
@@ -24,12 +24,10 @@
"triples": {
"defaults": false,
"additional": [
"x86_64-apple-darwin",
"aarch64-apple-darwin",
"x86_64-unknown-linux-gnu",
"aarch64-unknown-linux-gnu",
"x86_64-unknown-linux-musl",
"aarch64-unknown-linux-musl",
"x86_64-apple-darwin",
"x86_64-unknown-linux-gnu",
"x86_64-pc-windows-msvc"
]
}

View File

@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use arrow_ipc::writer::FileWriter;
use lancedb::ipc::ipc_file_to_batches;
use lancedb::table::{
@@ -228,28 +226,6 @@ impl Table {
self.inner_ref()?.checkout_latest().await.default_error()
}
#[napi(catch_unwind)]
pub async fn list_versions(&self) -> napi::Result<Vec<Version>> {
self.inner_ref()?
.list_versions()
.await
.map(|versions| {
versions
.iter()
.map(|version| Version {
version: version.version as i64,
timestamp: version.timestamp.timestamp_micros(),
metadata: version
.metadata
.iter()
.map(|(k, v)| (k.clone(), v.clone()))
.collect(),
})
.collect()
})
.default_error()
}
#[napi(catch_unwind)]
pub async fn restore(&self) -> napi::Result<()> {
self.inner_ref()?.restore().await.default_error()
@@ -490,10 +466,3 @@ impl From<lancedb::index::IndexStatistics> for IndexStatistics {
}
}
}
#[napi(object)]
pub struct Version {
pub version: i64,
pub timestamp: i64,
pub metadata: HashMap<String, String>,
}

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.16.1-beta.0"
current_version = "0.16.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.16.1-beta.0"
version = "0.16.0"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true
@@ -17,17 +17,11 @@ crate-type = ["cdylib"]
arrow = { version = "52.1", features = ["pyarrow"] }
lancedb = { path = "../rust/lancedb", default-features = false }
env_logger.workspace = true
pyo3 = { version = "0.21", features = [
"extension-module",
"abi3-py39",
"gil-refs"
] }
pyo3 = { version = "0.21", features = ["extension-module", "abi3-py38", "gil-refs"] }
# Using this fork for now: https://github.com/awestlake87/pyo3-asyncio/issues/119
# pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
pyo3-asyncio-0-21 = { version = "0.21.0", features = [
"attributes",
"tokio-runtime"
] }
pyo3-asyncio-0-21 = { version = "0.21.0", features = ["attributes", "tokio-runtime"] }
pin-project = "1.1.5"
futures.workspace = true
tokio = { version = "1.36.0", features = ["sync"] }
@@ -35,13 +29,14 @@ tokio = { version = "1.36.0", features = ["sync"] }
[build-dependencies]
pyo3-build-config = { version = "0.20.3", features = [
"extension-module",
"abi3-py39",
"abi3-py38",
] }
[features]
default = ["default-tls", "remote"]
fp16kernels = ["lancedb/fp16kernels"]
remote = ["lancedb/remote"]
# TLS
default-tls = ["lancedb/default-tls"]
native-tls = ["lancedb/native-tls"]

View File

@@ -4,7 +4,7 @@ name = "lancedb"
dependencies = [
"deprecation",
"nest-asyncio~=1.0",
"pylance==0.20.0b2",
"pylance==0.19.3b1",
"tqdm>=4.27.0",
"pydantic>=1.10",
"packaging",
@@ -31,6 +31,7 @@ classifiers = [
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",

View File

@@ -83,33 +83,25 @@ class OpenAIEmbeddings(TextEmbeddingFunction):
"""
openai = attempt_import_or_raise("openai")
valid_texts = []
valid_indices = []
for idx, text in enumerate(texts):
if text:
valid_texts.append(text)
valid_indices.append(idx)
# TODO retry, rate limit, token limit
try:
kwargs = {
"input": valid_texts,
"model": self.name,
}
if self.name != "text-embedding-ada-002":
kwargs["dimensions"] = self.dim
rs = self._openai_client.embeddings.create(**kwargs)
valid_embeddings = {
idx: v.embedding for v, idx in zip(rs.data, valid_indices)
}
if self.name == "text-embedding-ada-002":
rs = self._openai_client.embeddings.create(input=texts, model=self.name)
else:
kwargs = {
"input": texts,
"model": self.name,
}
if self.dim:
kwargs["dimensions"] = self.dim
rs = self._openai_client.embeddings.create(**kwargs)
except openai.BadRequestError:
logging.exception("Bad request: %s", texts)
return [None] * len(texts)
except Exception:
logging.exception("OpenAI embeddings error")
raise
return [valid_embeddings.get(idx, None) for idx in range(len(texts))]
return [v.embedding for v in rs.data]
@cached_property
def _openai_client(self):
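For context, this embedding function is typically wired up through the embedding registry; the sketch below mirrors the `test_openai_with_empty_strs` test removed later in this diff (the path and table name are illustrative):

```python
import lancedb
import pandas as pd
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

model = get_registry().get("openai").create(max_retries=0)

class TextModel(LanceModel):
    text: str = model.SourceField()
    vector: Vector(model.ndims()) = model.VectorField()

db = lancedb.connect("/tmp/lancedb")  # illustrative path
tbl = db.create_table("test", schema=TextModel, mode="overwrite")
# "skip" tolerates rows the embedding function could not encode
tbl.add(pd.DataFrame({"text": ["hello world", ""]}), on_bad_vectors="skip")
```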

View File

@@ -1,5 +1,15 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pydantic (v1 / v2) adapter for LanceDB"""
@@ -20,7 +30,6 @@ from typing import (
Type,
Union,
_GenericAlias,
GenericAlias,
)
import numpy as np
@@ -66,7 +75,7 @@ def vector(dim: int, value_type: pa.DataType = pa.float32()):
def Vector(
dim: int, value_type: pa.DataType = pa.float32(), nullable: bool = True
dim: int, value_type: pa.DataType = pa.float32()
) -> Type[FixedSizeListMixin]:
"""Pydantic Vector Type.
@@ -79,8 +88,6 @@ def Vector(
The dimension of the vector.
value_type : pyarrow.DataType, optional
The value type of the vector, by default pa.float32()
nullable : bool, optional
Whether the vector is nullable, by default it is True.
Examples
--------
@@ -96,7 +103,7 @@ def Vector(
>>> assert schema == pa.schema([
... pa.field("id", pa.int64(), False),
... pa.field("url", pa.utf8(), False),
... pa.field("embeddings", pa.list_(pa.float32(), 768))
... pa.field("embeddings", pa.list_(pa.float32(), 768), False)
... ])
"""
@@ -105,10 +112,6 @@ def Vector(
def __repr__(self):
return f"FixedSizeList(dim={dim})"
@staticmethod
def nullable() -> bool:
return nullable
@staticmethod
def dim() -> int:
return dim
@@ -202,7 +205,9 @@ else:
def _pydantic_to_arrow_type(field: FieldInfo) -> pa.DataType:
"""Convert a Pydantic FieldInfo to Arrow DataType"""
if isinstance(field.annotation, (_GenericAlias, GenericAlias)):
if isinstance(field.annotation, _GenericAlias) or (
sys.version_info > (3, 9) and isinstance(field.annotation, types.GenericAlias)
):
origin = field.annotation.__origin__
args = field.annotation.__args__
if origin is list:
@@ -230,7 +235,7 @@ def _pydantic_to_arrow_type(field: FieldInfo) -> pa.DataType:
def is_nullable(field: FieldInfo) -> bool:
"""Check if a Pydantic FieldInfo is nullable."""
if isinstance(field.annotation, (_GenericAlias, GenericAlias)):
if isinstance(field.annotation, _GenericAlias):
origin = field.annotation.__origin__
args = field.annotation.__args__
if origin == Union:
@@ -241,10 +246,6 @@ def is_nullable(field: FieldInfo) -> bool:
for typ in args:
if typ is type(None):
return True
elif inspect.isclass(field.annotation) and issubclass(
field.annotation, FixedSizeListMixin
):
return field.annotation.nullable()
return False

View File

@@ -370,13 +370,11 @@ class LanceQueryBuilder(ABC):
----------
limit: int
The maximum number of results to return.
The default query limit is 10 results.
For ANN/KNN queries, you must specify a limit.
Passing 0, a negative number, or None will reset
the limit to the default value of 10.
*WARNING* if you have a large dataset, setting
the limit to a large number, e.g. the table size,
can potentially result in reading a
By default the query is limited to the first 10.
Call this method and pass 0, a negative value,
or None to remove the limit.
*WARNING* if you have a large dataset, removing
the limit can potentially result in reading a
large amount of data into memory and cause
out of memory issues.
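A short illustration of these limit semantics (the query vector is illustrative):

```python
tbl.search([1.0, 2.0, 3.0]).to_list()           # default: at most 10 rows
tbl.search([1.0, 2.0, 3.0]).limit(5).to_list()  # explicit cap of 5 rows
```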

View File

@@ -78,10 +78,6 @@ class RemoteTable(Table):
self.schema.metadata
)
def list_versions(self):
"""List all versions of the table"""
return self._loop.run_until_complete(self._table.list_versions())
def to_arrow(self) -> pa.Table:
"""to_arrow() is not yet supported on LanceDB cloud."""
raise NotImplementedError("to_arrow() is not yet supported on LanceDB cloud.")

View File

@@ -41,7 +41,7 @@ class CohereReranker(Reranker):
def __init__(
self,
model_name: str = "rerank-english-v3.0",
model_name: str = "rerank-english-v2.0",
column: str = "text",
top_n: Union[int, None] = None,
return_score="relevance",

View File

@@ -8,7 +8,7 @@ import inspect
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime, timedelta
from datetime import timedelta
from functools import cached_property
from typing import (
TYPE_CHECKING,
@@ -1015,36 +1015,15 @@ class Table(ABC):
@abstractmethod
def checkout(self):
"""
Checks out a specific version of the Table
Any read operation on the table will now access the data at the checked out
version. As a consequence, calling this method will disable any read consistency
interval that was previously set.
This is a read-only operation that turns the table into a sort of "view"
or "detached head". Other table instances will not be affected. To make the
change permanent you can use the `restore` method.
Any operation that modifies the table will fail while the table is in a checked
out state.
To return the table to a normal state use `checkout_latest`.
TODO comments
"""
@abstractmethod
def checkout_latest(self):
"""
Ensures the table is pointing at the latest version
This can be used to manually update a table when the read_consistency_interval
is None
It can also be used to undo a `checkout` operation.
TODO comments
"""
@abstractmethod
def list_versions(self):
"""List all versions of the table"""
@cached_property
def _dataset_uri(self) -> str:
return _table_uri(self._conn.uri, self.name)
@@ -2935,19 +2914,6 @@ class AsyncTable:
"""
return await self._inner.version()
async def list_versions(self):
"""
List all versions of the table
"""
versions = await self._inner.list_versions()
for v in versions:
ts_nanos = v["timestamp"]
v["timestamp"] = datetime.fromtimestamp(ts_nanos // 1e9) + timedelta(
microseconds=(ts_nanos % 1e9) // 1e3
)
return versions
async def checkout(self, version):
"""
Checks out a specific version of the Table

View File

@@ -11,8 +11,6 @@ from datetime import date, datetime
from functools import singledispatch
from typing import Tuple, Union, Optional, Any
from urllib.parse import urlparse
from threading import Lock
from contextlib import contextmanager
import numpy as np
import pyarrow as pa
@@ -316,27 +314,3 @@ def deprecated(func):
def validate_table_name(name: str):
"""Verify the table name is valid."""
native_validate_table_name(name)
class ConnectionPool:
def __init__(self, connection_factory, *, max_size: Optional[int] = None):
self.max_size = max_size
self._connection_factory = connection_factory
self._pool = []
self._lock = Lock()
@contextmanager
def connection(self):
with self._lock:
if self._pool:
conn = self._pool.pop()
else:
conn = self._connection_factory()
# release the lock before yielding
try:
yield conn
finally:
with self._lock:
if self.max_size is None or len(self._pool) < self.max_size:
self._pool.append(conn)

View File

@@ -90,13 +90,10 @@ def test_embedding_with_bad_results(tmp_path):
self, texts: Union[List[str], np.ndarray]
) -> list[Union[np.array, None]]:
# Return None, which is bad if field is non-nullable
a = [
np.full(self.ndims(), np.nan)
if i % 2 == 0
else np.random.randn(self.ndims())
return [
None if i % 2 == 0 else np.random.randn(self.ndims())
for i in range(len(texts))
]
return a
db = lancedb.connect(tmp_path)
registry = EmbeddingFunctionRegistry.get_instance()

View File

@@ -1,6 +1,15 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
# Copyright (c) 2023. LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
import io
import os
@@ -8,7 +17,6 @@ import os
import lancedb
import numpy as np
import pandas as pd
import pyarrow as pa
import pytest
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
@@ -436,30 +444,6 @@ def test_watsonx_embedding(tmp_path):
assert tbl.search("hello").limit(1).to_pandas()["text"][0] == "hello world"
@pytest.mark.slow
@pytest.mark.skipif(
os.environ.get("OPENAI_API_KEY") is None, reason="OPENAI_API_KEY not set"
)
def test_openai_with_empty_strs(tmp_path):
model = get_registry().get("openai").create(max_retries=0)
class TextModel(LanceModel):
text: str = model.SourceField()
vector: Vector(model.ndims()) = model.VectorField()
df = pd.DataFrame({"text": ["hello world", ""]})
db = lancedb.connect(tmp_path)
tbl = db.create_table("test", schema=TextModel, mode="overwrite")
tbl.add(df, on_bad_vectors="skip")
tb = tbl.to_arrow()
assert tb.schema.field_by_name("vector").type == pa.list_(
pa.float32(), model.ndims()
)
assert len(tb) == 2
assert tb["vector"].is_null().to_pylist() == [False, True]
@pytest.mark.slow
@pytest.mark.skipif(
importlib.util.find_spec("ollama") is None, reason="Ollama not installed"

View File

@@ -1,5 +1,16 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import sys
@@ -161,26 +172,6 @@ def test_pydantic_to_arrow_py38():
assert schema == expect_schema
def test_nullable_vector():
class NullableModel(pydantic.BaseModel):
vec: Vector(16, nullable=False)
schema = pydantic_to_schema(NullableModel)
assert schema == pa.schema([pa.field("vec", pa.list_(pa.float32(), 16), False)])
class DefaultModel(pydantic.BaseModel):
vec: Vector(16)
schema = pydantic_to_schema(DefaultModel)
assert schema == pa.schema([pa.field("vec", pa.list_(pa.float32(), 16), True)])
class NotNullableModel(pydantic.BaseModel):
vec: Vector(16)
schema = pydantic_to_schema(NotNullableModel)
assert schema == pa.schema([pa.field("vec", pa.list_(pa.float32(), 16), True)])
def test_fixed_size_list_field():
class TestModel(pydantic.BaseModel):
vec: Vector(16)
@@ -201,7 +192,7 @@ def test_fixed_size_list_field():
schema = pydantic_to_schema(TestModel)
assert schema == pa.schema(
[
pa.field("vec", pa.list_(pa.float32(), 16)),
pa.field("vec", pa.list_(pa.float32(), 16), False),
pa.field("li", pa.list_(pa.int64()), False),
]
)

View File

@@ -6,16 +6,13 @@ from datetime import timedelta
import http.server
import json
import threading
from concurrent.futures import ThreadPoolExecutor
from unittest.mock import MagicMock
import uuid
import lancedb
from lancedb.conftest import MockTextEmbeddingFunction
from lancedb.remote import ClientConfig
from lancedb.util import ConnectionPool
from lancedb.remote.errors import HttpError, RetryError
import lancedb.util
import pytest
import pyarrow as pa
@@ -58,34 +55,6 @@ def mock_lancedb_connection(handler):
handle.join()
@contextlib.contextmanager
def mock_lancedb_connection_pool(handler):
with http.server.HTTPServer(
("localhost", 8080), make_mock_http_handler(handler)
) as server:
handle = threading.Thread(target=server.serve_forever)
handle.start()
def conn_factory():
return lancedb.connect(
"db://dev",
api_key="fake",
host_override="http://localhost:8080",
client_config={
"retry_config": {"retries": 2},
"timeout_config": {
"connect_timeout": 1,
},
},
)
try:
yield ConnectionPool(conn_factory)
finally:
server.shutdown()
handle.join()
@contextlib.asynccontextmanager
async def mock_lancedb_connection_async(handler):
with http.server.HTTPServer(
@@ -134,47 +103,6 @@ async def test_async_remote_db():
assert table_names == []
@pytest.mark.asyncio
async def test_async_checkout():
def handler(request):
if request.path == "/v1/table/test/describe/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
response = json.dumps({"version": 42, "schema": {"fields": []}})
request.wfile.write(response.encode())
return
content_len = int(request.headers.get("Content-Length"))
body = request.rfile.read(content_len)
body = json.loads(body)
print("body is", body)
count = 0
if body["version"] == 1:
count = 100
elif body["version"] == 2:
count = 200
elif body["version"] is None:
count = 300
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
request.wfile.write(json.dumps(count).encode())
async with mock_lancedb_connection_async(handler) as db:
table = await db.open_table("test")
assert await table.count_rows() == 300
await table.checkout(1)
assert await table.count_rows() == 100
await table.checkout(2)
assert await table.count_rows() == 200
await table.checkout_latest()
assert await table.count_rows() == 300
@pytest.mark.asyncio
async def test_http_error():
request_id_holder = {"request_id": None}
@@ -218,7 +146,8 @@ async def test_retry_error():
assert cause.status_code == 429
def http_handler(query_handler):
@contextlib.contextmanager
def query_test_table(query_handler):
def handler(request):
if request.path == "/v1/table/test/describe/":
request.send_response(200)
@@ -242,12 +171,7 @@ def http_handler(query_handler):
request.send_response(404)
request.end_headers()
return handler
@contextlib.contextmanager
def query_test_table(connection_ctx_mgr):
with connection_ctx_mgr as db:
with mock_lancedb_connection(handler) as db:
assert repr(db) == "RemoteConnect(name=dev)"
table = db.open_table("test")
assert repr(table) == "RemoteTable(dev.test)"
@@ -255,7 +179,6 @@ def query_test_table(connection_ctx_mgr):
def test_query_sync_minimal():
@http_handler
def handler(body):
assert body == {
"distance_type": "l2",
@@ -265,75 +188,28 @@ def test_query_sync_minimal():
"ef": None,
"vector": [1.0, 2.0, 3.0],
"nprobes": 20,
"version": None,
}
return pa.table({"id": [1, 2, 3]})
with query_test_table(mock_lancedb_connection(handler)) as table:
with query_test_table(handler) as table:
data = table.search([1, 2, 3]).to_list()
expected = [{"id": 1}, {"id": 2}, {"id": 3}]
assert data == expected
with query_test_table(mock_lancedb_connection_pool(handler).connection()) as table:
data = table.search([1, 2, 3]).to_list()
expected = [{"id": 1}, {"id": 2}, {"id": 3}]
assert data == expected
def test_query_sync_minimal_threaded():
num_query = 0
@http_handler
def handler(body):
assert body == {
"distance_type": "l2",
"k": 10,
"prefilter": False,
"refine_factor": None,
"ef": None,
"vector": [1.0, 2.0, 3.0],
"nprobes": 20,
"version": None,
}
nonlocal num_query
num_query += 1
return pa.table({"id": [1, 2, 3]})
pool = mock_lancedb_connection_pool(handler)
def _query(i):
with query_test_table(pool.connection()) as table:
data = table.search([1, 2, 3]).to_list()
expected = [{"id": 1}, {"id": 2}, {"id": 3}]
assert data == expected
with ThreadPoolExecutor() as executor:
executor.map(_query, range(1000))
assert num_query == 1000
def test_query_sync_empty_query():
@http_handler
def handler(body):
assert body == {
"k": 10,
"filter": "true",
"vector": [],
"columns": ["id"],
"version": None,
}
return pa.table({"id": [1, 2, 3]})
with query_test_table(mock_lancedb_connection(handler)) as table:
data = table.search(None).where("true").select(["id"]).limit(10).to_list()
expected = [{"id": 1}, {"id": 2}, {"id": 3}]
assert data == expected
with query_test_table(mock_lancedb_connection_pool(handler).connection()) as table:
with query_test_table(handler) as table:
data = table.search(None).where("true").select(["id"]).limit(10).to_list()
expected = [{"id": 1}, {"id": 2}, {"id": 3}]
assert data == expected
@@ -354,7 +230,6 @@ def test_query_sync_maximal():
"vector_column": "vector2",
"fast_search": True,
"with_row_id": True,
"version": None,
}
return pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})
@@ -393,7 +268,6 @@ def test_query_sync_fts():
},
"k": 10,
"vector": [],
"version": None,
}
return pa.table({"id": [1, 2, 3]})
@@ -410,7 +284,6 @@ def test_query_sync_fts():
"k": 42,
"vector": [],
"with_row_id": True,
"version": None,
}
return pa.table({"id": [1, 2, 3]})
@@ -436,7 +309,6 @@ def test_query_sync_hybrid():
"k": 42,
"vector": [],
"with_row_id": True,
"version": None,
}
return pa.table({"_rowid": [1, 2, 3], "_score": [0.1, 0.2, 0.3]})
else:
@@ -450,7 +322,6 @@ def test_query_sync_hybrid():
"nprobes": 20,
"ef": None,
"with_row_id": True,
"version": None,
}
return pa.table({"_rowid": [1, 2, 3], "_distance": [0.1, 0.2, 0.3]})

View File

@@ -8,7 +8,7 @@ use lancedb::table::{
use pyo3::{
exceptions::{PyRuntimeError, PyValueError},
pyclass, pymethods,
types::{IntoPyDict, PyDict, PyDictMethods, PyString},
types::{PyDict, PyDictMethods, PyString},
Bound, FromPyObject, PyAny, PyRef, PyResult, Python, ToPyObject,
};
use pyo3_asyncio_0_21::tokio::future_into_py;
@@ -246,33 +246,6 @@ impl Table {
)
}
pub fn list_versions(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
let inner = self_.inner_ref()?.clone();
future_into_py(self_.py(), async move {
let versions = inner.list_versions().await.infer_error()?;
let versions_as_dict = Python::with_gil(|py| {
versions
.iter()
.map(|v| {
let dict = PyDict::new_bound(py);
dict.set_item("version", v.version).unwrap();
dict.set_item(
"timestamp",
v.timestamp.timestamp_nanos_opt().unwrap_or_default(),
)
.unwrap();
let tup: Vec<(&String, &String)> = v.metadata.iter().collect();
dict.set_item("metadata", tup.into_py_dict(py)).unwrap();
dict.to_object(py)
})
.collect::<Vec<_>>()
});
Ok(versions_as_dict)
})
}
pub fn checkout(self_: PyRef<'_, Self>, version: u64) -> PyResult<Bound<'_, PyAny>> {
let inner = self_.inner_ref()?.clone();
future_into_py(self_.py(), async move {

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-node"
version = "0.13.1-beta.0"
version = "0.13.0"
description = "Serverless, low-latency vector database for AI applications"
license.workspace = true
edition.workspace = true

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb"
version = "0.13.1-beta.0"
version = "0.13.0"
edition.workspace = true
description = "LanceDB: A serverless, low-latency vector database for AI applications"
license.workspace = true

View File

@@ -19,7 +19,7 @@ use http::header::CONTENT_TYPE;
use http::StatusCode;
use lance::arrow::json::JsonSchema;
use lance::dataset::scanner::DatasetRecordBatchStream;
use lance::dataset::{ColumnAlteration, NewColumnTransform, Version};
use lance::dataset::{ColumnAlteration, NewColumnTransform};
use lance_datafusion::exec::OneShotExec;
use serde::{Deserialize, Serialize};
use tokio::sync::RwLock;
@@ -363,34 +363,6 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
message: "restore is not supported on LanceDB cloud.".into(),
})
}
async fn list_versions(&self) -> Result<Vec<Version>> {
let request = self
.client
.post(&format!("/v1/table/{}/version/list/", self.name));
let (request_id, response) = self.client.send(request, true).await?;
let response = self.check_table_response(&request_id, response).await?;
#[derive(Deserialize)]
struct ListVersionsResponse {
versions: Vec<Version>,
}
let body = response.text().await.err_to_http(request_id.clone())?;
let body: ListVersionsResponse =
serde_json::from_str(&body).map_err(|err| Error::Http {
source: format!(
"Failed to parse list_versions response: {}, body: {}",
err, body
)
.into(),
request_id,
status_code: None,
})?;
Ok(body.versions)
}
async fn schema(&self) -> Result<SchemaRef> {
let schema = self.describe().await?.schema;
Ok(Arc::new(schema.try_into()?))
@@ -803,7 +775,6 @@ mod tests {
use arrow::{array::AsArray, compute::concat_batches, datatypes::Int32Type};
use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator};
use arrow_schema::{DataType, Field, Schema};
use chrono::{DateTime, Utc};
use futures::{future::BoxFuture, StreamExt, TryFutureExt};
use lance_index::scalar::FullTextSearchQuery;
use reqwest::Body;
@@ -1518,51 +1489,6 @@ mod tests {
assert_eq!(indices, expected);
}
#[tokio::test]
async fn test_list_versions() {
let table = Table::new_with_handler("my_table", |request| {
assert_eq!(request.method(), "POST");
assert_eq!(request.url().path(), "/v1/table/my_table/version/list/");
let version1 = lance::dataset::Version {
version: 1,
timestamp: "2024-01-01T00:00:00Z".parse().unwrap(),
metadata: Default::default(),
};
let version2 = lance::dataset::Version {
version: 2,
timestamp: "2024-02-01T00:00:00Z".parse().unwrap(),
metadata: Default::default(),
};
let response_body = serde_json::json!({
"versions": [
version1,
version2,
]
});
let response_body = serde_json::to_string(&response_body).unwrap();
http::Response::builder()
.status(200)
.body(response_body)
.unwrap()
});
let versions = table.list_versions().await.unwrap();
assert_eq!(versions.len(), 2);
assert_eq!(versions[0].version, 1);
assert_eq!(
versions[0].timestamp,
"2024-01-01T00:00:00Z".parse::<DateTime<Utc>>().unwrap()
);
assert_eq!(versions[1].version, 2);
assert_eq!(
versions[1].timestamp,
"2024-02-01T00:00:00Z".parse::<DateTime<Utc>>().unwrap()
);
// assert_eq!(versions, expected);
}
#[tokio::test]
async fn test_index_stats() {
let table = Table::new_with_handler("my_table", |request| {

View File

@@ -37,7 +37,7 @@ pub use lance::dataset::ColumnAlteration;
pub use lance::dataset::NewColumnTransform;
pub use lance::dataset::ReadParams;
use lance::dataset::{
Dataset, UpdateBuilder as LanceUpdateBuilder, Version, WhenMatched, WriteMode, WriteParams,
Dataset, UpdateBuilder as LanceUpdateBuilder, WhenMatched, WriteMode, WriteParams,
};
use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatchedBySource};
use lance::io::WrappingObjectStore;
@@ -426,7 +426,6 @@ pub(crate) trait TableInternal: std::fmt::Display + std::fmt::Debug + Send + Syn
async fn checkout(&self, version: u64) -> Result<()>;
async fn checkout_latest(&self) -> Result<()>;
async fn restore(&self) -> Result<()>;
async fn list_versions(&self) -> Result<Vec<Version>>;
async fn table_definition(&self) -> Result<TableDefinition>;
fn dataset_uri(&self) -> &str;
}
@@ -956,11 +955,6 @@ impl Table {
self.inner.restore().await
}
/// List all the versions of the table
pub async fn list_versions(&self) -> Result<Vec<Version>> {
self.inner.list_versions().await
}
/// List all indices that have been created with [`Self::create_index`]
pub async fn list_indices(&self) -> Result<Vec<IndexConfig>> {
self.inner.list_indices().await
@@ -1325,7 +1319,7 @@ impl NativeTable {
let (indices, mf) = futures::try_join!(dataset.load_indices(), dataset.latest_manifest())?;
Ok(indices
.iter()
.map(|i| VectorIndex::new_from_format(&(mf.0), i))
.map(|i| VectorIndex::new_from_format(&mf, i))
.collect())
}
@@ -1713,10 +1707,6 @@ impl TableInternal for NativeTable {
self.dataset.reload().await
}
async fn list_versions(&self) -> Result<Vec<Version>> {
Ok(self.dataset.get().await?.versions().await?)
}
async fn restore(&self) -> Result<()> {
let version =
self.dataset