mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-26 22:59:57 +00:00
Compare commits
5 Commits
rmeng/pool
...
remote-ver
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b7fed59278 | ||
|
|
60ad82b6ad | ||
|
|
134258308c | ||
|
|
d36334d565 | ||
|
|
131c01d702 |
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.13.1-beta.0"
|
||||
current_version = "0.13.0"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
@@ -87,16 +87,6 @@ glob = "node/package.json"
|
||||
replace = "\"@lancedb/vectordb-linux-x64-gnu\": \"{new_version}\""
|
||||
search = "\"@lancedb/vectordb-linux-x64-gnu\": \"{current_version}\""
|
||||
|
||||
[[tool.bumpversion.files]]
|
||||
glob = "node/package.json"
|
||||
replace = "\"@lancedb/vectordb-linux-arm64-musl\": \"{new_version}\""
|
||||
search = "\"@lancedb/vectordb-linux-arm64-musl\": \"{current_version}\""
|
||||
|
||||
[[tool.bumpversion.files]]
|
||||
glob = "node/package.json"
|
||||
replace = "\"@lancedb/vectordb-linux-x64-musl\": \"{new_version}\""
|
||||
search = "\"@lancedb/vectordb-linux-x64-musl\": \"{current_version}\""
|
||||
|
||||
[[tool.bumpversion.files]]
|
||||
glob = "node/package.json"
|
||||
replace = "\"@lancedb/vectordb-win32-x64-msvc\": \"{new_version}\""
|
||||
|
||||
@@ -31,9 +31,6 @@ rustflags = [
|
||||
[target.x86_64-unknown-linux-gnu]
|
||||
rustflags = ["-C", "target-cpu=haswell", "-C", "target-feature=+avx2,+fma,+f16c"]
|
||||
|
||||
[target.x86_64-unknown-linux-musl]
|
||||
rustflags = ["-C", "target-cpu=haswell", "-C", "target-feature=-crt-static,+avx2,+fma,+f16c"]
|
||||
|
||||
[target.aarch64-apple-darwin]
|
||||
rustflags = ["-C", "target-cpu=apple-m1", "-C", "target-feature=+neon,+fp16,+fhm,+dotprod"]
|
||||
|
||||
|
||||
120
.github/workflows/npm-publish.yml
vendored
120
.github/workflows/npm-publish.yml
vendored
@@ -101,7 +101,7 @@ jobs:
|
||||
path: |
|
||||
nodejs/dist/*.node
|
||||
|
||||
node-linux-gnu:
|
||||
node-linux:
|
||||
name: vectordb (${{ matrix.config.arch}}-unknown-linux-gnu)
|
||||
runs-on: ${{ matrix.config.runner }}
|
||||
# Only runs on tags that matches the make-release action
|
||||
@@ -137,63 +137,11 @@ jobs:
|
||||
- name: Upload Linux Artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: node-native-linux-${{ matrix.config.arch }}-gnu
|
||||
name: node-native-linux-${{ matrix.config.arch }}
|
||||
path: |
|
||||
node/dist/lancedb-vectordb-linux*.tgz
|
||||
|
||||
node-linux-musl:
|
||||
name: vectordb (${{ matrix.config.arch}}-unknown-linux-musl)
|
||||
runs-on: ubuntu-latest
|
||||
container: alpine:edge
|
||||
# Only runs on tags that matches the make-release action
|
||||
if: startsWith(github.ref, 'refs/tags/v')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config:
|
||||
- arch: x86_64
|
||||
- arch: aarch64
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Install common dependencies
|
||||
run: |
|
||||
apk add protobuf-dev curl clang mold grep npm bash
|
||||
curl --proto '=https' --tlsv1.3 -sSf https://raw.githubusercontent.com/rust-lang/rustup/refs/heads/master/rustup-init.sh | sh -s -- -y --default-toolchain 1.80.0
|
||||
echo "source $HOME/.cargo/env" >> saved_env
|
||||
echo "export CC=clang" >> saved_env
|
||||
echo "export RUSTFLAGS='-Ctarget-cpu=haswell -Ctarget-feature=-crt-static,+avx2,+fma,+f16c -Clinker=clang -Clink-arg=-fuse-ld=mold'" >> saved_env
|
||||
- name: Configure aarch64 build
|
||||
if: ${{ matrix.config.arch == 'aarch64' }}
|
||||
run: |
|
||||
source "$HOME/.cargo/env"
|
||||
rustup target add aarch64-unknown-linux-musl --toolchain 1.80.0
|
||||
crt=$(realpath $(dirname $(rustup which rustc))/../lib/rustlib/aarch64-unknown-linux-musl/lib/self-contained)
|
||||
sysroot_lib=/usr/aarch64-unknown-linux-musl/usr/lib
|
||||
apk_url=https://dl-cdn.alpinelinux.org/alpine/latest-stable/main/aarch64/
|
||||
curl -sSf $apk_url > apk_list
|
||||
for pkg in gcc libgcc musl; do curl -sSf $apk_url$(cat apk_list | grep -oP '(?<=")'$pkg'-\d.*?(?=")') | tar zxf -; done
|
||||
mkdir -p $sysroot_lib
|
||||
echo 'GROUP ( libgcc_s.so.1 -lgcc )' > $sysroot_lib/libgcc_s.so
|
||||
cp usr/lib/libgcc_s.so.1 $sysroot_lib
|
||||
cp usr/lib/gcc/aarch64-alpine-linux-musl/*/libgcc.a $sysroot_lib
|
||||
cp lib/ld-musl-aarch64.so.1 $sysroot_lib/libc.so
|
||||
echo '!<arch>' > $sysroot_lib/libdl.a
|
||||
(cd $crt && cp crti.o crtbeginS.o crtendS.o crtn.o -t $sysroot_lib)
|
||||
echo "export CARGO_BUILD_TARGET=aarch64-unknown-linux-musl" >> saved_env
|
||||
echo "export RUSTFLAGS='-Ctarget-cpu=apple-m1 -Ctarget-feature=-crt-static,+neon,+fp16,+fhm,+dotprod -Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=--target=aarch64-unknown-linux-musl -Clink-arg=--sysroot=/usr/aarch64-unknown-linux-musl -Clink-arg=-lc'" >> saved_env
|
||||
- name: Build Linux Artifacts
|
||||
run: |
|
||||
source ./saved_env
|
||||
bash ci/manylinux_node/build_vectordb.sh ${{ matrix.config.arch }}
|
||||
- name: Upload Linux Artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: node-native-linux-${{ matrix.config.arch }}-musl
|
||||
path: |
|
||||
node/dist/lancedb-vectordb-linux*.tgz
|
||||
|
||||
nodejs-linux-gnu:
|
||||
nodejs-linux:
|
||||
name: lancedb (${{ matrix.config.arch}}-unknown-linux-gnu
|
||||
runs-on: ${{ matrix.config.runner }}
|
||||
# Only runs on tags that matches the make-release action
|
||||
@@ -230,7 +178,7 @@ jobs:
|
||||
- name: Upload Linux Artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: nodejs-native-linux-${{ matrix.config.arch }}-gnu
|
||||
name: nodejs-native-linux-${{ matrix.config.arch }}
|
||||
path: |
|
||||
nodejs/dist/*.node
|
||||
# The generic files are the same in all distros so we just pick
|
||||
@@ -244,62 +192,6 @@ jobs:
|
||||
nodejs/dist/*
|
||||
!nodejs/dist/*.node
|
||||
|
||||
nodejs-linux-musl:
|
||||
name: lancedb (${{ matrix.config.arch}}-unknown-linux-musl
|
||||
runs-on: ubuntu-latest
|
||||
container: alpine:edge
|
||||
# Only runs on tags that matches the make-release action
|
||||
if: startsWith(github.ref, 'refs/tags/v')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
config:
|
||||
- arch: x86_64
|
||||
- arch: aarch64
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Install common dependencies
|
||||
run: |
|
||||
apk add protobuf-dev curl clang mold grep npm bash openssl-dev openssl-libs-static
|
||||
curl --proto '=https' --tlsv1.3 -sSf https://raw.githubusercontent.com/rust-lang/rustup/refs/heads/master/rustup-init.sh | sh -s -- -y --default-toolchain 1.80.0
|
||||
echo "source $HOME/.cargo/env" >> saved_env
|
||||
echo "export CC=clang" >> saved_env
|
||||
echo "export RUSTFLAGS='-Ctarget-cpu=haswell -Ctarget-feature=-crt-static,+avx2,+fma,+f16c -Clinker=clang -Clink-arg=-fuse-ld=mold'" >> saved_env
|
||||
echo "export X86_64_UNKNOWN_LINUX_MUSL_OPENSSL_INCLUDE_DIR=/usr/include" >> saved_env
|
||||
echo "export X86_64_UNKNOWN_LINUX_MUSL_OPENSSL_LIB_DIR=/usr/lib" >> saved_env
|
||||
- name: Configure aarch64 build
|
||||
if: ${{ matrix.config.arch == 'aarch64' }}
|
||||
run: |
|
||||
source "$HOME/.cargo/env"
|
||||
rustup target add aarch64-unknown-linux-musl --toolchain 1.80.0
|
||||
crt=$(realpath $(dirname $(rustup which rustc))/../lib/rustlib/aarch64-unknown-linux-musl/lib/self-contained)
|
||||
sysroot_lib=/usr/aarch64-unknown-linux-musl/usr/lib
|
||||
apk_url=https://dl-cdn.alpinelinux.org/alpine/latest-stable/main/aarch64/
|
||||
curl -sSf $apk_url > apk_list
|
||||
for pkg in gcc libgcc musl openssl-dev openssl-libs-static; do curl -sSf $apk_url$(cat apk_list | grep -oP '(?<=")'$pkg'-\d.*?(?=")') | tar zxf -; done
|
||||
mkdir -p $sysroot_lib
|
||||
echo 'GROUP ( libgcc_s.so.1 -lgcc )' > $sysroot_lib/libgcc_s.so
|
||||
cp usr/lib/libgcc_s.so.1 $sysroot_lib
|
||||
cp usr/lib/gcc/aarch64-alpine-linux-musl/*/libgcc.a $sysroot_lib
|
||||
cp lib/ld-musl-aarch64.so.1 $sysroot_lib/libc.so
|
||||
echo '!<arch>' > $sysroot_lib/libdl.a
|
||||
(cd $crt && cp crti.o crtbeginS.o crtendS.o crtn.o -t $sysroot_lib)
|
||||
echo "export CARGO_BUILD_TARGET=aarch64-unknown-linux-musl" >> saved_env
|
||||
echo "export RUSTFLAGS='-Ctarget-feature=-crt-static,+neon,+fp16,+fhm,+dotprod -Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=--target=aarch64-unknown-linux-musl -Clink-arg=--sysroot=/usr/aarch64-unknown-linux-musl -Clink-arg=-lc'" >> saved_env
|
||||
echo "export AARCH64_UNKNOWN_LINUX_MUSL_OPENSSL_INCLUDE_DIR=$(realpath usr/include)" >> saved_env
|
||||
echo "export AARCH64_UNKNOWN_LINUX_MUSL_OPENSSL_LIB_DIR=$(realpath usr/lib)" >> saved_env
|
||||
- name: Build Linux Artifacts
|
||||
run: |
|
||||
source ./saved_env
|
||||
bash ci/manylinux_node/build_lancedb.sh ${{ matrix.config.arch }}
|
||||
- name: Upload Linux Artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: nodejs-native-linux-${{ matrix.config.arch }}-musl
|
||||
path: |
|
||||
nodejs/dist/*.node
|
||||
|
||||
node-windows:
|
||||
name: vectordb ${{ matrix.target }}
|
||||
runs-on: windows-2022
|
||||
@@ -568,7 +460,7 @@ jobs:
|
||||
|
||||
release:
|
||||
name: vectordb NPM Publish
|
||||
needs: [node, node-macos, node-linux-gnu, node-linux-musl, node-windows]
|
||||
needs: [node, node-macos, node-linux, node-windows]
|
||||
runs-on: ubuntu-latest
|
||||
# Only runs on tags that matches the make-release action
|
||||
if: startsWith(github.ref, 'refs/tags/v')
|
||||
@@ -608,7 +500,7 @@ jobs:
|
||||
|
||||
release-nodejs:
|
||||
name: lancedb NPM Publish
|
||||
needs: [nodejs-macos, nodejs-linux-gnu, nodejs-linux-musl, nodejs-windows]
|
||||
needs: [nodejs-macos, nodejs-linux, nodejs-windows]
|
||||
runs-on: ubuntu-latest
|
||||
# Only runs on tags that matches the make-release action
|
||||
if: startsWith(github.ref, 'refs/tags/v')
|
||||
|
||||
16
Cargo.toml
16
Cargo.toml
@@ -21,15 +21,15 @@ categories = ["database-implementations"]
|
||||
rust-version = "1.80.0" # TODO: lower this once we upgrade Lance again.
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=0.20.0", "features" = [
|
||||
lance = { "version" = "=0.19.3", "features" = [
|
||||
"dynamodb",
|
||||
], git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||
lance-index = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||
lance-linalg = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||
lance-table = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||
lance-testing = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||
lance-datafusion = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||
lance-encoding = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
|
||||
], git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
||||
lance-index = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
||||
lance-linalg = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
||||
lance-table = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
||||
lance-testing = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
||||
lance-datafusion = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
||||
lance-encoding = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "52.2", optional = false }
|
||||
arrow-array = "52.2"
|
||||
|
||||
@@ -11,8 +11,7 @@ fi
|
||||
export OPENSSL_STATIC=1
|
||||
export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
|
||||
|
||||
#Alpine doesn't have .bashrc
|
||||
FILE=$HOME/.bashrc && test -f $FILE && source $FILE
|
||||
source $HOME/.bashrc
|
||||
|
||||
cd nodejs
|
||||
npm ci
|
||||
|
||||
@@ -5,14 +5,13 @@ ARCH=${1:-x86_64}
|
||||
|
||||
if [ "$ARCH" = "x86_64" ]; then
|
||||
export OPENSSL_LIB_DIR=/usr/local/lib64/
|
||||
else
|
||||
else
|
||||
export OPENSSL_LIB_DIR=/usr/local/lib/
|
||||
fi
|
||||
export OPENSSL_STATIC=1
|
||||
export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
|
||||
|
||||
#Alpine doesn't have .bashrc
|
||||
FILE=$HOME/.bashrc && test -f $FILE && source $FILE
|
||||
source $HOME/.bashrc
|
||||
|
||||
cd node
|
||||
npm ci
|
||||
|
||||
@@ -138,7 +138,6 @@ nav:
|
||||
- Jina Reranker: reranking/jina.md
|
||||
- OpenAI Reranker: reranking/openai.md
|
||||
- AnswerDotAi Rerankers: reranking/answerdotai.md
|
||||
- Voyage AI Rerankers: reranking/voyageai.md
|
||||
- Building Custom Rerankers: reranking/custom_reranker.md
|
||||
- Example: notebooks/lancedb_reranking.ipynb
|
||||
- Filtering: sql.md
|
||||
@@ -166,7 +165,6 @@ nav:
|
||||
- Jina Embeddings: embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md
|
||||
- AWS Bedrock Text Embedding Functions: embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md
|
||||
- IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md
|
||||
- Voyage AI Embeddings: embeddings/available_embedding_models/text_embedding_functions/voyageai_embedding.md
|
||||
- Multimodal Embedding Functions:
|
||||
- OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
|
||||
- Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
|
||||
|
||||
@@ -277,15 +277,7 @@ Product quantization can lead to approximately `16 * sizeof(float32) / 1 = 64` t
|
||||
Higher number of partitions could lead to more efficient I/O during queries and better accuracy, but it takes much more time to train.
|
||||
On `SIFT-1M` dataset, our benchmark shows that keeping each partition 1K-4K rows lead to a good latency / recall.
|
||||
|
||||
`num_sub_vectors` specifies how many Product Quantization (PQ) short codes to generate on each vector. The number should be a factor of the vector dimension. Because
|
||||
`num_sub_vectors` specifies how many Product Quantization (PQ) short codes to generate on each vector. Because
|
||||
PQ is a lossy compression of the original vector, a higher `num_sub_vectors` usually results in
|
||||
less space distortion, and thus yields better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and more PQ computation, and thus, higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency.
|
||||
|
||||
!!! note
|
||||
if `num_sub_vectors` is set to be greater than the vector dimension, you will see errors like `attempt to divide by zero`
|
||||
|
||||
### How to choose `m` and `ef_construction` for `IVF_HNSW_*` index?
|
||||
|
||||
`m` determines the number of connections a new node establishes with its closest neighbors upon entering the graph. Typically, `m` falls within the range of 5 to 48. Lower `m` values are suitable for low-dimensional data or scenarios where recall is less critical. Conversely, higher `m` values are beneficial for high-dimensional data or when high recall is required. In essence, a larger `m` results in a denser graph with increased connectivity, but at the expense of higher memory consumption.
|
||||
|
||||
`ef_construction` balances build speed and accuracy. Higher values increase accuracy but slow down the build process. A typical range is 150 to 300. For good search results, a minimum value of 100 is recommended. In most cases, setting this value above 500 offers no additional benefit. Ensure that `ef_construction` is always set to a value equal to or greater than `ef` in the search phase
|
||||
less space distortion, and thus yields better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and
|
||||
more PQ computation, and thus, higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency.
|
||||
|
||||
@@ -57,13 +57,6 @@ Then the greedy search routine operates as follows:
|
||||
|
||||
## Usage
|
||||
|
||||
There are three key parameters to set when constructing an HNSW index:
|
||||
|
||||
* `metric`: Use an `L2` euclidean distance metric. We also support `dot` and `cosine` distance.
|
||||
* `m`: The number of neighbors to select for each vector in the HNSW graph.
|
||||
* `ef_construction`: The number of candidates to evaluate during the construction of the HNSW graph.
|
||||
|
||||
|
||||
We can combine the above concepts to understand how to build and query an HNSW index in LanceDB.
|
||||
|
||||
### Construct index
|
||||
|
||||
@@ -58,10 +58,8 @@ In Python, the index can be created as follows:
|
||||
# Make sure you have enough data in the table for an effective training step
|
||||
tbl.create_index(metric="L2", num_partitions=256, num_sub_vectors=96)
|
||||
```
|
||||
!!! note
|
||||
`num_partitions`=256 and `num_sub_vectors`=96 does not work for every dataset. Those values needs to be adjusted for your particular dataset.
|
||||
|
||||
The `num_partitions` is usually chosen to target a particular number of vectors per partition. `num_sub_vectors` is typically chosen based on the desired recall and the dimensionality of the vector. See [here](../ann_indexes.md/#how-to-choose-num_partitions-and-num_sub_vectors-for-ivf_pq-index) for best practices on choosing these parameters.
|
||||
The `num_partitions` is usually chosen to target a particular number of vectors per partition. `num_sub_vectors` is typically chosen based on the desired recall and the dimensionality of the vector. See the [FAQs](#faq) below for best practices on choosing these parameters.
|
||||
|
||||
|
||||
### Query the index
|
||||
|
||||
@@ -114,45 +114,12 @@ table.create_fts_index("text",
|
||||
|
||||
LanceDB full text search supports to filter the search results by a condition, both pre-filtering and post-filtering are supported.
|
||||
|
||||
This can be invoked via the familiar `where` syntax.
|
||||
|
||||
With pre-filtering:
|
||||
This can be invoked via the familiar `where` syntax:
|
||||
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
table.search("puppy").limit(10).where("meta='foo'", prefilte=True).to_list()
|
||||
```
|
||||
|
||||
=== "TypeScript"
|
||||
|
||||
```typescript
|
||||
await tbl
|
||||
.search("puppy")
|
||||
.select(["id", "doc"])
|
||||
.limit(10)
|
||||
.where("meta='foo'")
|
||||
.prefilter(true)
|
||||
.toArray();
|
||||
```
|
||||
|
||||
=== "Rust"
|
||||
|
||||
```rust
|
||||
table
|
||||
.query()
|
||||
.full_text_search(FullTextSearchQuery::new("puppy".to_owned()))
|
||||
.select(lancedb::query::Select::Columns(vec!["doc".to_owned()]))
|
||||
.limit(10)
|
||||
.only_if("meta='foo'")
|
||||
.execute()
|
||||
.await?;
|
||||
```
|
||||
|
||||
With post-filtering:
|
||||
=== "Python"
|
||||
|
||||
```python
|
||||
table.search("puppy").limit(10).where("meta='foo'", prefilte=False).to_list()
|
||||
table.search("puppy").limit(10).where("meta='foo'").to_list()
|
||||
```
|
||||
|
||||
=== "TypeScript"
|
||||
@@ -163,7 +130,6 @@ With post-filtering:
|
||||
.select(["id", "doc"])
|
||||
.limit(10)
|
||||
.where("meta='foo'")
|
||||
.prefilter(false)
|
||||
.toArray();
|
||||
```
|
||||
|
||||
@@ -174,7 +140,6 @@ With post-filtering:
|
||||
.query()
|
||||
.full_text_search(FullTextSearchQuery::new(words[0].to_owned()))
|
||||
.select(lancedb::query::Select::Columns(vec!["doc".to_owned()]))
|
||||
.postfilter()
|
||||
.limit(10)
|
||||
.only_if("meta='foo'")
|
||||
.execute()
|
||||
@@ -224,6 +189,3 @@ This can make the query more efficient, especially when the table is large and t
|
||||
tbl.add(more_data).execute().await?;
|
||||
tbl.optimize(OptimizeAction::All).execute().await?;
|
||||
```
|
||||
!!! note
|
||||
|
||||
New data added after creating the FTS index will appear in search results while incremental index is still progress, but with increased latency due to a flat search on the unindexed portion. LanceDB Cloud automates this merging process, minimizing the impact on search speed.
|
||||
@@ -153,7 +153,9 @@ table.create_fts_index(["title", "content"], use_tantivy=True, writer_heap_size=
|
||||
|
||||
## Current limitations
|
||||
|
||||
1. New data added after creating the FTS index will appear in search results, but with increased latency due to a flat search on the unindexed portion. Re-indexing with `create_fts_index` will reduce latency. LanceDB Cloud automates this merging process, minimizing the impact on search speed.
|
||||
1. Currently we do not yet support incremental writes.
|
||||
If you add data after FTS index creation, it won't be reflected
|
||||
in search results until you do a full reindex.
|
||||
|
||||
2. We currently only support local filesystem paths for the FTS index.
|
||||
This is a tantivy limitation. We've implemented an object store plugin
|
||||
|
||||
@@ -6,9 +6,6 @@ This re-ranker uses the [Cohere](https://cohere.ai/) API to rerank the search re
|
||||
!!! note
|
||||
Supported Query Types: Hybrid, Vector, FTS
|
||||
|
||||
```shell
|
||||
pip install cohere
|
||||
```
|
||||
|
||||
```python
|
||||
import numpy
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.13.1-beta.0</version>
|
||||
<version>0.13.0-final.0</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.13.1-beta.0</version>
|
||||
<version>0.13.0-final.0</version>
|
||||
<packaging>pom</packaging>
|
||||
|
||||
<name>LanceDB Parent</name>
|
||||
|
||||
78
node/package-lock.json
generated
78
node/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.13.1-beta.0",
|
||||
"version": "0.13.0",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "vectordb",
|
||||
"version": "0.13.1-beta.0",
|
||||
"version": "0.13.0",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
@@ -52,14 +52,12 @@
|
||||
"uuid": "^9.0.0"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-arm64": "0.13.1-beta.0",
|
||||
"@lancedb/vectordb-darwin-x64": "0.13.1-beta.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.13.1-beta.0",
|
||||
"@lancedb/vectordb-linux-arm64-musl": "0.13.1-beta.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.13.1-beta.0",
|
||||
"@lancedb/vectordb-linux-x64-musl": "0.13.1-beta.0",
|
||||
"@lancedb/vectordb-win32-arm64-msvc": "0.13.1-beta.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.13.1-beta.0"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.13.0",
|
||||
"@lancedb/vectordb-darwin-x64": "0.13.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.13.0",
|
||||
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.13.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@apache-arrow/ts": "^14.0.2",
|
||||
@@ -329,6 +327,66 @@
|
||||
"@jridgewell/sourcemap-codec": "^1.4.10"
|
||||
}
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-arm64": {
|
||||
"version": "0.13.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-arm64/-/vectordb-darwin-arm64-0.13.0.tgz",
|
||||
"integrity": "sha512-8hdcjkRmgrdQYf1jN+DyZae40LIv8UUfnWy70Uid5qy63sSvRW/+MvIdqIPFr9QlLUXmpyyQuX0y3bZhUR99cQ==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-darwin-x64": {
|
||||
"version": "0.13.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-darwin-x64/-/vectordb-darwin-x64-0.13.0.tgz",
|
||||
"integrity": "sha512-fWzAY4l5SQtNfMYh80v+M66ugZHhdxbkpk5mNEv6Zsug3DL6kRj3Uv31/i0wgzY6F5G3LUlbjZerN+eTnDLwOw==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-arm64-gnu": {
|
||||
"version": "0.13.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-arm64-gnu/-/vectordb-linux-arm64-gnu-0.13.0.tgz",
|
||||
"integrity": "sha512-ltwAT9baOSuR5YiGykQXPC8/HGYF13vpI47qxhP9yfgiz9pA8EUn8p8YrBRzq7J4DIZ4b8JSVDXQnMIqEtB4Kg==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-linux-x64-gnu": {
|
||||
"version": "0.13.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-linux-x64-gnu/-/vectordb-linux-x64-gnu-0.13.0.tgz",
|
||||
"integrity": "sha512-MiT/RBlMPGGRh7BX+MXwRuNiiUnKmuDcHH8nm88IH28T7TQxXIbA9w6UpSg5m9f3DgKQI2K8oLi29oKIB8ZwDQ==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"optional": true,
|
||||
"os": [
|
||||
"linux"
|
||||
]
|
||||
},
|
||||
"node_modules/@lancedb/vectordb-win32-x64-msvc": {
|
||||
"version": "0.13.0",
|
||||
"resolved": "https://registry.npmjs.org/@lancedb/vectordb-win32-x64-msvc/-/vectordb-win32-x64-msvc-0.13.0.tgz",
|
||||
"integrity": "sha512-SovP/hwWYLJIy65DKbVuXlBPTb/nwvVpTO6dh9zRch+L5ek6JmVAkwsfeTS2p5bMa8VPujsCXYUAVuCDEJU8wg==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
"optional": true,
|
||||
"os": [
|
||||
"win32"
|
||||
]
|
||||
},
|
||||
"node_modules/@neon-rs/cli": {
|
||||
"version": "0.0.160",
|
||||
"resolved": "https://registry.npmjs.org/@neon-rs/cli/-/cli-0.0.160.tgz",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "vectordb",
|
||||
"version": "0.13.1-beta.0",
|
||||
"version": "0.13.0",
|
||||
"description": " Serverless, low-latency vector database for AI applications",
|
||||
"main": "dist/index.js",
|
||||
"types": "dist/index.d.ts",
|
||||
@@ -84,20 +84,16 @@
|
||||
"aarch64-apple-darwin": "@lancedb/vectordb-darwin-arm64",
|
||||
"x86_64-unknown-linux-gnu": "@lancedb/vectordb-linux-x64-gnu",
|
||||
"aarch64-unknown-linux-gnu": "@lancedb/vectordb-linux-arm64-gnu",
|
||||
"x86_64-unknown-linux-musl": "@lancedb/vectordb-linux-x64-musl",
|
||||
"aarch64-unknown-linux-musl": "@lancedb/vectordb-linux-arm64-musl",
|
||||
"x86_64-pc-windows-msvc": "@lancedb/vectordb-win32-x64-msvc",
|
||||
"aarch64-pc-windows-msvc": "@lancedb/vectordb-win32-arm64-msvc"
|
||||
}
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@lancedb/vectordb-darwin-x64": "0.13.1-beta.0",
|
||||
"@lancedb/vectordb-darwin-arm64": "0.13.1-beta.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.13.1-beta.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.13.1-beta.0",
|
||||
"@lancedb/vectordb-linux-x64-musl": "0.13.1-beta.0",
|
||||
"@lancedb/vectordb-linux-arm64-musl": "0.13.1-beta.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.13.1-beta.0",
|
||||
"@lancedb/vectordb-win32-arm64-msvc": "0.13.1-beta.0"
|
||||
"@lancedb/vectordb-darwin-arm64": "0.13.0",
|
||||
"@lancedb/vectordb-darwin-x64": "0.13.0",
|
||||
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0",
|
||||
"@lancedb/vectordb-linux-x64-gnu": "0.13.0",
|
||||
"@lancedb/vectordb-win32-x64-msvc": "0.13.0",
|
||||
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
name = "lancedb-nodejs"
|
||||
edition.workspace = true
|
||||
version = "0.13.1-beta.0"
|
||||
version = "0.13.0"
|
||||
license.workspace = true
|
||||
description.workspace = true
|
||||
repository.workspace = true
|
||||
|
||||
@@ -87,12 +87,6 @@ export interface OptimizeOptions {
|
||||
deleteUnverified: boolean;
|
||||
}
|
||||
|
||||
export interface Version {
|
||||
version: number;
|
||||
timestamp: Date;
|
||||
metadata: Record<string, string>;
|
||||
}
|
||||
|
||||
/**
|
||||
* A Table is a collection of Records in a LanceDB Database.
|
||||
*
|
||||
@@ -366,11 +360,6 @@ export abstract class Table {
|
||||
*/
|
||||
abstract checkoutLatest(): Promise<void>;
|
||||
|
||||
/**
|
||||
* List all the versions of the table
|
||||
*/
|
||||
abstract listVersions(): Promise<Version[]>;
|
||||
|
||||
/**
|
||||
* Restore the table to the currently checked out version
|
||||
*
|
||||
@@ -670,14 +659,6 @@ export class LocalTable extends Table {
|
||||
await this.inner.checkoutLatest();
|
||||
}
|
||||
|
||||
async listVersions(): Promise<Version[]> {
|
||||
return (await this.inner.listVersions()).map((version) => ({
|
||||
version: version.version,
|
||||
timestamp: new Date(version.timestamp / 1000),
|
||||
metadata: version.metadata,
|
||||
}));
|
||||
}
|
||||
|
||||
async restore(): Promise<void> {
|
||||
await this.inner.restore();
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-arm64",
|
||||
"version": "0.13.1-beta.0",
|
||||
"version": "0.13.0",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.darwin-arm64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-x64",
|
||||
"version": "0.13.1-beta.0",
|
||||
"version": "0.13.0",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.darwin-x64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||
"version": "0.13.1-beta.0",
|
||||
"version": "0.13.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-gnu.node",
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
# `@lancedb/lancedb-linux-arm64-musl`
|
||||
|
||||
This is the **aarch64-unknown-linux-musl** binary for `@lancedb/lancedb`
|
||||
@@ -1,13 +0,0 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||
"version": "0.13.1-beta.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-musl.node",
|
||||
"files": ["lancedb.linux-arm64-musl.node"],
|
||||
"license": "Apache 2.0",
|
||||
"engines": {
|
||||
"node": ">= 18"
|
||||
},
|
||||
"libc": ["musl"]
|
||||
}
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||
"version": "0.13.1-beta.0",
|
||||
"version": "0.13.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-gnu.node",
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
# `@lancedb/lancedb-linux-x64-musl`
|
||||
|
||||
This is the **x86_64-unknown-linux-musl** binary for `@lancedb/lancedb`
|
||||
@@ -1,13 +0,0 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||
"version": "0.13.1-beta.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-musl.node",
|
||||
"files": ["lancedb.linux-x64-musl.node"],
|
||||
"license": "Apache 2.0",
|
||||
"engines": {
|
||||
"node": ">= 18"
|
||||
},
|
||||
"libc": ["musl"]
|
||||
}
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||
"version": "0.13.1-beta.0",
|
||||
"version": "0.13.0",
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||
"version": "0.13.1-beta.0",
|
||||
"version": "0.13.0",
|
||||
"os": ["win32"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.win32-x64-msvc.node",
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
"vector database",
|
||||
"ann"
|
||||
],
|
||||
"version": "0.13.1-beta.0",
|
||||
"version": "0.13.0",
|
||||
"main": "dist/index.js",
|
||||
"exports": {
|
||||
".": "./dist/index.js",
|
||||
@@ -24,12 +24,10 @@
|
||||
"triples": {
|
||||
"defaults": false,
|
||||
"additional": [
|
||||
"x86_64-apple-darwin",
|
||||
"aarch64-apple-darwin",
|
||||
"x86_64-unknown-linux-gnu",
|
||||
"aarch64-unknown-linux-gnu",
|
||||
"x86_64-unknown-linux-musl",
|
||||
"aarch64-unknown-linux-musl",
|
||||
"x86_64-apple-darwin",
|
||||
"x86_64-unknown-linux-gnu",
|
||||
"x86_64-pc-windows-msvc"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -12,8 +12,6 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use arrow_ipc::writer::FileWriter;
|
||||
use lancedb::ipc::ipc_file_to_batches;
|
||||
use lancedb::table::{
|
||||
@@ -228,28 +226,6 @@ impl Table {
|
||||
self.inner_ref()?.checkout_latest().await.default_error()
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn list_versions(&self) -> napi::Result<Vec<Version>> {
|
||||
self.inner_ref()?
|
||||
.list_versions()
|
||||
.await
|
||||
.map(|versions| {
|
||||
versions
|
||||
.iter()
|
||||
.map(|version| Version {
|
||||
version: version.version as i64,
|
||||
timestamp: version.timestamp.timestamp_micros(),
|
||||
metadata: version
|
||||
.metadata
|
||||
.iter()
|
||||
.map(|(k, v)| (k.clone(), v.clone()))
|
||||
.collect(),
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
.default_error()
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn restore(&self) -> napi::Result<()> {
|
||||
self.inner_ref()?.restore().await.default_error()
|
||||
@@ -490,10 +466,3 @@ impl From<lancedb::index::IndexStatistics> for IndexStatistics {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[napi(object)]
|
||||
pub struct Version {
|
||||
pub version: i64,
|
||||
pub timestamp: i64,
|
||||
pub metadata: HashMap<String, String>,
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.16.1-beta.0"
|
||||
current_version = "0.16.0"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb-python"
|
||||
version = "0.16.1-beta.0"
|
||||
version = "0.16.0"
|
||||
edition.workspace = true
|
||||
description = "Python bindings for LanceDB"
|
||||
license.workspace = true
|
||||
@@ -17,17 +17,11 @@ crate-type = ["cdylib"]
|
||||
arrow = { version = "52.1", features = ["pyarrow"] }
|
||||
lancedb = { path = "../rust/lancedb", default-features = false }
|
||||
env_logger.workspace = true
|
||||
pyo3 = { version = "0.21", features = [
|
||||
"extension-module",
|
||||
"abi3-py39",
|
||||
"gil-refs"
|
||||
] }
|
||||
pyo3 = { version = "0.21", features = ["extension-module", "abi3-py38", "gil-refs"] }
|
||||
# Using this fork for now: https://github.com/awestlake87/pyo3-asyncio/issues/119
|
||||
# pyo3-asyncio = { version = "0.20", features = ["attributes", "tokio-runtime"] }
|
||||
pyo3-asyncio-0-21 = { version = "0.21.0", features = [
|
||||
"attributes",
|
||||
"tokio-runtime"
|
||||
] }
|
||||
pyo3-asyncio-0-21 = { version = "0.21.0", features = ["attributes", "tokio-runtime"] }
|
||||
|
||||
pin-project = "1.1.5"
|
||||
futures.workspace = true
|
||||
tokio = { version = "1.36.0", features = ["sync"] }
|
||||
@@ -35,13 +29,14 @@ tokio = { version = "1.36.0", features = ["sync"] }
|
||||
[build-dependencies]
|
||||
pyo3-build-config = { version = "0.20.3", features = [
|
||||
"extension-module",
|
||||
"abi3-py39",
|
||||
"abi3-py38",
|
||||
] }
|
||||
|
||||
[features]
|
||||
default = ["default-tls", "remote"]
|
||||
fp16kernels = ["lancedb/fp16kernels"]
|
||||
remote = ["lancedb/remote"]
|
||||
|
||||
# TLS
|
||||
default-tls = ["lancedb/default-tls"]
|
||||
native-tls = ["lancedb/native-tls"]
|
||||
|
||||
@@ -4,7 +4,7 @@ name = "lancedb"
|
||||
dependencies = [
|
||||
"deprecation",
|
||||
"nest-asyncio~=1.0",
|
||||
"pylance==0.20.0b2",
|
||||
"pylance==0.19.3b1",
|
||||
"tqdm>=4.27.0",
|
||||
"pydantic>=1.10",
|
||||
"packaging",
|
||||
@@ -31,6 +31,7 @@ classifiers = [
|
||||
"Programming Language :: Python",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3 :: Only",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
|
||||
@@ -83,33 +83,25 @@ class OpenAIEmbeddings(TextEmbeddingFunction):
|
||||
"""
|
||||
openai = attempt_import_or_raise("openai")
|
||||
|
||||
valid_texts = []
|
||||
valid_indices = []
|
||||
for idx, text in enumerate(texts):
|
||||
if text:
|
||||
valid_texts.append(text)
|
||||
valid_indices.append(idx)
|
||||
|
||||
# TODO retry, rate limit, token limit
|
||||
try:
|
||||
kwargs = {
|
||||
"input": valid_texts,
|
||||
"model": self.name,
|
||||
}
|
||||
if self.name != "text-embedding-ada-002":
|
||||
kwargs["dimensions"] = self.dim
|
||||
|
||||
rs = self._openai_client.embeddings.create(**kwargs)
|
||||
valid_embeddings = {
|
||||
idx: v.embedding for v, idx in zip(rs.data, valid_indices)
|
||||
}
|
||||
if self.name == "text-embedding-ada-002":
|
||||
rs = self._openai_client.embeddings.create(input=texts, model=self.name)
|
||||
else:
|
||||
kwargs = {
|
||||
"input": texts,
|
||||
"model": self.name,
|
||||
}
|
||||
if self.dim:
|
||||
kwargs["dimensions"] = self.dim
|
||||
rs = self._openai_client.embeddings.create(**kwargs)
|
||||
except openai.BadRequestError:
|
||||
logging.exception("Bad request: %s", texts)
|
||||
return [None] * len(texts)
|
||||
except Exception:
|
||||
logging.exception("OpenAI embeddings error")
|
||||
raise
|
||||
return [valid_embeddings.get(idx, None) for idx in range(len(texts))]
|
||||
return [v.embedding for v in rs.data]
|
||||
|
||||
@cached_property
|
||||
def _openai_client(self):
|
||||
|
||||
@@ -1,5 +1,15 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
# Copyright 2023 LanceDB Developers
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Pydantic (v1 / v2) adapter for LanceDB"""
|
||||
|
||||
@@ -20,7 +30,6 @@ from typing import (
|
||||
Type,
|
||||
Union,
|
||||
_GenericAlias,
|
||||
GenericAlias,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
@@ -66,7 +75,7 @@ def vector(dim: int, value_type: pa.DataType = pa.float32()):
|
||||
|
||||
|
||||
def Vector(
|
||||
dim: int, value_type: pa.DataType = pa.float32(), nullable: bool = True
|
||||
dim: int, value_type: pa.DataType = pa.float32()
|
||||
) -> Type[FixedSizeListMixin]:
|
||||
"""Pydantic Vector Type.
|
||||
|
||||
@@ -79,8 +88,6 @@ def Vector(
|
||||
The dimension of the vector.
|
||||
value_type : pyarrow.DataType, optional
|
||||
The value type of the vector, by default pa.float32()
|
||||
nullable : bool, optional
|
||||
Whether the vector is nullable, by default it is True.
|
||||
|
||||
Examples
|
||||
--------
|
||||
@@ -96,7 +103,7 @@ def Vector(
|
||||
>>> assert schema == pa.schema([
|
||||
... pa.field("id", pa.int64(), False),
|
||||
... pa.field("url", pa.utf8(), False),
|
||||
... pa.field("embeddings", pa.list_(pa.float32(), 768))
|
||||
... pa.field("embeddings", pa.list_(pa.float32(), 768), False)
|
||||
... ])
|
||||
"""
|
||||
|
||||
@@ -105,10 +112,6 @@ def Vector(
|
||||
def __repr__(self):
|
||||
return f"FixedSizeList(dim={dim})"
|
||||
|
||||
@staticmethod
|
||||
def nullable() -> bool:
|
||||
return nullable
|
||||
|
||||
@staticmethod
|
||||
def dim() -> int:
|
||||
return dim
|
||||
@@ -202,7 +205,9 @@ else:
|
||||
def _pydantic_to_arrow_type(field: FieldInfo) -> pa.DataType:
|
||||
"""Convert a Pydantic FieldInfo to Arrow DataType"""
|
||||
|
||||
if isinstance(field.annotation, (_GenericAlias, GenericAlias)):
|
||||
if isinstance(field.annotation, _GenericAlias) or (
|
||||
sys.version_info > (3, 9) and isinstance(field.annotation, types.GenericAlias)
|
||||
):
|
||||
origin = field.annotation.__origin__
|
||||
args = field.annotation.__args__
|
||||
if origin is list:
|
||||
@@ -230,7 +235,7 @@ def _pydantic_to_arrow_type(field: FieldInfo) -> pa.DataType:
|
||||
|
||||
def is_nullable(field: FieldInfo) -> bool:
|
||||
"""Check if a Pydantic FieldInfo is nullable."""
|
||||
if isinstance(field.annotation, (_GenericAlias, GenericAlias)):
|
||||
if isinstance(field.annotation, _GenericAlias):
|
||||
origin = field.annotation.__origin__
|
||||
args = field.annotation.__args__
|
||||
if origin == Union:
|
||||
@@ -241,10 +246,6 @@ def is_nullable(field: FieldInfo) -> bool:
|
||||
for typ in args:
|
||||
if typ is type(None):
|
||||
return True
|
||||
elif inspect.isclass(field.annotation) and issubclass(
|
||||
field.annotation, FixedSizeListMixin
|
||||
):
|
||||
return field.annotation.nullable()
|
||||
return False
|
||||
|
||||
|
||||
|
||||
@@ -370,13 +370,11 @@ class LanceQueryBuilder(ABC):
|
||||
----------
|
||||
limit: int
|
||||
The maximum number of results to return.
|
||||
The default query limit is 10 results.
|
||||
For ANN/KNN queries, you must specify a limit.
|
||||
Entering 0, a negative number, or None will reset
|
||||
the limit to the default value of 10.
|
||||
*WARNING* if you have a large dataset, setting
|
||||
the limit to a large number, e.g. the table size,
|
||||
can potentially result in reading a
|
||||
By default the query is limited to the first 10.
|
||||
Call this method and pass 0, a negative value,
|
||||
or None to remove the limit.
|
||||
*WARNING* if you have a large dataset, removing
|
||||
the limit can potentially result in reading a
|
||||
large amount of data into memory and cause
|
||||
out of memory issues.
|
||||
|
||||
|
||||
@@ -78,10 +78,6 @@ class RemoteTable(Table):
|
||||
self.schema.metadata
|
||||
)
|
||||
|
||||
def list_versions(self):
|
||||
"""List all versions of the table"""
|
||||
return self._loop.run_until_complete(self._table.list_versions())
|
||||
|
||||
def to_arrow(self) -> pa.Table:
|
||||
"""to_arrow() is not yet supported on LanceDB cloud."""
|
||||
raise NotImplementedError("to_arrow() is not yet supported on LanceDB cloud.")
|
||||
|
||||
@@ -41,7 +41,7 @@ class CohereReranker(Reranker):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_name: str = "rerank-english-v3.0",
|
||||
model_name: str = "rerank-english-v2.0",
|
||||
column: str = "text",
|
||||
top_n: Union[int, None] = None,
|
||||
return_score="relevance",
|
||||
|
||||
@@ -8,7 +8,7 @@ import inspect
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta
|
||||
from datetime import timedelta
|
||||
from functools import cached_property
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
@@ -1015,36 +1015,15 @@ class Table(ABC):
|
||||
@abstractmethod
|
||||
def checkout(self):
|
||||
"""
|
||||
Checks out a specific version of the Table
|
||||
|
||||
Any read operation on the table will now access the data at the checked out
|
||||
version. As a consequence, calling this method will disable any read consistency
|
||||
interval that was previously set.
|
||||
|
||||
This is a read-only operation that turns the table into a sort of "view"
|
||||
or "detached head". Other table instances will not be affected. To make the
|
||||
change permanent you can use the `[Self::restore]` method.
|
||||
|
||||
Any operation that modifies the table will fail while the table is in a checked
|
||||
out state.
|
||||
|
||||
To return the table to a normal state use `[Self::checkout_latest]`
|
||||
TODO comments
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def checkout_latest(self):
|
||||
"""
|
||||
Ensures the table is pointing at the latest version
|
||||
|
||||
This can be used to manually update a table when the read_consistency_interval
|
||||
is None
|
||||
It can also be used to undo a `[Self::checkout]` operation
|
||||
TODO comments
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def list_versions(self):
|
||||
"""List all versions of the table"""
|
||||
|
||||
@cached_property
|
||||
def _dataset_uri(self) -> str:
|
||||
return _table_uri(self._conn.uri, self.name)
|
||||
@@ -2935,19 +2914,6 @@ class AsyncTable:
|
||||
"""
|
||||
return await self._inner.version()
|
||||
|
||||
async def list_versions(self):
|
||||
"""
|
||||
List all versions of the table
|
||||
"""
|
||||
versions = await self._inner.list_versions()
|
||||
for v in versions:
|
||||
ts_nanos = v["timestamp"]
|
||||
v["timestamp"] = datetime.fromtimestamp(ts_nanos // 1e9) + timedelta(
|
||||
microseconds=(ts_nanos % 1e9) // 1e3
|
||||
)
|
||||
|
||||
return versions
|
||||
|
||||
async def checkout(self, version):
|
||||
"""
|
||||
Checks out a specific version of the Table
|
||||
|
||||
@@ -11,8 +11,6 @@ from datetime import date, datetime
|
||||
from functools import singledispatch
|
||||
from typing import Tuple, Union, Optional, Any
|
||||
from urllib.parse import urlparse
|
||||
from threading import Lock
|
||||
from contextlib import contextmanager
|
||||
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
@@ -316,27 +314,3 @@ def deprecated(func):
|
||||
def validate_table_name(name: str):
|
||||
"""Verify the table name is valid."""
|
||||
native_validate_table_name(name)
|
||||
|
||||
|
||||
class ConnectionPool:
|
||||
def __init__(self, connection_factory, *, max_size: Optional[int] = None):
|
||||
self.max_size = max_size
|
||||
self._connection_factory = connection_factory
|
||||
self._pool = []
|
||||
self._lock = Lock()
|
||||
|
||||
@contextmanager
|
||||
def connection(self):
|
||||
with self._lock:
|
||||
if self._pool:
|
||||
conn = self._pool.pop()
|
||||
else:
|
||||
conn = self._connection_factory()
|
||||
|
||||
# release the lock before yielding
|
||||
try:
|
||||
yield conn
|
||||
finally:
|
||||
with self._lock:
|
||||
if self.max_size is None or len(self._pool) < self.max_size:
|
||||
self._pool.append(conn)
|
||||
|
||||
@@ -90,13 +90,10 @@ def test_embedding_with_bad_results(tmp_path):
|
||||
self, texts: Union[List[str], np.ndarray]
|
||||
) -> list[Union[np.array, None]]:
|
||||
# Return None, which is bad if field is non-nullable
|
||||
a = [
|
||||
np.full(self.ndims(), np.nan)
|
||||
if i % 2 == 0
|
||||
else np.random.randn(self.ndims())
|
||||
return [
|
||||
None if i % 2 == 0 else np.random.randn(self.ndims())
|
||||
for i in range(len(texts))
|
||||
]
|
||||
return a
|
||||
|
||||
db = lancedb.connect(tmp_path)
|
||||
registry = EmbeddingFunctionRegistry.get_instance()
|
||||
|
||||
@@ -1,6 +1,15 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
# Copyright (c) 2023. LanceDB Developers
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import importlib
|
||||
import io
|
||||
import os
|
||||
@@ -8,7 +17,6 @@ import os
|
||||
import lancedb
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
import pytest
|
||||
from lancedb.embeddings import get_registry
|
||||
from lancedb.pydantic import LanceModel, Vector
|
||||
@@ -436,30 +444,6 @@ def test_watsonx_embedding(tmp_path):
|
||||
assert tbl.search("hello").limit(1).to_pandas()["text"][0] == "hello world"
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.skipif(
|
||||
os.environ.get("OPENAI_API_KEY") is None, reason="OPENAI_API_KEY not set"
|
||||
)
|
||||
def test_openai_with_empty_strs(tmp_path):
|
||||
model = get_registry().get("openai").create(max_retries=0)
|
||||
|
||||
class TextModel(LanceModel):
|
||||
text: str = model.SourceField()
|
||||
vector: Vector(model.ndims()) = model.VectorField()
|
||||
|
||||
df = pd.DataFrame({"text": ["hello world", ""]})
|
||||
db = lancedb.connect(tmp_path)
|
||||
tbl = db.create_table("test", schema=TextModel, mode="overwrite")
|
||||
|
||||
tbl.add(df, on_bad_vectors="skip")
|
||||
tb = tbl.to_arrow()
|
||||
assert tb.schema.field_by_name("vector").type == pa.list_(
|
||||
pa.float32(), model.ndims()
|
||||
)
|
||||
assert len(tb) == 2
|
||||
assert tb["vector"].is_null().to_pylist() == [False, True]
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.skipif(
|
||||
importlib.util.find_spec("ollama") is None, reason="Ollama not installed"
|
||||
|
||||
@@ -1,5 +1,16 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
# Copyright 2023 LanceDB Developers
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import json
|
||||
import sys
|
||||
@@ -161,26 +172,6 @@ def test_pydantic_to_arrow_py38():
|
||||
assert schema == expect_schema
|
||||
|
||||
|
||||
def test_nullable_vector():
|
||||
class NullableModel(pydantic.BaseModel):
|
||||
vec: Vector(16, nullable=False)
|
||||
|
||||
schema = pydantic_to_schema(NullableModel)
|
||||
assert schema == pa.schema([pa.field("vec", pa.list_(pa.float32(), 16), False)])
|
||||
|
||||
class DefaultModel(pydantic.BaseModel):
|
||||
vec: Vector(16)
|
||||
|
||||
schema = pydantic_to_schema(DefaultModel)
|
||||
assert schema == pa.schema([pa.field("vec", pa.list_(pa.float32(), 16), True)])
|
||||
|
||||
class NotNullableModel(pydantic.BaseModel):
|
||||
vec: Vector(16)
|
||||
|
||||
schema = pydantic_to_schema(NotNullableModel)
|
||||
assert schema == pa.schema([pa.field("vec", pa.list_(pa.float32(), 16), True)])
|
||||
|
||||
|
||||
def test_fixed_size_list_field():
|
||||
class TestModel(pydantic.BaseModel):
|
||||
vec: Vector(16)
|
||||
@@ -201,7 +192,7 @@ def test_fixed_size_list_field():
|
||||
schema = pydantic_to_schema(TestModel)
|
||||
assert schema == pa.schema(
|
||||
[
|
||||
pa.field("vec", pa.list_(pa.float32(), 16)),
|
||||
pa.field("vec", pa.list_(pa.float32(), 16), False),
|
||||
pa.field("li", pa.list_(pa.int64()), False),
|
||||
]
|
||||
)
|
||||
|
||||
@@ -6,16 +6,13 @@ from datetime import timedelta
|
||||
import http.server
|
||||
import json
|
||||
import threading
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from unittest.mock import MagicMock
|
||||
import uuid
|
||||
|
||||
import lancedb
|
||||
from lancedb.conftest import MockTextEmbeddingFunction
|
||||
from lancedb.remote import ClientConfig
|
||||
from lancedb.util import ConnectionPool
|
||||
from lancedb.remote.errors import HttpError, RetryError
|
||||
import lancedb.util
|
||||
import pytest
|
||||
import pyarrow as pa
|
||||
|
||||
@@ -58,34 +55,6 @@ def mock_lancedb_connection(handler):
|
||||
handle.join()
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def mock_lancedb_connection_pool(handler):
|
||||
with http.server.HTTPServer(
|
||||
("localhost", 8080), make_mock_http_handler(handler)
|
||||
) as server:
|
||||
handle = threading.Thread(target=server.serve_forever)
|
||||
handle.start()
|
||||
|
||||
def conn_factory():
|
||||
lancedb.connect(
|
||||
"db://dev",
|
||||
api_key="fake",
|
||||
host_override="http://localhost:8080",
|
||||
client_config={
|
||||
"retry_config": {"retries": 2},
|
||||
"timeout_config": {
|
||||
"connect_timeout": 1,
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
try:
|
||||
yield ConnectionPool(conn_factory)
|
||||
finally:
|
||||
server.shutdown()
|
||||
handle.join()
|
||||
|
||||
|
||||
@contextlib.asynccontextmanager
|
||||
async def mock_lancedb_connection_async(handler):
|
||||
with http.server.HTTPServer(
|
||||
@@ -134,47 +103,6 @@ async def test_async_remote_db():
|
||||
assert table_names == []
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_checkout():
|
||||
def handler(request):
|
||||
if request.path == "/v1/table/test/describe/":
|
||||
request.send_response(200)
|
||||
request.send_header("Content-Type", "application/json")
|
||||
request.end_headers()
|
||||
response = json.dumps({"version": 42, "schema": {"fields": []}})
|
||||
request.wfile.write(response.encode())
|
||||
return
|
||||
|
||||
content_len = int(request.headers.get("Content-Length"))
|
||||
body = request.rfile.read(content_len)
|
||||
body = json.loads(body)
|
||||
|
||||
print("body is", body)
|
||||
|
||||
count = 0
|
||||
if body["version"] == 1:
|
||||
count = 100
|
||||
elif body["version"] == 2:
|
||||
count = 200
|
||||
elif body["version"] is None:
|
||||
count = 300
|
||||
|
||||
request.send_response(200)
|
||||
request.send_header("Content-Type", "application/json")
|
||||
request.end_headers()
|
||||
request.wfile.write(json.dumps(count).encode())
|
||||
|
||||
async with mock_lancedb_connection_async(handler) as db:
|
||||
table = await db.open_table("test")
|
||||
assert await table.count_rows() == 300
|
||||
await table.checkout(1)
|
||||
assert await table.count_rows() == 100
|
||||
await table.checkout(2)
|
||||
assert await table.count_rows() == 200
|
||||
await table.checkout_latest()
|
||||
assert await table.count_rows() == 300
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_http_error():
|
||||
request_id_holder = {"request_id": None}
|
||||
@@ -218,7 +146,8 @@ async def test_retry_error():
|
||||
assert cause.status_code == 429
|
||||
|
||||
|
||||
def http_handler(query_handler):
|
||||
@contextlib.contextmanager
|
||||
def query_test_table(query_handler):
|
||||
def handler(request):
|
||||
if request.path == "/v1/table/test/describe/":
|
||||
request.send_response(200)
|
||||
@@ -242,12 +171,7 @@ def http_handler(query_handler):
|
||||
request.send_response(404)
|
||||
request.end_headers()
|
||||
|
||||
return handler
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def query_test_table(connection_ctx_mgr):
|
||||
with connection_ctx_mgr as db:
|
||||
with mock_lancedb_connection(handler) as db:
|
||||
assert repr(db) == "RemoteConnect(name=dev)"
|
||||
table = db.open_table("test")
|
||||
assert repr(table) == "RemoteTable(dev.test)"
|
||||
@@ -255,7 +179,6 @@ def query_test_table(connection_ctx_mgr):
|
||||
|
||||
|
||||
def test_query_sync_minimal():
|
||||
@http_handler
|
||||
def handler(body):
|
||||
assert body == {
|
||||
"distance_type": "l2",
|
||||
@@ -265,75 +188,28 @@ def test_query_sync_minimal():
|
||||
"ef": None,
|
||||
"vector": [1.0, 2.0, 3.0],
|
||||
"nprobes": 20,
|
||||
"version": None,
|
||||
}
|
||||
|
||||
return pa.table({"id": [1, 2, 3]})
|
||||
|
||||
with query_test_table(mock_lancedb_connection(handler)) as table:
|
||||
with query_test_table(handler) as table:
|
||||
data = table.search([1, 2, 3]).to_list()
|
||||
expected = [{"id": 1}, {"id": 2}, {"id": 3}]
|
||||
assert data == expected
|
||||
|
||||
with query_test_table(mock_lancedb_connection_pool(handler).connection()) as table:
|
||||
data = table.search([1, 2, 3]).to_list()
|
||||
expected = [{"id": 1}, {"id": 2}, {"id": 3}]
|
||||
assert data == expected
|
||||
|
||||
|
||||
def test_query_sync_minimal_threaded():
|
||||
num_query = 0
|
||||
|
||||
@http_handler
|
||||
def handler(body):
|
||||
assert body == {
|
||||
"distance_type": "l2",
|
||||
"k": 10,
|
||||
"prefilter": False,
|
||||
"refine_factor": None,
|
||||
"ef": None,
|
||||
"vector": [1.0, 2.0, 3.0],
|
||||
"nprobes": 20,
|
||||
"version": None,
|
||||
}
|
||||
nonlocal num_query
|
||||
num_query += 1
|
||||
|
||||
return pa.table({"id": [1, 2, 3]})
|
||||
|
||||
pool = mock_lancedb_connection_pool(handler)
|
||||
|
||||
def _query(i):
|
||||
with query_test_table(pool.connection()) as table:
|
||||
data = table.search([1, 2, 3]).to_list()
|
||||
expected = [{"id": 1}, {"id": 2}, {"id": 3}]
|
||||
assert data == expected
|
||||
|
||||
with ThreadPoolExecutor as exec:
|
||||
exec.map(_query, range(1000))
|
||||
|
||||
assert num_query == 1000
|
||||
|
||||
|
||||
def test_query_sync_empty_query():
|
||||
@http_handler
|
||||
def handler(body):
|
||||
assert body == {
|
||||
"k": 10,
|
||||
"filter": "true",
|
||||
"vector": [],
|
||||
"columns": ["id"],
|
||||
"version": None,
|
||||
}
|
||||
|
||||
return pa.table({"id": [1, 2, 3]})
|
||||
|
||||
with query_test_table(mock_lancedb_connection(handler)) as table:
|
||||
data = table.search(None).where("true").select(["id"]).limit(10).to_list()
|
||||
expected = [{"id": 1}, {"id": 2}, {"id": 3}]
|
||||
assert data == expected
|
||||
|
||||
with query_test_table(mock_lancedb_connection_pool(handler).connection()) as table:
|
||||
with query_test_table(handler) as table:
|
||||
data = table.search(None).where("true").select(["id"]).limit(10).to_list()
|
||||
expected = [{"id": 1}, {"id": 2}, {"id": 3}]
|
||||
assert data == expected
|
||||
@@ -354,7 +230,6 @@ def test_query_sync_maximal():
|
||||
"vector_column": "vector2",
|
||||
"fast_search": True,
|
||||
"with_row_id": True,
|
||||
"version": None,
|
||||
}
|
||||
|
||||
return pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})
|
||||
@@ -393,7 +268,6 @@ def test_query_sync_fts():
|
||||
},
|
||||
"k": 10,
|
||||
"vector": [],
|
||||
"version": None,
|
||||
}
|
||||
|
||||
return pa.table({"id": [1, 2, 3]})
|
||||
@@ -410,7 +284,6 @@ def test_query_sync_fts():
|
||||
"k": 42,
|
||||
"vector": [],
|
||||
"with_row_id": True,
|
||||
"version": None,
|
||||
}
|
||||
|
||||
return pa.table({"id": [1, 2, 3]})
|
||||
@@ -436,7 +309,6 @@ def test_query_sync_hybrid():
|
||||
"k": 42,
|
||||
"vector": [],
|
||||
"with_row_id": True,
|
||||
"version": None,
|
||||
}
|
||||
return pa.table({"_rowid": [1, 2, 3], "_score": [0.1, 0.2, 0.3]})
|
||||
else:
|
||||
@@ -450,7 +322,6 @@ def test_query_sync_hybrid():
|
||||
"nprobes": 20,
|
||||
"ef": None,
|
||||
"with_row_id": True,
|
||||
"version": None,
|
||||
}
|
||||
return pa.table({"_rowid": [1, 2, 3], "_distance": [0.1, 0.2, 0.3]})
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ use lancedb::table::{
|
||||
use pyo3::{
|
||||
exceptions::{PyRuntimeError, PyValueError},
|
||||
pyclass, pymethods,
|
||||
types::{IntoPyDict, PyDict, PyDictMethods, PyString},
|
||||
types::{PyDict, PyDictMethods, PyString},
|
||||
Bound, FromPyObject, PyAny, PyRef, PyResult, Python, ToPyObject,
|
||||
};
|
||||
use pyo3_asyncio_0_21::tokio::future_into_py;
|
||||
@@ -246,33 +246,6 @@ impl Table {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn list_versions(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
let versions = inner.list_versions().await.infer_error()?;
|
||||
let versions_as_dict = Python::with_gil(|py| {
|
||||
versions
|
||||
.iter()
|
||||
.map(|v| {
|
||||
let dict = PyDict::new_bound(py);
|
||||
dict.set_item("version", v.version).unwrap();
|
||||
dict.set_item(
|
||||
"timestamp",
|
||||
v.timestamp.timestamp_nanos_opt().unwrap_or_default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let tup: Vec<(&String, &String)> = v.metadata.iter().collect();
|
||||
dict.set_item("metadata", tup.into_py_dict(py)).unwrap();
|
||||
dict.to_object(py)
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
});
|
||||
|
||||
Ok(versions_as_dict)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn checkout(self_: PyRef<'_, Self>, version: u64) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
future_into_py(self_.py(), async move {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb-node"
|
||||
version = "0.13.1-beta.0"
|
||||
version = "0.13.0"
|
||||
description = "Serverless, low-latency vector database for AI applications"
|
||||
license.workspace = true
|
||||
edition.workspace = true
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb"
|
||||
version = "0.13.1-beta.0"
|
||||
version = "0.13.0"
|
||||
edition.workspace = true
|
||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||
license.workspace = true
|
||||
|
||||
@@ -19,7 +19,7 @@ use http::header::CONTENT_TYPE;
|
||||
use http::StatusCode;
|
||||
use lance::arrow::json::JsonSchema;
|
||||
use lance::dataset::scanner::DatasetRecordBatchStream;
|
||||
use lance::dataset::{ColumnAlteration, NewColumnTransform, Version};
|
||||
use lance::dataset::{ColumnAlteration, NewColumnTransform};
|
||||
use lance_datafusion::exec::OneShotExec;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::sync::RwLock;
|
||||
@@ -363,34 +363,6 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
|
||||
message: "restore is not supported on LanceDB cloud.".into(),
|
||||
})
|
||||
}
|
||||
|
||||
async fn list_versions(&self) -> Result<Vec<Version>> {
|
||||
let request = self
|
||||
.client
|
||||
.post(&format!("/v1/table/{}/version/list/", self.name));
|
||||
let (request_id, response) = self.client.send(request, true).await?;
|
||||
let response = self.check_table_response(&request_id, response).await?;
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct ListVersionsResponse {
|
||||
versions: Vec<Version>,
|
||||
}
|
||||
|
||||
let body = response.text().await.err_to_http(request_id.clone())?;
|
||||
let body: ListVersionsResponse =
|
||||
serde_json::from_str(&body).map_err(|err| Error::Http {
|
||||
source: format!(
|
||||
"Failed to parse list_versions response: {}, body: {}",
|
||||
err, body
|
||||
)
|
||||
.into(),
|
||||
request_id,
|
||||
status_code: None,
|
||||
})?;
|
||||
|
||||
Ok(body.versions)
|
||||
}
|
||||
|
||||
async fn schema(&self) -> Result<SchemaRef> {
|
||||
let schema = self.describe().await?.schema;
|
||||
Ok(Arc::new(schema.try_into()?))
|
||||
@@ -803,7 +775,6 @@ mod tests {
|
||||
use arrow::{array::AsArray, compute::concat_batches, datatypes::Int32Type};
|
||||
use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator};
|
||||
use arrow_schema::{DataType, Field, Schema};
|
||||
use chrono::{DateTime, Utc};
|
||||
use futures::{future::BoxFuture, StreamExt, TryFutureExt};
|
||||
use lance_index::scalar::FullTextSearchQuery;
|
||||
use reqwest::Body;
|
||||
@@ -1518,51 +1489,6 @@ mod tests {
|
||||
assert_eq!(indices, expected);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_list_versions() {
|
||||
let table = Table::new_with_handler("my_table", |request| {
|
||||
assert_eq!(request.method(), "POST");
|
||||
assert_eq!(request.url().path(), "/v1/table/my_table/version/list/");
|
||||
|
||||
let version1 = lance::dataset::Version {
|
||||
version: 1,
|
||||
timestamp: "2024-01-01T00:00:00Z".parse().unwrap(),
|
||||
metadata: Default::default(),
|
||||
};
|
||||
let version2 = lance::dataset::Version {
|
||||
version: 2,
|
||||
timestamp: "2024-02-01T00:00:00Z".parse().unwrap(),
|
||||
metadata: Default::default(),
|
||||
};
|
||||
let response_body = serde_json::json!({
|
||||
"versions": [
|
||||
version1,
|
||||
version2,
|
||||
]
|
||||
});
|
||||
let response_body = serde_json::to_string(&response_body).unwrap();
|
||||
|
||||
http::Response::builder()
|
||||
.status(200)
|
||||
.body(response_body)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
let versions = table.list_versions().await.unwrap();
|
||||
assert_eq!(versions.len(), 2);
|
||||
assert_eq!(versions[0].version, 1);
|
||||
assert_eq!(
|
||||
versions[0].timestamp,
|
||||
"2024-01-01T00:00:00Z".parse::<DateTime<Utc>>().unwrap()
|
||||
);
|
||||
assert_eq!(versions[1].version, 2);
|
||||
assert_eq!(
|
||||
versions[1].timestamp,
|
||||
"2024-02-01T00:00:00Z".parse::<DateTime<Utc>>().unwrap()
|
||||
);
|
||||
// assert_eq!(versions, expected);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_index_stats() {
|
||||
let table = Table::new_with_handler("my_table", |request| {
|
||||
|
||||
@@ -37,7 +37,7 @@ pub use lance::dataset::ColumnAlteration;
|
||||
pub use lance::dataset::NewColumnTransform;
|
||||
pub use lance::dataset::ReadParams;
|
||||
use lance::dataset::{
|
||||
Dataset, UpdateBuilder as LanceUpdateBuilder, Version, WhenMatched, WriteMode, WriteParams,
|
||||
Dataset, UpdateBuilder as LanceUpdateBuilder, WhenMatched, WriteMode, WriteParams,
|
||||
};
|
||||
use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatchedBySource};
|
||||
use lance::io::WrappingObjectStore;
|
||||
@@ -426,7 +426,6 @@ pub(crate) trait TableInternal: std::fmt::Display + std::fmt::Debug + Send + Syn
|
||||
async fn checkout(&self, version: u64) -> Result<()>;
|
||||
async fn checkout_latest(&self) -> Result<()>;
|
||||
async fn restore(&self) -> Result<()>;
|
||||
async fn list_versions(&self) -> Result<Vec<Version>>;
|
||||
async fn table_definition(&self) -> Result<TableDefinition>;
|
||||
fn dataset_uri(&self) -> &str;
|
||||
}
|
||||
@@ -956,11 +955,6 @@ impl Table {
|
||||
self.inner.restore().await
|
||||
}
|
||||
|
||||
/// List all the versions of the table
|
||||
pub async fn list_versions(&self) -> Result<Vec<Version>> {
|
||||
self.inner.list_versions().await
|
||||
}
|
||||
|
||||
/// List all indices that have been created with [`Self::create_index`]
|
||||
pub async fn list_indices(&self) -> Result<Vec<IndexConfig>> {
|
||||
self.inner.list_indices().await
|
||||
@@ -1325,7 +1319,7 @@ impl NativeTable {
|
||||
let (indices, mf) = futures::try_join!(dataset.load_indices(), dataset.latest_manifest())?;
|
||||
Ok(indices
|
||||
.iter()
|
||||
.map(|i| VectorIndex::new_from_format(&(mf.0), i))
|
||||
.map(|i| VectorIndex::new_from_format(&mf, i))
|
||||
.collect())
|
||||
}
|
||||
|
||||
@@ -1713,10 +1707,6 @@ impl TableInternal for NativeTable {
|
||||
self.dataset.reload().await
|
||||
}
|
||||
|
||||
async fn list_versions(&self) -> Result<Vec<Version>> {
|
||||
Ok(self.dataset.get().await?.versions().await?)
|
||||
}
|
||||
|
||||
async fn restore(&self) -> Result<()> {
|
||||
let version =
|
||||
self.dataset
|
||||
|
||||
Reference in New Issue
Block a user