Compare commits

..

10 Commits

Author SHA1 Message Date
Lance Release
96933d7df8 Bump version: 0.16.0 → 0.16.1-beta.0 2024-11-21 21:52:39 +00:00
Lei Xu
d369233b3d feat: bump lance to 0.20.0b2 (#1865)
Bump lance version.
Upstream change log:
https://github.com/lancedb/lance/releases/tag/v0.20.0-beta.2
2024-11-21 13:16:59 -08:00
QianZhu
43a670ed4b fix: limit docstring change (#1860) 2024-11-21 10:50:50 -08:00
Bert
cb9a00a28d feat: add list_versions to typescript, rust and remote python sdks (#1850)
Will require update to lance dependency to bring in this change which
makes the version serializable
https://github.com/lancedb/lance/pull/3143
2024-11-21 13:35:14 -05:00
Max Epstein
72af977a73 fix(CohereReranker): updated default model_name param to newest v3 (#1862) 2024-11-21 09:02:49 -08:00
Bert
7cecb71df0 feat: support for checkout and checkout_latest in remote sdks (#1863) 2024-11-21 11:28:46 -05:00
QianZhu
285071e5c8 docs: full-text search doc update (#1861)
Co-authored-by: BubbleCal <bubble-cal@outlook.com>
2024-11-20 21:07:30 -08:00
QianZhu
114866fbcf docs: OSS doc improvement (#1859)
OSS doc improvement - HNSW index parameter explanation and others.

---------

Co-authored-by: BubbleCal <bubble-cal@outlook.com>
2024-11-20 17:51:11 -08:00
Frank Liu
5387c0e243 docs: add Voyage models to sidebar (#1858) 2024-11-20 14:20:14 -08:00
Mr. Doge
53d1535de1 ci: musl x64,arm64 (#1853)
untested 4 artifacts at:
https://github.com/FuPeiJiang/lancedb/actions/runs/11926579058
node-native-linux-aarch64-musl 22.6 MB
node-native-linux-x86_64-musl 23.6 MB
nodejs-native-linux-aarch64-musl 26.7 MB
nodejs-native-linux-x86_64-musl 27 MB

this follows the same process as:
https://github.com/lancedb/lancedb/pull/1816#issuecomment-2484816669

Closes #1388
Closes #1107

---------

Co-authored-by: Will Jones <willjones127@gmail.com>
2024-11-20 10:53:19 -08:00
32 changed files with 513 additions and 47 deletions

View File

@@ -87,6 +87,16 @@ glob = "node/package.json"
replace = "\"@lancedb/vectordb-linux-x64-gnu\": \"{new_version}\""
search = "\"@lancedb/vectordb-linux-x64-gnu\": \"{current_version}\""
[[tool.bumpversion.files]]
glob = "node/package.json"
replace = "\"@lancedb/vectordb-linux-arm64-musl\": \"{new_version}\""
search = "\"@lancedb/vectordb-linux-arm64-musl\": \"{current_version}\""
[[tool.bumpversion.files]]
glob = "node/package.json"
replace = "\"@lancedb/vectordb-linux-x64-musl\": \"{new_version}\""
search = "\"@lancedb/vectordb-linux-x64-musl\": \"{current_version}\""
[[tool.bumpversion.files]]
glob = "node/package.json"
replace = "\"@lancedb/vectordb-win32-x64-msvc\": \"{new_version}\""

View File

@@ -31,6 +31,9 @@ rustflags = [
[target.x86_64-unknown-linux-gnu]
rustflags = ["-C", "target-cpu=haswell", "-C", "target-feature=+avx2,+fma,+f16c"]
[target.x86_64-unknown-linux-musl]
rustflags = ["-C", "target-cpu=haswell", "-C", "target-feature=-crt-static,+avx2,+fma,+f16c"]
[target.aarch64-apple-darwin]
rustflags = ["-C", "target-cpu=apple-m1", "-C", "target-feature=+neon,+fp16,+fhm,+dotprod"]

View File

@@ -101,7 +101,7 @@ jobs:
path: |
nodejs/dist/*.node
node-linux:
node-linux-gnu:
name: vectordb (${{ matrix.config.arch}}-unknown-linux-gnu)
runs-on: ${{ matrix.config.runner }}
# Only runs on tags that matches the make-release action
@@ -137,11 +137,63 @@ jobs:
- name: Upload Linux Artifacts
uses: actions/upload-artifact@v4
with:
name: node-native-linux-${{ matrix.config.arch }}
name: node-native-linux-${{ matrix.config.arch }}-gnu
path: |
node/dist/lancedb-vectordb-linux*.tgz
nodejs-linux:
node-linux-musl:
name: vectordb (${{ matrix.config.arch}}-unknown-linux-musl)
runs-on: ubuntu-latest
container: alpine:edge
# Only runs on tags that matches the make-release action
if: startsWith(github.ref, 'refs/tags/v')
strategy:
fail-fast: false
matrix:
config:
- arch: x86_64
- arch: aarch64
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install common dependencies
run: |
apk add protobuf-dev curl clang mold grep npm bash
curl --proto '=https' --tlsv1.3 -sSf https://raw.githubusercontent.com/rust-lang/rustup/refs/heads/master/rustup-init.sh | sh -s -- -y --default-toolchain 1.80.0
echo "source $HOME/.cargo/env" >> saved_env
echo "export CC=clang" >> saved_env
echo "export RUSTFLAGS='-Ctarget-cpu=haswell -Ctarget-feature=-crt-static,+avx2,+fma,+f16c -Clinker=clang -Clink-arg=-fuse-ld=mold'" >> saved_env
- name: Configure aarch64 build
if: ${{ matrix.config.arch == 'aarch64' }}
run: |
source "$HOME/.cargo/env"
rustup target add aarch64-unknown-linux-musl --toolchain 1.80.0
crt=$(realpath $(dirname $(rustup which rustc))/../lib/rustlib/aarch64-unknown-linux-musl/lib/self-contained)
sysroot_lib=/usr/aarch64-unknown-linux-musl/usr/lib
apk_url=https://dl-cdn.alpinelinux.org/alpine/latest-stable/main/aarch64/
curl -sSf $apk_url > apk_list
for pkg in gcc libgcc musl; do curl -sSf $apk_url$(cat apk_list | grep -oP '(?<=")'$pkg'-\d.*?(?=")') | tar zxf -; done
mkdir -p $sysroot_lib
echo 'GROUP ( libgcc_s.so.1 -lgcc )' > $sysroot_lib/libgcc_s.so
cp usr/lib/libgcc_s.so.1 $sysroot_lib
cp usr/lib/gcc/aarch64-alpine-linux-musl/*/libgcc.a $sysroot_lib
cp lib/ld-musl-aarch64.so.1 $sysroot_lib/libc.so
echo '!<arch>' > $sysroot_lib/libdl.a
(cd $crt && cp crti.o crtbeginS.o crtendS.o crtn.o -t $sysroot_lib)
echo "export CARGO_BUILD_TARGET=aarch64-unknown-linux-musl" >> saved_env
echo "export RUSTFLAGS='-Ctarget-cpu=apple-m1 -Ctarget-feature=-crt-static,+neon,+fp16,+fhm,+dotprod -Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=--target=aarch64-unknown-linux-musl -Clink-arg=--sysroot=/usr/aarch64-unknown-linux-musl -Clink-arg=-lc'" >> saved_env
- name: Build Linux Artifacts
run: |
source ./saved_env
bash ci/manylinux_node/build_vectordb.sh ${{ matrix.config.arch }}
- name: Upload Linux Artifacts
uses: actions/upload-artifact@v4
with:
name: node-native-linux-${{ matrix.config.arch }}-musl
path: |
node/dist/lancedb-vectordb-linux*.tgz
nodejs-linux-gnu:
name: lancedb (${{ matrix.config.arch}}-unknown-linux-gnu
runs-on: ${{ matrix.config.runner }}
# Only runs on tags that matches the make-release action
@@ -178,7 +230,7 @@ jobs:
- name: Upload Linux Artifacts
uses: actions/upload-artifact@v4
with:
name: nodejs-native-linux-${{ matrix.config.arch }}
name: nodejs-native-linux-${{ matrix.config.arch }}-gnu
path: |
nodejs/dist/*.node
# The generic files are the same in all distros so we just pick
@@ -192,6 +244,62 @@ jobs:
nodejs/dist/*
!nodejs/dist/*.node
nodejs-linux-musl:
name: lancedb (${{ matrix.config.arch}}-unknown-linux-musl
runs-on: ubuntu-latest
container: alpine:edge
# Only runs on tags that matches the make-release action
if: startsWith(github.ref, 'refs/tags/v')
strategy:
fail-fast: false
matrix:
config:
- arch: x86_64
- arch: aarch64
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install common dependencies
run: |
apk add protobuf-dev curl clang mold grep npm bash openssl-dev openssl-libs-static
curl --proto '=https' --tlsv1.3 -sSf https://raw.githubusercontent.com/rust-lang/rustup/refs/heads/master/rustup-init.sh | sh -s -- -y --default-toolchain 1.80.0
echo "source $HOME/.cargo/env" >> saved_env
echo "export CC=clang" >> saved_env
echo "export RUSTFLAGS='-Ctarget-cpu=haswell -Ctarget-feature=-crt-static,+avx2,+fma,+f16c -Clinker=clang -Clink-arg=-fuse-ld=mold'" >> saved_env
echo "export X86_64_UNKNOWN_LINUX_MUSL_OPENSSL_INCLUDE_DIR=/usr/include" >> saved_env
echo "export X86_64_UNKNOWN_LINUX_MUSL_OPENSSL_LIB_DIR=/usr/lib" >> saved_env
- name: Configure aarch64 build
if: ${{ matrix.config.arch == 'aarch64' }}
run: |
source "$HOME/.cargo/env"
rustup target add aarch64-unknown-linux-musl --toolchain 1.80.0
crt=$(realpath $(dirname $(rustup which rustc))/../lib/rustlib/aarch64-unknown-linux-musl/lib/self-contained)
sysroot_lib=/usr/aarch64-unknown-linux-musl/usr/lib
apk_url=https://dl-cdn.alpinelinux.org/alpine/latest-stable/main/aarch64/
curl -sSf $apk_url > apk_list
for pkg in gcc libgcc musl openssl-dev openssl-libs-static; do curl -sSf $apk_url$(cat apk_list | grep -oP '(?<=")'$pkg'-\d.*?(?=")') | tar zxf -; done
mkdir -p $sysroot_lib
echo 'GROUP ( libgcc_s.so.1 -lgcc )' > $sysroot_lib/libgcc_s.so
cp usr/lib/libgcc_s.so.1 $sysroot_lib
cp usr/lib/gcc/aarch64-alpine-linux-musl/*/libgcc.a $sysroot_lib
cp lib/ld-musl-aarch64.so.1 $sysroot_lib/libc.so
echo '!<arch>' > $sysroot_lib/libdl.a
(cd $crt && cp crti.o crtbeginS.o crtendS.o crtn.o -t $sysroot_lib)
echo "export CARGO_BUILD_TARGET=aarch64-unknown-linux-musl" >> saved_env
echo "export RUSTFLAGS='-Ctarget-feature=-crt-static,+neon,+fp16,+fhm,+dotprod -Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=--target=aarch64-unknown-linux-musl -Clink-arg=--sysroot=/usr/aarch64-unknown-linux-musl -Clink-arg=-lc'" >> saved_env
echo "export AARCH64_UNKNOWN_LINUX_MUSL_OPENSSL_INCLUDE_DIR=$(realpath usr/include)" >> saved_env
echo "export AARCH64_UNKNOWN_LINUX_MUSL_OPENSSL_LIB_DIR=$(realpath usr/lib)" >> saved_env
- name: Build Linux Artifacts
run: |
source ./saved_env
bash ci/manylinux_node/build_lancedb.sh ${{ matrix.config.arch }}
- name: Upload Linux Artifacts
uses: actions/upload-artifact@v4
with:
name: nodejs-native-linux-${{ matrix.config.arch }}-musl
path: |
nodejs/dist/*.node
node-windows:
name: vectordb ${{ matrix.target }}
runs-on: windows-2022
@@ -460,7 +568,7 @@ jobs:
release:
name: vectordb NPM Publish
needs: [node, node-macos, node-linux, node-windows]
needs: [node, node-macos, node-linux-gnu, node-linux-musl, node-windows]
runs-on: ubuntu-latest
# Only runs on tags that matches the make-release action
if: startsWith(github.ref, 'refs/tags/v')
@@ -500,7 +608,7 @@ jobs:
release-nodejs:
name: lancedb NPM Publish
needs: [nodejs-macos, nodejs-linux, nodejs-windows]
needs: [nodejs-macos, nodejs-linux-gnu, nodejs-linux-musl, nodejs-windows]
runs-on: ubuntu-latest
# Only runs on tags that matches the make-release action
if: startsWith(github.ref, 'refs/tags/v')

View File

@@ -21,15 +21,15 @@ categories = ["database-implementations"]
rust-version = "1.80.0" # TODO: lower this once we upgrade Lance again.
[workspace.dependencies]
lance = { "version" = "=0.19.3", "features" = [
lance = { "version" = "=0.20.0", "features" = [
"dynamodb",
], git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
lance-index = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
lance-linalg = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
lance-table = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
lance-testing = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
lance-datafusion = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
lance-encoding = { version = "=0.19.3", git = "https://github.com/lancedb/lance.git", tag = "v0.19.3-beta.1" }
], git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
lance-index = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
lance-linalg = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
lance-table = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
lance-testing = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
lance-datafusion = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
lance-encoding = { version = "=0.20.0", git = "https://github.com/lancedb/lance.git", tag = "v0.20.0-beta.2" }
# Note that this one does not include pyarrow
arrow = { version = "52.2", optional = false }
arrow-array = "52.2"

View File

@@ -11,7 +11,8 @@ fi
export OPENSSL_STATIC=1
export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
source $HOME/.bashrc
#Alpine doesn't have .bashrc
FILE=$HOME/.bashrc && test -f $FILE && source $FILE
cd nodejs
npm ci

View File

@@ -5,13 +5,14 @@ ARCH=${1:-x86_64}
if [ "$ARCH" = "x86_64" ]; then
export OPENSSL_LIB_DIR=/usr/local/lib64/
else
else
export OPENSSL_LIB_DIR=/usr/local/lib/
fi
export OPENSSL_STATIC=1
export OPENSSL_INCLUDE_DIR=/usr/local/include/openssl
source $HOME/.bashrc
#Alpine doesn't have .bashrc
FILE=$HOME/.bashrc && test -f $FILE && source $FILE
cd node
npm ci

View File

@@ -138,6 +138,7 @@ nav:
- Jina Reranker: reranking/jina.md
- OpenAI Reranker: reranking/openai.md
- AnswerDotAi Rerankers: reranking/answerdotai.md
- Voyage AI Rerankers: reranking/voyageai.md
- Building Custom Rerankers: reranking/custom_reranker.md
- Example: notebooks/lancedb_reranking.ipynb
- Filtering: sql.md
@@ -165,6 +166,7 @@ nav:
- Jina Embeddings: embeddings/available_embedding_models/text_embedding_functions/jina_embedding.md
- AWS Bedrock Text Embedding Functions: embeddings/available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md
- IBM watsonx.ai Embeddings: embeddings/available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md
- Voyage AI Embeddings: embeddings/available_embedding_models/text_embedding_functions/voyageai_embedding.md
- Multimodal Embedding Functions:
- OpenClip embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
- Imagebind embeddings: embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md

View File

@@ -277,7 +277,15 @@ Product quantization can lead to approximately `16 * sizeof(float32) / 1 = 64` t
Higher number of partitions could lead to more efficient I/O during queries and better accuracy, but it takes much more time to train.
On `SIFT-1M` dataset, our benchmark shows that keeping each partition 1K-4K rows lead to a good latency / recall.
`num_sub_vectors` specifies how many Product Quantization (PQ) short codes to generate on each vector. Because
`num_sub_vectors` specifies how many Product Quantization (PQ) short codes to generate on each vector. The number should be a factor of the vector dimension. Because
PQ is a lossy compression of the original vector, a higher `num_sub_vectors` usually results in
less space distortion, and thus yields better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and
more PQ computation, and thus, higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency.
less space distortion, and thus yields better accuracy. However, a higher `num_sub_vectors` also causes heavier I/O and more PQ computation, and thus, higher latency. `dimension / num_sub_vectors` should be a multiple of 8 for optimum SIMD efficiency.
!!! note
if `num_sub_vectors` is set to be greater than the vector dimension, you will see errors like `attempt to divide by zero`
### How to choose `m` and `ef_construction` for `IVF_HNSW_*` index?
`m` determines the number of connections a new node establishes with its closest neighbors upon entering the graph. Typically, `m` falls within the range of 5 to 48. Lower `m` values are suitable for low-dimensional data or scenarios where recall is less critical. Conversely, higher `m` values are beneficial for high-dimensional data or when high recall is required. In essence, a larger `m` results in a denser graph with increased connectivity, but at the expense of higher memory consumption.
`ef_construction` balances build speed and accuracy. Higher values increase accuracy but slow down the build process. A typical range is 150 to 300. For good search results, a minimum value of 100 is recommended. In most cases, setting this value above 500 offers no additional benefit. Ensure that `ef_construction` is always set to a value equal to or greater than `ef` in the search phase

View File

@@ -57,6 +57,13 @@ Then the greedy search routine operates as follows:
## Usage
There are three key parameters to set when constructing an HNSW index:
* `metric`: Use an `L2` euclidean distance metric. We also support `dot` and `cosine` distance.
* `m`: The number of neighbors to select for each vector in the HNSW graph.
* `ef_construction`: The number of candidates to evaluate during the construction of the HNSW graph.
We can combine the above concepts to understand how to build and query an HNSW index in LanceDB.
### Construct index

View File

@@ -58,8 +58,10 @@ In Python, the index can be created as follows:
# Make sure you have enough data in the table for an effective training step
tbl.create_index(metric="L2", num_partitions=256, num_sub_vectors=96)
```
!!! note
`num_partitions`=256 and `num_sub_vectors`=96 does not work for every dataset. Those values needs to be adjusted for your particular dataset.
The `num_partitions` is usually chosen to target a particular number of vectors per partition. `num_sub_vectors` is typically chosen based on the desired recall and the dimensionality of the vector. See the [FAQs](#faq) below for best practices on choosing these parameters.
The `num_partitions` is usually chosen to target a particular number of vectors per partition. `num_sub_vectors` is typically chosen based on the desired recall and the dimensionality of the vector. See [here](../ann_indexes.md/#how-to-choose-num_partitions-and-num_sub_vectors-for-ivf_pq-index) for best practices on choosing these parameters.
### Query the index

View File

@@ -114,12 +114,45 @@ table.create_fts_index("text",
LanceDB full text search supports to filter the search results by a condition, both pre-filtering and post-filtering are supported.
This can be invoked via the familiar `where` syntax:
This can be invoked via the familiar `where` syntax.
With pre-filtering:
=== "Python"
```python
table.search("puppy").limit(10).where("meta='foo'").to_list()
table.search("puppy").limit(10).where("meta='foo'", prefilte=True).to_list()
```
=== "TypeScript"
```typescript
await tbl
.search("puppy")
.select(["id", "doc"])
.limit(10)
.where("meta='foo'")
.prefilter(true)
.toArray();
```
=== "Rust"
```rust
table
.query()
.full_text_search(FullTextSearchQuery::new("puppy".to_owned()))
.select(lancedb::query::Select::Columns(vec!["doc".to_owned()]))
.limit(10)
.only_if("meta='foo'")
.execute()
.await?;
```
With post-filtering:
=== "Python"
```python
table.search("puppy").limit(10).where("meta='foo'", prefilte=False).to_list()
```
=== "TypeScript"
@@ -130,6 +163,7 @@ This can be invoked via the familiar `where` syntax:
.select(["id", "doc"])
.limit(10)
.where("meta='foo'")
.prefilter(false)
.toArray();
```
@@ -140,6 +174,7 @@ This can be invoked via the familiar `where` syntax:
.query()
.full_text_search(FullTextSearchQuery::new(words[0].to_owned()))
.select(lancedb::query::Select::Columns(vec!["doc".to_owned()]))
.postfilter()
.limit(10)
.only_if("meta='foo'")
.execute()
@@ -189,3 +224,6 @@ This can make the query more efficient, especially when the table is large and t
tbl.add(more_data).execute().await?;
tbl.optimize(OptimizeAction::All).execute().await?;
```
!!! note
New data added after creating the FTS index will appear in search results while incremental index is still progress, but with increased latency due to a flat search on the unindexed portion. LanceDB Cloud automates this merging process, minimizing the impact on search speed.

View File

@@ -153,9 +153,7 @@ table.create_fts_index(["title", "content"], use_tantivy=True, writer_heap_size=
## Current limitations
1. Currently we do not yet support incremental writes.
If you add data after FTS index creation, it won't be reflected
in search results until you do a full reindex.
1. New data added after creating the FTS index will appear in search results, but with increased latency due to a flat search on the unindexed portion. Re-indexing with `create_fts_index` will reduce latency. LanceDB Cloud automates this merging process, minimizing the impact on search speed.
2. We currently only support local filesystem paths for the FTS index.
This is a tantivy limitation. We've implemented an object store plugin

View File

@@ -6,6 +6,9 @@ This re-ranker uses the [Cohere](https://cohere.ai/) API to rerank the search re
!!! note
Supported Query Types: Hybrid, Vector, FTS
```shell
pip install cohere
```
```python
import numpy

View File

@@ -89,10 +89,12 @@
}
},
"optionalDependencies": {
"@lancedb/vectordb-darwin-arm64": "0.13.0",
"@lancedb/vectordb-darwin-x64": "0.13.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0",
"@lancedb/vectordb-darwin-arm64": "0.13.0",
"@lancedb/vectordb-linux-x64-gnu": "0.13.0",
"@lancedb/vectordb-linux-arm64-gnu": "0.13.0",
"@lancedb/vectordb-linux-x64-musl": "0.13.0",
"@lancedb/vectordb-linux-arm64-musl": "0.13.0",
"@lancedb/vectordb-win32-x64-msvc": "0.13.0",
"@lancedb/vectordb-win32-arm64-msvc": "0.13.0"
}

View File

@@ -87,6 +87,12 @@ export interface OptimizeOptions {
deleteUnverified: boolean;
}
export interface Version {
version: number;
timestamp: Date;
metadata: Record<string, string>;
}
/**
* A Table is a collection of Records in a LanceDB Database.
*
@@ -360,6 +366,11 @@ export abstract class Table {
*/
abstract checkoutLatest(): Promise<void>;
/**
* List all the versions of the table
*/
abstract listVersions(): Promise<Version[]>;
/**
* Restore the table to the currently checked out version
*
@@ -659,6 +670,14 @@ export class LocalTable extends Table {
await this.inner.checkoutLatest();
}
async listVersions(): Promise<Version[]> {
return (await this.inner.listVersions()).map((version) => ({
version: version.version,
timestamp: new Date(version.timestamp / 1000),
metadata: version.metadata,
}));
}
async restore(): Promise<void> {
await this.inner.restore();
}

View File

@@ -0,0 +1,3 @@
# `@lancedb/lancedb-linux-arm64-musl`
This is the **aarch64-unknown-linux-musl** binary for `@lancedb/lancedb`

View File

@@ -0,0 +1,13 @@
{
"name": "@lancedb/lancedb-linux-arm64-musl",
"version": "0.13.0",
"os": ["linux"],
"cpu": ["arm64"],
"main": "lancedb.linux-arm64-musl.node",
"files": ["lancedb.linux-arm64-musl.node"],
"license": "Apache 2.0",
"engines": {
"node": ">= 18"
},
"libc": ["musl"]
}

View File

@@ -0,0 +1,3 @@
# `@lancedb/lancedb-linux-x64-musl`
This is the **x86_64-unknown-linux-musl** binary for `@lancedb/lancedb`

View File

@@ -0,0 +1,13 @@
{
"name": "@lancedb/lancedb-linux-x64-musl",
"version": "0.13.0",
"os": ["linux"],
"cpu": ["x64"],
"main": "lancedb.linux-x64-musl.node",
"files": ["lancedb.linux-x64-musl.node"],
"license": "Apache 2.0",
"engines": {
"node": ">= 18"
},
"libc": ["musl"]
}

View File

@@ -24,10 +24,12 @@
"triples": {
"defaults": false,
"additional": [
"aarch64-apple-darwin",
"aarch64-unknown-linux-gnu",
"x86_64-apple-darwin",
"aarch64-apple-darwin",
"x86_64-unknown-linux-gnu",
"aarch64-unknown-linux-gnu",
"x86_64-unknown-linux-musl",
"aarch64-unknown-linux-musl",
"x86_64-pc-windows-msvc"
]
}

View File

@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use arrow_ipc::writer::FileWriter;
use lancedb::ipc::ipc_file_to_batches;
use lancedb::table::{
@@ -226,6 +228,28 @@ impl Table {
self.inner_ref()?.checkout_latest().await.default_error()
}
#[napi(catch_unwind)]
pub async fn list_versions(&self) -> napi::Result<Vec<Version>> {
self.inner_ref()?
.list_versions()
.await
.map(|versions| {
versions
.iter()
.map(|version| Version {
version: version.version as i64,
timestamp: version.timestamp.timestamp_micros(),
metadata: version
.metadata
.iter()
.map(|(k, v)| (k.clone(), v.clone()))
.collect(),
})
.collect()
})
.default_error()
}
#[napi(catch_unwind)]
pub async fn restore(&self) -> napi::Result<()> {
self.inner_ref()?.restore().await.default_error()
@@ -466,3 +490,10 @@ impl From<lancedb::index::IndexStatistics> for IndexStatistics {
}
}
}
#[napi(object)]
pub struct Version {
pub version: i64,
pub timestamp: i64,
pub metadata: HashMap<String, String>,
}

View File

@@ -1,5 +1,5 @@
[tool.bumpversion]
current_version = "0.16.0"
current_version = "0.16.1-beta.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.

View File

@@ -1,6 +1,6 @@
[package]
name = "lancedb-python"
version = "0.16.0"
version = "0.16.1-beta.0"
edition.workspace = true
description = "Python bindings for LanceDB"
license.workspace = true

View File

@@ -4,7 +4,7 @@ name = "lancedb"
dependencies = [
"deprecation",
"nest-asyncio~=1.0",
"pylance==0.19.3b1",
"pylance==0.20.0b2",
"tqdm>=4.27.0",
"pydantic>=1.10",
"packaging",

View File

@@ -370,11 +370,13 @@ class LanceQueryBuilder(ABC):
----------
limit: int
The maximum number of results to return.
By default the query is limited to the first 10.
Call this method and pass 0, a negative value,
or None to remove the limit.
*WARNING* if you have a large dataset, removing
the limit can potentially result in reading a
The default query limit is 10 results.
For ANN/KNN queries, you must specify a limit.
Entering 0, a negative number, or None will reset
the limit to the default value of 10.
*WARNING* if you have a large dataset, setting
the limit to a large number, e.g. the table size,
can potentially result in reading a
large amount of data into memory and cause
out of memory issues.

View File

@@ -78,6 +78,10 @@ class RemoteTable(Table):
self.schema.metadata
)
def list_versions(self):
"""List all versions of the table"""
return self._loop.run_until_complete(self._table.list_versions())
def to_arrow(self) -> pa.Table:
"""to_arrow() is not yet supported on LanceDB cloud."""
raise NotImplementedError("to_arrow() is not yet supported on LanceDB cloud.")

View File

@@ -41,7 +41,7 @@ class CohereReranker(Reranker):
def __init__(
self,
model_name: str = "rerank-english-v2.0",
model_name: str = "rerank-english-v3.0",
column: str = "text",
top_n: Union[int, None] = None,
return_score="relevance",

View File

@@ -8,7 +8,7 @@ import inspect
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import timedelta
from datetime import datetime, timedelta
from functools import cached_property
from typing import (
TYPE_CHECKING,
@@ -1015,15 +1015,36 @@ class Table(ABC):
@abstractmethod
def checkout(self):
"""
TODO comments
Checks out a specific version of the Table
Any read operation on the table will now access the data at the checked out
version. As a consequence, calling this method will disable any read consistency
interval that was previously set.
This is a read-only operation that turns the table into a sort of "view"
or "detached head". Other table instances will not be affected. To make the
change permanent you can use the `[Self::restore]` method.
Any operation that modifies the table will fail while the table is in a checked
out state.
To return the table to a normal state use `[Self::checkout_latest]`
"""
@abstractmethod
def checkout_latest(self):
"""
TODO comments
Ensures the table is pointing at the latest version
This can be used to manually update a table when the read_consistency_interval
is None
It can also be used to undo a `[Self::checkout]` operation
"""
@abstractmethod
def list_versions(self):
"""List all versions of the table"""
@cached_property
def _dataset_uri(self) -> str:
return _table_uri(self._conn.uri, self.name)
@@ -2914,6 +2935,19 @@ class AsyncTable:
"""
return await self._inner.version()
async def list_versions(self):
"""
List all versions of the table
"""
versions = await self._inner.list_versions()
for v in versions:
ts_nanos = v["timestamp"]
v["timestamp"] = datetime.fromtimestamp(ts_nanos // 1e9) + timedelta(
microseconds=(ts_nanos % 1e9) // 1e3
)
return versions
async def checkout(self, version):
"""
Checks out a specific version of the Table

View File

@@ -103,6 +103,47 @@ async def test_async_remote_db():
assert table_names == []
@pytest.mark.asyncio
async def test_async_checkout():
def handler(request):
if request.path == "/v1/table/test/describe/":
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
response = json.dumps({"version": 42, "schema": {"fields": []}})
request.wfile.write(response.encode())
return
content_len = int(request.headers.get("Content-Length"))
body = request.rfile.read(content_len)
body = json.loads(body)
print("body is", body)
count = 0
if body["version"] == 1:
count = 100
elif body["version"] == 2:
count = 200
elif body["version"] is None:
count = 300
request.send_response(200)
request.send_header("Content-Type", "application/json")
request.end_headers()
request.wfile.write(json.dumps(count).encode())
async with mock_lancedb_connection_async(handler) as db:
table = await db.open_table("test")
assert await table.count_rows() == 300
await table.checkout(1)
assert await table.count_rows() == 100
await table.checkout(2)
assert await table.count_rows() == 200
await table.checkout_latest()
assert await table.count_rows() == 300
@pytest.mark.asyncio
async def test_http_error():
request_id_holder = {"request_id": None}
@@ -188,6 +229,7 @@ def test_query_sync_minimal():
"ef": None,
"vector": [1.0, 2.0, 3.0],
"nprobes": 20,
"version": None,
}
return pa.table({"id": [1, 2, 3]})
@@ -205,6 +247,7 @@ def test_query_sync_empty_query():
"filter": "true",
"vector": [],
"columns": ["id"],
"version": None,
}
return pa.table({"id": [1, 2, 3]})
@@ -230,6 +273,7 @@ def test_query_sync_maximal():
"vector_column": "vector2",
"fast_search": True,
"with_row_id": True,
"version": None,
}
return pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})
@@ -268,6 +312,7 @@ def test_query_sync_fts():
},
"k": 10,
"vector": [],
"version": None,
}
return pa.table({"id": [1, 2, 3]})
@@ -284,6 +329,7 @@ def test_query_sync_fts():
"k": 42,
"vector": [],
"with_row_id": True,
"version": None,
}
return pa.table({"id": [1, 2, 3]})
@@ -309,6 +355,7 @@ def test_query_sync_hybrid():
"k": 42,
"vector": [],
"with_row_id": True,
"version": None,
}
return pa.table({"_rowid": [1, 2, 3], "_score": [0.1, 0.2, 0.3]})
else:
@@ -322,6 +369,7 @@ def test_query_sync_hybrid():
"nprobes": 20,
"ef": None,
"with_row_id": True,
"version": None,
}
return pa.table({"_rowid": [1, 2, 3], "_distance": [0.1, 0.2, 0.3]})

View File

@@ -8,7 +8,7 @@ use lancedb::table::{
use pyo3::{
exceptions::{PyRuntimeError, PyValueError},
pyclass, pymethods,
types::{PyDict, PyDictMethods, PyString},
types::{IntoPyDict, PyDict, PyDictMethods, PyString},
Bound, FromPyObject, PyAny, PyRef, PyResult, Python, ToPyObject,
};
use pyo3_asyncio_0_21::tokio::future_into_py;
@@ -246,6 +246,33 @@ impl Table {
)
}
pub fn list_versions(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
let inner = self_.inner_ref()?.clone();
future_into_py(self_.py(), async move {
let versions = inner.list_versions().await.infer_error()?;
let versions_as_dict = Python::with_gil(|py| {
versions
.iter()
.map(|v| {
let dict = PyDict::new_bound(py);
dict.set_item("version", v.version).unwrap();
dict.set_item(
"timestamp",
v.timestamp.timestamp_nanos_opt().unwrap_or_default(),
)
.unwrap();
let tup: Vec<(&String, &String)> = v.metadata.iter().collect();
dict.set_item("metadata", tup.into_py_dict(py)).unwrap();
dict.to_object(py)
})
.collect::<Vec<_>>()
});
Ok(versions_as_dict)
})
}
pub fn checkout(self_: PyRef<'_, Self>, version: u64) -> PyResult<Bound<'_, PyAny>> {
let inner = self_.inner_ref()?.clone();
future_into_py(self_.py(), async move {

View File

@@ -19,7 +19,7 @@ use http::header::CONTENT_TYPE;
use http::StatusCode;
use lance::arrow::json::JsonSchema;
use lance::dataset::scanner::DatasetRecordBatchStream;
use lance::dataset::{ColumnAlteration, NewColumnTransform};
use lance::dataset::{ColumnAlteration, NewColumnTransform, Version};
use lance_datafusion::exec::OneShotExec;
use serde::{Deserialize, Serialize};
use tokio::sync::RwLock;
@@ -363,6 +363,34 @@ impl<S: HttpSend> TableInternal for RemoteTable<S> {
message: "restore is not supported on LanceDB cloud.".into(),
})
}
async fn list_versions(&self) -> Result<Vec<Version>> {
let request = self
.client
.post(&format!("/v1/table/{}/version/list/", self.name));
let (request_id, response) = self.client.send(request, true).await?;
let response = self.check_table_response(&request_id, response).await?;
#[derive(Deserialize)]
struct ListVersionsResponse {
versions: Vec<Version>,
}
let body = response.text().await.err_to_http(request_id.clone())?;
let body: ListVersionsResponse =
serde_json::from_str(&body).map_err(|err| Error::Http {
source: format!(
"Failed to parse list_versions response: {}, body: {}",
err, body
)
.into(),
request_id,
status_code: None,
})?;
Ok(body.versions)
}
async fn schema(&self) -> Result<SchemaRef> {
let schema = self.describe().await?.schema;
Ok(Arc::new(schema.try_into()?))
@@ -775,6 +803,7 @@ mod tests {
use arrow::{array::AsArray, compute::concat_batches, datatypes::Int32Type};
use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator};
use arrow_schema::{DataType, Field, Schema};
use chrono::{DateTime, Utc};
use futures::{future::BoxFuture, StreamExt, TryFutureExt};
use lance_index::scalar::FullTextSearchQuery;
use reqwest::Body;
@@ -1489,6 +1518,51 @@ mod tests {
assert_eq!(indices, expected);
}
#[tokio::test]
async fn test_list_versions() {
let table = Table::new_with_handler("my_table", |request| {
assert_eq!(request.method(), "POST");
assert_eq!(request.url().path(), "/v1/table/my_table/version/list/");
let version1 = lance::dataset::Version {
version: 1,
timestamp: "2024-01-01T00:00:00Z".parse().unwrap(),
metadata: Default::default(),
};
let version2 = lance::dataset::Version {
version: 2,
timestamp: "2024-02-01T00:00:00Z".parse().unwrap(),
metadata: Default::default(),
};
let response_body = serde_json::json!({
"versions": [
version1,
version2,
]
});
let response_body = serde_json::to_string(&response_body).unwrap();
http::Response::builder()
.status(200)
.body(response_body)
.unwrap()
});
let versions = table.list_versions().await.unwrap();
assert_eq!(versions.len(), 2);
assert_eq!(versions[0].version, 1);
assert_eq!(
versions[0].timestamp,
"2024-01-01T00:00:00Z".parse::<DateTime<Utc>>().unwrap()
);
assert_eq!(versions[1].version, 2);
assert_eq!(
versions[1].timestamp,
"2024-02-01T00:00:00Z".parse::<DateTime<Utc>>().unwrap()
);
// assert_eq!(versions, expected);
}
#[tokio::test]
async fn test_index_stats() {
let table = Table::new_with_handler("my_table", |request| {

View File

@@ -37,7 +37,7 @@ pub use lance::dataset::ColumnAlteration;
pub use lance::dataset::NewColumnTransform;
pub use lance::dataset::ReadParams;
use lance::dataset::{
Dataset, UpdateBuilder as LanceUpdateBuilder, WhenMatched, WriteMode, WriteParams,
Dataset, UpdateBuilder as LanceUpdateBuilder, Version, WhenMatched, WriteMode, WriteParams,
};
use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatchedBySource};
use lance::io::WrappingObjectStore;
@@ -426,6 +426,7 @@ pub(crate) trait TableInternal: std::fmt::Display + std::fmt::Debug + Send + Syn
async fn checkout(&self, version: u64) -> Result<()>;
async fn checkout_latest(&self) -> Result<()>;
async fn restore(&self) -> Result<()>;
async fn list_versions(&self) -> Result<Vec<Version>>;
async fn table_definition(&self) -> Result<TableDefinition>;
fn dataset_uri(&self) -> &str;
}
@@ -955,6 +956,11 @@ impl Table {
self.inner.restore().await
}
/// List all the versions of the table
pub async fn list_versions(&self) -> Result<Vec<Version>> {
self.inner.list_versions().await
}
/// List all indices that have been created with [`Self::create_index`]
pub async fn list_indices(&self) -> Result<Vec<IndexConfig>> {
self.inner.list_indices().await
@@ -1319,7 +1325,7 @@ impl NativeTable {
let (indices, mf) = futures::try_join!(dataset.load_indices(), dataset.latest_manifest())?;
Ok(indices
.iter()
.map(|i| VectorIndex::new_from_format(&mf, i))
.map(|i| VectorIndex::new_from_format(&(mf.0), i))
.collect())
}
@@ -1707,6 +1713,10 @@ impl TableInternal for NativeTable {
self.dataset.reload().await
}
async fn list_versions(&self) -> Result<Vec<Version>> {
Ok(self.dataset.get().await?.versions().await?)
}
async fn restore(&self) -> Result<()> {
let version =
self.dataset