Mirror of https://github.com/lancedb/lancedb.git (synced 2025-12-23 21:39:57 +00:00)

Compare commits: rmeng/patc... → python-v0... (16 commits)
| SHA1 |
|---|
| 43f920182a |
| 718963d1fb |
| e4dac751e7 |
| aae02953eb |
| 1d9f76bdda |
| affdfc4d48 |
| 41b77f5e25 |
| eb8b3b8c54 |
| f69c3e0595 |
| 8511edaaab |
| 657aba3c05 |
| 2e197ef387 |
| 4f512af024 |
| 5349e8b1db |
| 5e01810438 |
| 6eaaee59f8 |
.bumpversion.cfg (deleted, 22 lines)

@@ -1,22 +0,0 @@
-[bumpversion]
-current_version = 0.4.20
-commit = True
-message = Bump version: {current_version} → {new_version}
-tag = True
-tag_name = v{new_version}
-
-[bumpversion:file:node/package.json]
-
-[bumpversion:file:nodejs/package.json]
-
-[bumpversion:file:nodejs/npm/darwin-x64/package.json]
-
-[bumpversion:file:nodejs/npm/darwin-arm64/package.json]
-
-[bumpversion:file:nodejs/npm/linux-x64-gnu/package.json]
-
-[bumpversion:file:nodejs/npm/linux-arm64-gnu/package.json]
-
-[bumpversion:file:rust/ffi/node/Cargo.toml]
-
-[bumpversion:file:rust/lancedb/Cargo.toml]
.bumpversion.toml (new file, 57 lines)

@@ -0,0 +1,57 @@
+[tool.bumpversion]
+current_version = "0.5.0"
+parse = """(?x)
+    (?P<major>0|[1-9]\\d*)\\.
+    (?P<minor>0|[1-9]\\d*)\\.
+    (?P<patch>0|[1-9]\\d*)
+    (?:-(?P<pre_l>[a-zA-Z-]+)\\.(?P<pre_n>0|[1-9]\\d*))?
+"""
+serialize = [
+    "{major}.{minor}.{patch}-{pre_l}.{pre_n}",
+    "{major}.{minor}.{patch}",
+]
+search = "{current_version}"
+replace = "{new_version}"
+regex = false
+ignore_missing_version = false
+ignore_missing_files = false
+tag = true
+sign_tags = false
+tag_name = "v{new_version}"
+tag_message = "Bump version: {current_version} → {new_version}"
+allow_dirty = true
+commit = true
+message = "Bump version: {current_version} → {new_version}"
+commit_args = ""
+
+[tool.bumpversion.parts.pre_l]
+values = ["beta", "final"]
+optional_value = "final"
+
+[[tool.bumpversion.files]]
+filename = "node/package.json"
+search = "\"version\": \"{current_version}\","
+replace = "\"version\": \"{new_version}\","
+
+[[tool.bumpversion.files]]
+filename = "nodejs/package.json"
+search = "\"version\": \"{current_version}\","
+replace = "\"version\": \"{new_version}\","
+
+# nodejs binary packages
+[[tool.bumpversion.files]]
+glob = "nodejs/npm/*/package.json"
+search = "\"version\": \"{current_version}\","
+replace = "\"version\": \"{new_version}\","
+
+# Cargo files
+# ------------
+[[tool.bumpversion.files]]
+filename = "rust/ffi/node/Cargo.toml"
+search = "\nversion = \"{current_version}\""
+replace = "\nversion = \"{new_version}\""
+
+[[tool.bumpversion.files]]
+filename = "rust/lancedb/Cargo.toml"
+search = "\nversion = \"{current_version}\""
+replace = "\nversion = \"{new_version}\""
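The `parse`/`serialize` pair above defines the version grammar the rest of the release tooling depends on: stable `X.Y.Z` and pre-release `X.Y.Z-beta.N`. A minimal Python sketch of that grammar, for intuition only (the regex is copied from the config; the `serialize` helper is our paraphrase of the two templates, not repo code):

```python
import re

# Same pattern as the `parse` key above (verbose mode, unescaped for Python).
VERSION_RE = re.compile(
    r"""(?x)
    (?P<major>0|[1-9]\d*)\.
    (?P<minor>0|[1-9]\d*)\.
    (?P<patch>0|[1-9]\d*)
    (?:-(?P<pre_l>[a-zA-Z-]+)\.(?P<pre_n>0|[1-9]\d*))?
    """
)

def serialize(parts: dict) -> str:
    # Mirrors the two `serialize` templates: the pre-release form is used
    # only when both pre-release fields matched.
    if parts.get("pre_l") and parts.get("pre_n") is not None:
        return "{major}.{minor}.{patch}-{pre_l}.{pre_n}".format(**parts)
    return "{major}.{minor}.{patch}".format(**parts)

for v in ["0.5.0", "0.5.1-beta.2"]:
    parts = VERSION_RE.match(v).groupdict()
    assert serialize(parts) == v  # both forms round-trip
```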
.github/release.yml (deleted, 25 lines)

@@ -1,25 +0,0 @@
-# TODO: create separate templates for Python and other releases.
-changelog:
-  exclude:
-    labels:
-      - ci
-      - chore
-  categories:
-    - title: Breaking Changes 🛠
-      labels:
-        - breaking-change
-    - title: New Features 🎉
-      labels:
-        - enhancement
-    - title: Bug Fixes 🐛
-      labels:
-        - bug
-    - title: Documentation 📚
-      labels:
-        - documentation
-    - title: Performance Improvements 🚀
-      labels:
-        - performance
-    - title: Other Changes
-      labels:
-        - "*"
.github/release_notes.json (new file, 41 lines)

@@ -0,0 +1,41 @@
+{
+  "ignore_labels": ["chore"],
+  "pr_template": "- ${{TITLE}} by @${{AUTHOR}} in ${{URL}}",
+  "categories": [
+    {
+      "title": "## 🏆 Highlights",
+      "labels": ["highlight"]
+    },
+    {
+      "title": "## 🛠 Breaking Changes",
+      "labels": ["breaking-change"]
+    },
+    {
+      "title": "## ⚠️ Deprecations",
+      "labels": ["deprecation"]
+    },
+    {
+      "title": "## 🎉 New Features",
+      "labels": ["enhancement"]
+    },
+    {
+      "title": "## 🐛 Bug Fixes",
+      "labels": ["bug"]
+    },
+    {
+      "title": "## 📚 Documentation",
+      "labels": ["documentation"]
+    },
+    {
+      "title": "## 🚀 Performance Improvements",
+      "labels": ["performance"]
+    },
+    {
+      "title": "## Other Changes"
+    },
+    {
+      "title": "## 🔧 Build and CI",
+      "labels": ["ci"]
+    }
+  ]
+}
.github/workflows/cargo-publish.yml (8 changed lines)

@@ -1,8 +1,12 @@
 name: Cargo Publish
 
 on:
-  release:
-    types: [ published ]
+  push:
+    tags-ignore:
+      # We don't publish pre-releases for Rust. Crates.io is just a source
+      # distribution, so we don't need to publish pre-releases.
+      - 'v*-beta*'
+      - '*-v*' # for example, python-vX.Y.Z
 
 env:
   # This env var is used by Swatinem/rust-cache@v2 for the cache
.github/workflows/make-release-commit.yml (86 changed lines)

@@ -1,37 +1,62 @@
 name: Create release commit
 
+# This workflow increments versions, tags the version, and pushes it.
+# When a tag is pushed, another workflow is triggered that creates a GH release
+# and uploads the binaries. This workflow is only for creating the tag.
+
+# This script will enforce that a minor version is incremented if there are any
+# breaking changes since the last minor increment. However, it isn't able to
+# differentiate between breaking changes in Node versus Python. If you wish to
+# bypass this check, you can manually increment the version and push the tag.
 on:
   workflow_dispatch:
     inputs:
       dry_run:
         description: 'Dry run (create the local commit/tags but do not push it)'
         required: true
-        default: "false"
-        type: choice
-        options:
-          - "true"
-          - "false"
-      part:
+        default: false
+        type: boolean
+      type:
         description: 'What kind of release is this?'
         required: true
-        default: 'patch'
+        default: 'preview'
         type: choice
         options:
-          - patch
-          - minor
-          - major
+          - preview
+          - stable
+      python:
+        description: 'Make a Python release'
+        required: true
+        default: true
+        type: boolean
+      other:
+        description: 'Make a Node/Rust release'
+        required: true
+        default: true
+        type: boolean
+      bump-minor:
+        description: 'Bump minor version'
+        required: true
+        default: false
+        type: boolean
 
 jobs:
-  bump-version:
+  make-release:
+    # Creates tag and GH release. The GH release will trigger the build and release jobs.
     runs-on: ubuntu-latest
+    permissions:
+      contents: write
     steps:
-      - name: Check out main
-        uses: actions/checkout@v4
+      - name: Output Inputs
+        run: echo "${{ toJSON(github.event.inputs) }}"
+      - uses: actions/checkout@v4
        with:
-          ref: main
-          persist-credentials: false
          fetch-depth: 0
          lfs: true
+          # It's important we use our token here, as the default token will NOT
+          # trigger any workflows watching for new tags. See:
+          # https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow
+          token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
      - name: Set git configs for bumpversion
        shell: bash
        run: |
@@ -41,19 +66,34 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
-      - name: Bump version, create tag and commit
+      - name: Bump Python version
+        if: ${{ inputs.python }}
+        working-directory: python
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
-          pip install bump2version
-          bumpversion --verbose ${{ inputs.part }}
-      - name: Push new version and tag
-        if: ${{ inputs.dry_run }} == "false"
+          # Need to get the commit before bumping the version, so we can
+          # determine if there are breaking changes in the next step as well.
+          echo "COMMIT_BEFORE_BUMP=$(git rev-parse HEAD)" >> $GITHUB_ENV
+
+          pip install bump-my-version PyGithub packaging
+          bash ../ci/bump_version.sh ${{ inputs.type }} ${{ inputs.bump-minor }} python-v
+      - name: Bump Node/Rust version
+        if: ${{ inputs.other }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          pip install bump-my-version PyGithub packaging
+          bash ci/bump_version.sh ${{ inputs.type }} ${{ inputs.bump-minor }} v $COMMIT_BEFORE_BUMP
+      - name: Push new version tag
+        if: ${{ !inputs.dry_run }}
        uses: ad-m/github-push-action@master
        with:
+          # Need to use PAT here too to trigger next workflow. See comment above.
          github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
-          branch: main
+          branch: ${{ github.ref }}
          tags: true
      - uses: ./.github/workflows/update_package_lock
        if: ${{ inputs.dry_run }} == "false"
        with:
-          github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
+          github_token: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/npm-publish.yml (99 changed lines)

@@ -1,8 +1,9 @@
 name: NPM Publish
 
 on:
-  release:
-    types: [published]
+  push:
+    tags:
+      - 'v*'
 
 jobs:
   node:
@@ -274,9 +275,15 @@
      env:
        NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
      run: |
+        # Tag beta as "preview" instead of default "latest". See lancedb
+        # npm publish step for more info.
+        if [[ $GITHUB_REF =~ refs/tags/v(.*)-beta.* ]]; then
+          PUBLISH_ARGS="--tag preview"
+        fi
+
        mv */*.tgz .
        for filename in *.tgz; do
-          npm publish $filename
+          npm publish $PUBLISH_ARGS $filename
        done
 
  release-nodejs:
@@ -316,11 +323,23 @@
      - name: Publish to NPM
        env:
          NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
-        run: npm publish --access public
+        # By default, things are published to the latest tag. This is what is
+        # installed by default if the user does not specify a version. This is
+        # good for stable releases, but for pre-releases, we want to publish to
+        # the "preview" tag so they can install with `npm install lancedb@preview`.
+        # See: https://medium.com/@mbostock/prereleases-and-npm-e778fc5e2420
+        run: |
+          if [[ $GITHUB_REF =~ refs/tags/v(.*)-beta.* ]]; then
+            npm publish --access public --tag preview
+          else
+            npm publish --access public
+          fi
 
  update-package-lock:
    needs: [release]
    runs-on: ubuntu-latest
+    permissions:
+      contents: write
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -331,11 +350,13 @@
          lfs: true
      - uses: ./.github/workflows/update_package_lock
        with:
-          github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
+          github_token: ${{ secrets.GITHUB_TOKEN }}
 
  update-package-lock-nodejs:
    needs: [release-nodejs]
    runs-on: ubuntu-latest
+    permissions:
+      contents: write
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -346,4 +367,70 @@
          lfs: true
      - uses: ./.github/workflows/update_package_lock_nodejs
        with:
-          github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+
+  gh-release:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          lfs: true
+      - name: Extract version
+        id: extract_version
+        env:
+          GITHUB_REF: ${{ github.ref }}
+        run: |
+          set -e
+          echo "Extracting tag and version from $GITHUB_REF"
+          if [[ $GITHUB_REF =~ refs/tags/v(.*) ]]; then
+            VERSION=${BASH_REMATCH[1]}
+            TAG=v$VERSION
+            echo "tag=$TAG" >> $GITHUB_OUTPUT
+            echo "version=$VERSION" >> $GITHUB_OUTPUT
+          else
+            echo "Failed to extract version from $GITHUB_REF"
+            exit 1
+          fi
+          echo "Extracted version $VERSION from $GITHUB_REF"
+          if [[ $VERSION =~ beta ]]; then
+            echo "This is a beta release"
+
+            # Get last release (that is not this one)
+            FROM_TAG=$(git tag --sort='version:refname' \
+              | grep ^v \
+              | grep -vF "$TAG" \
+              | python ci/semver_sort.py v \
+              | tail -n 1)
+          else
+            echo "This is a stable release"
+            # Get last stable tag (ignore betas)
+            FROM_TAG=$(git tag --sort='version:refname' \
+              | grep ^v \
+              | grep -vF "$TAG" \
+              | grep -v beta \
+              | python ci/semver_sort.py v \
+              | tail -n 1)
+          fi
+          echo "Found from tag $FROM_TAG"
+          echo "from_tag=$FROM_TAG" >> $GITHUB_OUTPUT
+      - name: Create Release Notes
+        id: release_notes
+        uses: mikepenz/release-changelog-builder-action@v4
+        with:
+          configuration: .github/release_notes.json
+          toTag: ${{ steps.extract_version.outputs.tag }}
+          fromTag: ${{ steps.extract_version.outputs.from_tag }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Create GH release
+        uses: softprops/action-gh-release@v2
+        with:
+          prerelease: ${{ contains(github.ref, 'beta') }}
+          tag_name: ${{ steps.extract_version.outputs.tag }}
+          token: ${{ secrets.GITHUB_TOKEN }}
+          generate_release_notes: false
+          name: Node/Rust LanceDB v${{ steps.extract_version.outputs.version }}
+          body: ${{ steps.release_notes.outputs.changelog }}
.github/workflows/pypi-publish.yml (107 changed lines)

@@ -1,18 +1,16 @@
 name: PyPI Publish
 
 on:
-  release:
-    types: [published]
+  push:
+    tags:
+      - 'python-v*'
 
 jobs:
  linux:
-    # Only runs on tags that matches the python-make-release action
-    if: startsWith(github.ref, 'refs/tags/python-v')
    name: Python ${{ matrix.config.platform }} manylinux${{ matrix.config.manylinux }}
    timeout-minutes: 60
    strategy:
      matrix:
-        python-minor-version: ["8"]
        config:
          - platform: x86_64
            manylinux: "2_17"
@@ -34,25 +32,22 @@
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
-          python-version: 3.${{ matrix.python-minor-version }}
+          python-version: 3.8
      - uses: ./.github/workflows/build_linux_wheel
        with:
-          python-minor-version: ${{ matrix.python-minor-version }}
+          python-minor-version: 8
          args: "--release --strip ${{ matrix.config.extra_args }}"
          arm-build: ${{ matrix.config.platform == 'aarch64' }}
          manylinux: ${{ matrix.config.manylinux }}
      - uses: ./.github/workflows/upload_wheel
        with:
-          token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
-          repo: "pypi"
+          pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
+          fury_token: ${{ secrets.FURY_TOKEN }}
  mac:
-    # Only runs on tags that matches the python-make-release action
-    if: startsWith(github.ref, 'refs/tags/python-v')
    timeout-minutes: 60
    runs-on: ${{ matrix.config.runner }}
    strategy:
      matrix:
-        python-minor-version: ["8"]
        config:
          - target: x86_64-apple-darwin
            runner: macos-13
@@ -63,7 +58,6 @@
    steps:
      - uses: actions/checkout@v4
        with:
-          ref: ${{ inputs.ref }}
          fetch-depth: 0
          lfs: true
      - name: Set up Python
@@ -72,38 +66,95 @@
          python-version: 3.12
      - uses: ./.github/workflows/build_mac_wheel
        with:
-          python-minor-version: ${{ matrix.python-minor-version }}
+          python-minor-version: 8
          args: "--release --strip --target ${{ matrix.config.target }} --features fp16kernels"
      - uses: ./.github/workflows/upload_wheel
        with:
-          python-minor-version: ${{ matrix.python-minor-version }}
-          token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
-          repo: "pypi"
+          pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
+          fury_token: ${{ secrets.FURY_TOKEN }}
  windows:
-    # Only runs on tags that matches the python-make-release action
-    if: startsWith(github.ref, 'refs/tags/python-v')
    timeout-minutes: 60
    runs-on: windows-latest
-    strategy:
-      matrix:
-        python-minor-version: ["8"]
    steps:
      - uses: actions/checkout@v4
        with:
-          ref: ${{ inputs.ref }}
          fetch-depth: 0
          lfs: true
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
-          python-version: 3.${{ matrix.python-minor-version }}
+          python-version: 3.8
      - uses: ./.github/workflows/build_windows_wheel
        with:
-          python-minor-version: ${{ matrix.python-minor-version }}
+          python-minor-version: 8
          args: "--release --strip"
          vcpkg_token: ${{ secrets.VCPKG_GITHUB_PACKAGES }}
      - uses: ./.github/workflows/upload_wheel
        with:
-          python-minor-version: ${{ matrix.python-minor-version }}
-          token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
-          repo: "pypi"
+          pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
+          fury_token: ${{ secrets.FURY_TOKEN }}
+  gh-release:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          lfs: true
+      - name: Extract version
+        id: extract_version
+        env:
+          GITHUB_REF: ${{ github.ref }}
+        run: |
+          set -e
+          echo "Extracting tag and version from $GITHUB_REF"
+          if [[ $GITHUB_REF =~ refs/tags/python-v(.*) ]]; then
+            VERSION=${BASH_REMATCH[1]}
+            TAG=python-v$VERSION
+            echo "tag=$TAG" >> $GITHUB_OUTPUT
+            echo "version=$VERSION" >> $GITHUB_OUTPUT
+          else
+            echo "Failed to extract version from $GITHUB_REF"
+            exit 1
+          fi
+          echo "Extracted version $VERSION from $GITHUB_REF"
+          if [[ $VERSION =~ beta ]]; then
+            echo "This is a beta release"
+
+            # Get last release (that is not this one)
+            FROM_TAG=$(git tag --sort='version:refname' \
+              | grep ^python-v \
+              | grep -vF "$TAG" \
+              | python ci/semver_sort.py python-v \
+              | tail -n 1)
+          else
+            echo "This is a stable release"
+            # Get last stable tag (ignore betas)
+            FROM_TAG=$(git tag --sort='version:refname' \
+              | grep ^python-v \
+              | grep -vF "$TAG" \
+              | grep -v beta \
+              | python ci/semver_sort.py python-v \
+              | tail -n 1)
+          fi
+          echo "Found from tag $FROM_TAG"
+          echo "from_tag=$FROM_TAG" >> $GITHUB_OUTPUT
+      - name: Create Python Release Notes
+        id: python_release_notes
+        uses: mikepenz/release-changelog-builder-action@v4
+        with:
+          configuration: .github/release_notes.json
+          toTag: ${{ steps.extract_version.outputs.tag }}
+          fromTag: ${{ steps.extract_version.outputs.from_tag }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Create Python GH release
+        uses: softprops/action-gh-release@v2
+        with:
+          prerelease: ${{ contains(github.ref, 'beta') }}
+          tag_name: ${{ steps.extract_version.outputs.tag }}
+          token: ${{ secrets.GITHUB_TOKEN }}
+          generate_release_notes: false
+          name: Python LanceDB v${{ steps.extract_version.outputs.version }}
+          body: ${{ steps.python_release_notes.outputs.changelog }}
.github/workflows/python-make-release-commit.yml (deleted, 56 lines)

@@ -1,56 +0,0 @@
-name: Python - Create release commit
-
-on:
-  workflow_dispatch:
-    inputs:
-      dry_run:
-        description: 'Dry run (create the local commit/tags but do not push it)'
-        required: true
-        default: "false"
-        type: choice
-        options:
-          - "true"
-          - "false"
-      part:
-        description: 'What kind of release is this?'
-        required: true
-        default: 'patch'
-        type: choice
-        options:
-          - patch
-          - minor
-          - major
-
-jobs:
-  bump-version:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Check out main
-        uses: actions/checkout@v4
-        with:
-          ref: main
-          persist-credentials: false
-          fetch-depth: 0
-          lfs: true
-      - name: Set git configs for bumpversion
-        shell: bash
-        run: |
-          git config user.name 'Lance Release'
-          git config user.email 'lance-dev@lancedb.com'
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: Bump version, create tag and commit
-        working-directory: python
-        run: |
-          pip install bump2version
-          bumpversion --verbose ${{ inputs.part }}
-      - name: Push new version and tag
-        if: ${{ inputs.dry_run }} == "false"
-        uses: ad-m/github-push-action@master
-        with:
-          github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
-          branch: main
-          tags: true
-
.github/workflows/python.yml (2 changed lines)

@@ -75,7 +75,7 @@ jobs:
    timeout-minutes: 30
    strategy:
      matrix:
-        python-minor-version: ["8", "11"]
+        python-minor-version: ["9", "11"]
    runs-on: "ubuntu-22.04"
    defaults:
      run:
.github/workflows/rust.yml (4 changed lines)

@@ -74,11 +74,11 @@ jobs:
      run: |
        sudo apt update
        sudo apt install -y protobuf-compiler libssl-dev
-    - name: Build
-      run: cargo build --all-features
    - name: Start S3 integration test environment
      working-directory: .
      run: docker compose up --detach --wait
+    - name: Build
+      run: cargo build --all-features
    - name: Run tests
      run: cargo test --all-features
    - name: Run examples
.github/workflows/upload_wheel/action.yml (53 changed lines)

@@ -2,28 +2,43 @@ name: upload-wheel
 
 description: "Upload wheels to Pypi"
 inputs:
-  os:
-    required: true
-    description: "ubuntu-22.04 or macos-13"
-  repo:
-    required: false
-    description: "pypi or testpypi"
-    default: "pypi"
-  token:
+  pypi_token:
    required: true
    description: "release token for the repo"
+  fury_token:
+    required: true
+    description: "release token for the fury repo"
 
 runs:
  using: "composite"
  steps:
    - name: Install dependencies
      shell: bash
      run: |
        python -m pip install --upgrade pip
        pip install twine
-    - name: Publish wheel
-      env:
-        TWINE_USERNAME: __token__
-        TWINE_PASSWORD: ${{ inputs.token }}
-      shell: bash
-      run: twine upload --repository ${{ inputs.repo }} target/wheels/lancedb-*.whl
+    - name: Choose repo
+      shell: bash
+      id: choose_repo
+      run: |
+        if [[ "${{ github.ref }}" == *beta* ]]; then
+          echo "repo=fury" >> $GITHUB_OUTPUT
+        else
+          echo "repo=pypi" >> $GITHUB_OUTPUT
+        fi
+    - name: Publish to PyPI
+      shell: bash
+      env:
+        FURY_TOKEN: ${{ inputs.fury_token }}
+        PYPI_TOKEN: ${{ inputs.pypi_token }}
+      run: |
+        if [ ${{ steps.choose_repo.outputs.repo }} == "fury" ]; then
+          WHEEL=$(ls target/wheels/lancedb-*.whl 2> /dev/null | head -n 1)
+          echo "Uploading $WHEEL to Fury"
+          curl -f -F package=@$WHEEL https://$FURY_TOKEN@push.fury.io/lancedb/
+        else
+          twine upload --repository ${{ steps.choose_repo.outputs.repo }} \
+            --username __token__ \
+            --password $PYPI_TOKEN \
+            target/wheels/lancedb-*.whl
+        fi
Cargo.toml (workspace)

@@ -14,10 +14,10 @@ keywords = ["lancedb", "lance", "database", "vector", "search"]
 categories = ["database-implementations"]
 
 [workspace.dependencies]
-lance = { "version" = "=0.10.18", "features" = ["dynamodb"] }
-lance-index = { "version" = "=0.10.18" }
-lance-linalg = { "version" = "=0.10.18" }
-lance-testing = { "version" = "=0.10.18" }
+lance = { "version" = "=0.11.0", "features" = ["dynamodb"] }
+lance-index = { "version" = "=0.11.0" }
+lance-linalg = { "version" = "=0.11.0" }
+lance-testing = { "version" = "=0.11.0" }
 # Note that this one does not include pyarrow
 arrow = { version = "51.0", optional = false }
 arrow-array = "51.0"
ci/bump_version.sh (new file, 51 lines)

@@ -0,0 +1,51 @@
+set -e
+
+RELEASE_TYPE=${1:-"stable"}
+BUMP_MINOR=${2:-false}
+TAG_PREFIX=${3:-"v"} # Such as "python-v"
+HEAD_SHA=${4:-$(git rev-parse HEAD)}
+
+readonly SELF_DIR=$(cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+
+PREV_TAG=$(git tag --sort='version:refname' | grep ^$TAG_PREFIX | python $SELF_DIR/semver_sort.py $TAG_PREFIX | tail -n 1)
+echo "Found previous tag $PREV_TAG"
+
+# Initially, we don't want to tag if we are doing stable, because we will bump
+# again later. See comment at end for why.
+if [[ "$RELEASE_TYPE" == 'stable' ]]; then
+  BUMP_ARGS="--no-tag"
+fi
+
+# If last is stable and not bumping minor
+if [[ $PREV_TAG != *beta* ]]; then
+  if [[ "$BUMP_MINOR" != "false" ]]; then
+    # X.Y.Z -> X.(Y+1).0-beta.0
+    bump-my-version bump -vv $BUMP_ARGS minor
+  else
+    # X.Y.Z -> X.Y.(Z+1)-beta.0
+    bump-my-version bump -vv $BUMP_ARGS patch
+  fi
+else
+  if [[ "$BUMP_MINOR" != "false" ]]; then
+    # X.Y.Z-beta.N -> X.(Y+1).0-beta.0
+    bump-my-version bump -vv $BUMP_ARGS minor
+  else
+    # X.Y.Z-beta.N -> X.Y.Z-beta.(N+1)
+    bump-my-version bump -vv $BUMP_ARGS pre_n
+  fi
+fi
+
+# The above bump will always bump to a pre-release version. If we are releasing
+# a stable version, bump the pre-release level ("pre_l") to make it stable.
+if [[ $RELEASE_TYPE == 'stable' ]]; then
+  # X.Y.Z-beta.N -> X.Y.Z
+  bump-my-version bump -vv pre_l
+fi
+
+# Validate that we have incremented version appropriately for breaking changes
+NEW_TAG=$(git describe --tags --exact-match HEAD)
+NEW_VERSION=$(echo $NEW_TAG | sed "s/^$TAG_PREFIX//")
+LAST_STABLE_RELEASE=$(git tag --sort='version:refname' | grep ^$TAG_PREFIX | grep -v beta | grep -vF "$NEW_TAG" | python $SELF_DIR/semver_sort.py $TAG_PREFIX | tail -n 1)
+LAST_STABLE_VERSION=$(echo $LAST_STABLE_RELEASE | sed "s/^$TAG_PREFIX//")
+
+python $SELF_DIR/check_breaking_changes.py $LAST_STABLE_RELEASE $HEAD_SHA $LAST_STABLE_VERSION $NEW_VERSION
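The branches above form a small state machine over versions: a preview release always lands on a `-beta.N` pre-release, and a stable release bumps past it by promoting the pre-release level. A hedged Python model of those transitions (our own helper for sanity-checking the script's comments, not part of the repo):

```python
def next_version(prev: str, release_type: str = "stable", bump_minor: bool = False) -> str:
    """Model the version transitions bump_version.sh performs (illustrative only)."""
    base, _, pre = prev.partition("-")           # "0.5.1-beta.3" -> ("0.5.1", "beta.3")
    major, minor, patch = map(int, base.split("."))
    if not pre:                                   # last release was stable
        if bump_minor:
            minor, patch = minor + 1, 0           # X.Y.Z -> X.(Y+1).0-beta.0
        else:
            patch += 1                            # X.Y.Z -> X.Y.(Z+1)-beta.0
        pre_n = 0
    elif bump_minor:
        minor, patch, pre_n = minor + 1, 0, 0     # X.Y.Z-beta.N -> X.(Y+1).0-beta.0
    else:
        pre_n = int(pre.split(".")[1]) + 1        # X.Y.Z-beta.N -> X.Y.Z-beta.(N+1)
    version = f"{major}.{minor}.{patch}"
    # A stable release then drops the pre-release suffix (the final pre_l bump).
    return version if release_type == "stable" else f"{version}-beta.{pre_n}"

assert next_version("0.5.0", "preview") == "0.5.1-beta.0"
assert next_version("0.5.1-beta.0", "preview") == "0.5.1-beta.1"
assert next_version("0.5.1-beta.1", "stable") == "0.5.1"
assert next_version("0.5.1", "preview", bump_minor=True) == "0.6.0-beta.0"
```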
ci/check_breaking_changes.py (new file, 35 lines)

@@ -0,0 +1,35 @@
+"""
+Check whether there are any breaking changes in the PRs between the base and head commits.
+If there are, assert that we have incremented the minor version.
+"""
+import argparse
+import os
+from packaging.version import parse
+
+from github import Github
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("base")
+    parser.add_argument("head")
+    parser.add_argument("last_stable_version")
+    parser.add_argument("current_version")
+    args = parser.parse_args()
+
+    repo = Github(os.environ["GITHUB_TOKEN"]).get_repo(os.environ["GITHUB_REPOSITORY"])
+    commits = repo.compare(args.base, args.head).commits
+    prs = (pr for commit in commits for pr in commit.get_pulls())
+
+    for pr in prs:
+        if any(label.name == "breaking-change" for label in pr.labels):
+            print(f"Breaking change in PR: {pr.html_url}")
+            break
+    else:
+        print("No breaking changes found.")
+        exit(0)
+
+    last_stable_version = parse(args.last_stable_version)
+    current_version = parse(args.current_version)
+    if current_version.minor <= last_stable_version.minor:
+        print("Minor version is not greater than the last stable version.")
+        exit(1)
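The script leans on Python's `for ... else`: the `else` branch runs only when the loop completes without `break`, i.e. when no PR in the range carries the `breaking-change` label. Note the final guard compares only the minor numbers, so it implicitly assumes the major version is unchanged between the two releases. A two-line illustration of that comparison using `packaging` (versions invented):

```python
from packaging.version import parse

# Mirrors the script's final check: a breaking change demands a minor bump
# relative to the last stable release.
assert parse("0.6.0-beta.0").minor > parse("0.5.1").minor          # would pass
assert not (parse("0.5.2-beta.0").minor > parse("0.5.1").minor)    # would exit(1)
```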
ci/semver_sort.py (new file, 35 lines)

@@ -0,0 +1,35 @@
+"""
+Takes a list of semver strings and sorts them in ascending order.
+"""
+
+import sys
+from packaging.version import parse, InvalidVersion
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("prefix", default="v")
+    args = parser.parse_args()
+
+    # Read the input from stdin
+    lines = sys.stdin.readlines()
+
+    # Parse the versions
+    versions = []
+    for line in lines:
+        line = line.strip()
+        try:
+            version_str = line.removeprefix(args.prefix)
+            version = parse(version_str)
+        except InvalidVersion:
+            # There are old tags that don't follow the semver format
+            print(f"Invalid version: {line}", file=sys.stderr)
+            continue
+        versions.append((line, version))
+
+    # Sort the versions
+    versions.sort(key=lambda x: x[1])
+
+    # Print the sorted versions as original strings
+    for line, _ in versions:
+        print(line)
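The sort key is `packaging.version.Version`, which orders pre-releases below the final release they precede; this property is what lets the calling scripts take `tail -n 1` as "latest". A small illustration of the resulting order (tags invented):

```python
from packaging.version import parse

tags = ["v0.5.0", "v0.4.9", "v0.5.0-beta.1", "v0.5.0-beta.0"]
# Same key the script uses: strip the prefix, parse as a PEP 440 version.
ordered = sorted(tags, key=lambda t: parse(t.removeprefix("v")))
# Betas sort before the final release they lead up to:
assert ordered == ["v0.4.9", "v0.5.0-beta.0", "v0.5.0-beta.1", "v0.5.0"]
```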
@@ -44,6 +44,36 @@
 
 !!! info "Please also make sure you're using the same version of Arrow as in the [lancedb crate](https://github.com/lancedb/lancedb/blob/main/Cargo.toml)"
 
+### Preview releases
+
+Stable releases are created about every 2 weeks. For the latest features and bug
+fixes, you can install the preview release. These releases receive the same
+level of testing as stable releases, but are not guaranteed to be available for
+more than 6 months after they are released. Once your application is stable, we
+recommend switching to stable releases.
+
+=== "Python"
+
+    ```shell
+    pip install --pre --extra-index-url https://pypi.fury.io/lancedb/ lancedb
+    ```
+
+=== "Typescript"
+
+    ```shell
+    npm install vectordb@preview
+    ```
+
+=== "Rust"
+
+    We don't push preview releases to crates.io, but you can reference the tag
+    in GitHub within your Cargo dependencies:
+
+    ```toml
+    [dependencies]
+    lancedb = { git = "https://github.com/lancedb/lancedb.git", tag = "vX.Y.Z-beta.N" }
+    ```
+
 ## Connect to a database
 
 === "Python"
node/package-lock.json (generated, 4 changed lines)

@@ -1,12 +1,12 @@
 {
   "name": "vectordb",
-  "version": "0.4.20",
+  "version": "0.5.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "vectordb",
-      "version": "0.4.20",
+      "version": "0.5.0",
       "cpu": [
         "x64",
         "arm64"
node/package.json

@@ -1,6 +1,6 @@
 {
   "name": "vectordb",
-  "version": "0.4.20",
+  "version": "0.5.0",
   "description": " Serverless, low-latency vector database for AI applications",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -624,8 +624,6 @@ function validateSchemaEmbeddings(
   }
 
   if (missingEmbeddingFields.length > 0 && embeddings === undefined) {
-    console.log({ missingEmbeddingFields, embeddings });
-
     throw new Error(
       `Table has embeddings: "${missingEmbeddingFields
         .map((f) => f.name)
@@ -633,5 +631,5 @@ function validateSchemaEmbeddings(
     );
   }
 
-  return new Schema(fields);
+  return new Schema(fields, schema.metadata);
 }
@@ -419,3 +419,31 @@ describe("when dealing with versioning", () => {
     );
   });
 });
+
+describe("when optimizing a dataset", () => {
+  let tmpDir: tmp.DirResult;
+  let table: Table;
+  beforeEach(async () => {
+    tmpDir = tmp.dirSync({ unsafeCleanup: true });
+    const con = await connect(tmpDir.name);
+    table = await con.createTable("vectors", [{ id: 1 }]);
+    await table.add([{ id: 2 }]);
+  });
+  afterEach(() => {
+    tmpDir.removeCallback();
+  });
+
+  it("compacts files", async () => {
+    const stats = await table.optimize();
+    expect(stats.compaction.filesAdded).toBe(1);
+    expect(stats.compaction.filesRemoved).toBe(2);
+    expect(stats.compaction.fragmentsAdded).toBe(1);
+    expect(stats.compaction.fragmentsRemoved).toBe(2);
+  });
+
+  it("cleanups old versions", async () => {
+    const stats = await table.optimize({ cleanupOlderThan: new Date() });
+    expect(stats.prune.bytesRemoved).toBeGreaterThan(0);
+    expect(stats.prune.oldVersionsRemoved).toBe(3);
+  });
+});
@@ -677,8 +677,6 @@ function validateSchemaEmbeddings(
   }
 
   if (missingEmbeddingFields.length > 0 && embeddings === undefined) {
-    console.log({ missingEmbeddingFields, embeddings });
-
     throw new Error(
       `Table has embeddings: "${missingEmbeddingFields
         .map((f) => f.name)
@@ -686,5 +684,5 @@ function validateSchemaEmbeddings(
     );
   }
 
-  return new Schema(fields);
+  return new Schema(fields, schema.metadata);
 }
@@ -19,6 +19,7 @@ import {
   AddColumnsSql,
   ColumnAlteration,
   IndexConfig,
+  OptimizeStats,
   Table as _NativeTable,
 } from "./native";
 import { Query, VectorQuery } from "./query";
@@ -50,6 +51,23 @@ export interface UpdateOptions {
   where: string;
 }
 
+export interface OptimizeOptions {
+  /**
+   * If set then all versions older than the given date will be removed. The
+   * current version will never be removed. The default is 7 days.
+   * @example
+   * // Delete all versions older than 1 day
+   * const olderThan = new Date();
+   * olderThan.setDate(olderThan.getDate() - 1);
+   * tbl.cleanupOlderVersions(olderThan);
+   *
+   * // Delete all versions except the current version
+   * tbl.cleanupOlderVersions(new Date());
+   */
+  cleanupOlderThan: Date;
+}
+
 /**
  * A Table is a collection of Records in a LanceDB Database.
  *
@@ -352,6 +370,48 @@ export class Table {
     await this.inner.restore();
   }
 
+  /**
+   * Optimize the on-disk data and indices for better performance.
+   *
+   * Modeled after ``VACUUM`` in PostgreSQL.
+   *
+   * Optimization covers three operations:
+   *
+   * - Compaction: Merges small files into larger ones
+   * - Prune: Removes old versions of the dataset
+   * - Index: Optimizes the indices, adding new data to existing indices
+   *
+   * Experimental API
+   * ----------------
+   *
+   * The optimization process is undergoing active development and may change.
+   * Our goal with these changes is to improve the performance of optimization
+   * and reduce the complexity.
+   *
+   * That being said, it is essential today to run optimize if you want the best
+   * performance. It should be stable and safe to use in production, but it is
+   * our hope that the API may be simplified (or not even need to be called) in
+   * the future.
+   *
+   * How frequently an application should call optimize depends on how often
+   * data is modified. If data is frequently added, deleted, or updated then
+   * optimize should be run frequently. A good rule of thumb is to run optimize
+   * if you have added or modified 100,000 or more records or run more than 20
+   * data modification operations.
+   */
+  async optimize(options?: Partial<OptimizeOptions>): Promise<OptimizeStats> {
+    let cleanupOlderThanMs;
+    if (
+      options?.cleanupOlderThan !== undefined &&
+      options?.cleanupOlderThan !== null
+    ) {
+      cleanupOlderThanMs =
+        new Date().getTime() - options.cleanupOlderThan.getTime();
+    }
+    return await this.inner.optimize(cleanupOlderThanMs);
+  }
+
   /** List all indices that have been created with {@link Table.createIndex} */
   async listIndices(): Promise<IndexConfig[]> {
     return await this.inner.listIndices();
nodejs/npm/darwin-arm64/package.json

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-arm64",
-  "version": "0.4.20",
+  "version": "0.5.0",
   "os": ["darwin"],
   "cpu": ["arm64"],
   "main": "lancedb.darwin-arm64.node",
nodejs/npm/darwin-x64/package.json

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-darwin-x64",
-  "version": "0.4.20",
+  "version": "0.5.0",
   "os": ["darwin"],
   "cpu": ["x64"],
   "main": "lancedb.darwin-x64.node",
nodejs/npm/linux-arm64-gnu/package.json

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-arm64-gnu",
-  "version": "0.4.20",
+  "version": "0.5.0",
   "os": ["linux"],
   "cpu": ["arm64"],
   "main": "lancedb.linux-arm64-gnu.node",
nodejs/npm/linux-x64-gnu/package.json

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-linux-x64-gnu",
-  "version": "0.4.20",
+  "version": "0.5.0",
   "os": ["linux"],
   "cpu": ["x64"],
   "main": "lancedb.linux-x64-gnu.node",
nodejs/npm/win32-x64-msvc/package.json

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb-win32-x64-msvc",
-  "version": "0.4.14",
+  "version": "0.5.0",
   "os": ["win32"],
   "cpu": ["x64"],
   "main": "lancedb.win32-x64-msvc.node",
nodejs/package.json

@@ -1,6 +1,6 @@
 {
   "name": "@lancedb/lancedb",
-  "version": "0.4.20",
+  "version": "0.5.0",
   "main": "./dist/index.js",
   "types": "./dist/index.d.ts",
   "napi": {
@@ -15,8 +15,8 @@
 use arrow_ipc::writer::FileWriter;
 use lancedb::ipc::ipc_file_to_batches;
 use lancedb::table::{
-    AddDataMode, ColumnAlteration as LanceColumnAlteration, NewColumnTransform,
-    Table as LanceDbTable,
+    AddDataMode, ColumnAlteration as LanceColumnAlteration, Duration, NewColumnTransform,
+    OptimizeAction, OptimizeOptions, Table as LanceDbTable,
 };
 use napi::bindgen_prelude::*;
 use napi_derive::napi;
@@ -263,6 +263,60 @@ impl Table {
         self.inner_ref()?.restore().await.default_error()
     }
 
+    #[napi]
+    pub async fn optimize(&self, older_than_ms: Option<i64>) -> napi::Result<OptimizeStats> {
+        let inner = self.inner_ref()?;
+
+        let older_than = if let Some(ms) = older_than_ms {
+            if ms == i64::MIN {
+                return Err(napi::Error::from_reason(format!(
+                    "older_than_ms can not be {}",
+                    i64::MIN,
+                )));
+            }
+            Duration::try_milliseconds(ms)
+        } else {
+            None
+        };
+
+        let compaction_stats = inner
+            .optimize(OptimizeAction::Compact {
+                options: lancedb::table::CompactionOptions::default(),
+                remap_options: None,
+            })
+            .await
+            .default_error()?
+            .compaction
+            .unwrap();
+        let prune_stats = inner
+            .optimize(OptimizeAction::Prune {
+                older_than,
+                delete_unverified: None,
+            })
+            .await
+            .default_error()?
+            .prune
+            .unwrap();
+        inner
+            .optimize(lancedb::table::OptimizeAction::Index(
+                OptimizeOptions::default(),
+            ))
+            .await
+            .default_error()?;
+        Ok(OptimizeStats {
+            compaction: CompactionStats {
+                files_added: compaction_stats.files_added as i64,
+                files_removed: compaction_stats.files_removed as i64,
+                fragments_added: compaction_stats.fragments_added as i64,
+                fragments_removed: compaction_stats.fragments_removed as i64,
+            },
+            prune: RemovalStats {
+                bytes_removed: prune_stats.bytes_removed as i64,
+                old_versions_removed: prune_stats.old_versions as i64,
+            },
+        })
+    }
+
     #[napi]
     pub async fn list_indices(&self) -> napi::Result<Vec<IndexConfig>> {
         Ok(self
@@ -298,6 +352,40 @@ impl From<lancedb::index::IndexConfig> for IndexConfig {
     }
 }
 
+/// Statistics about a compaction operation.
+#[napi(object)]
+#[derive(Clone, Debug)]
+pub struct CompactionStats {
+    /// The number of fragments removed
+    pub fragments_removed: i64,
+    /// The number of new, compacted fragments added
+    pub fragments_added: i64,
+    /// The number of data files removed
+    pub files_removed: i64,
+    /// The number of new, compacted data files added
+    pub files_added: i64,
+}
+
+/// Statistics about a cleanup operation
+#[napi(object)]
+#[derive(Clone, Debug)]
+pub struct RemovalStats {
+    /// The number of bytes removed
+    pub bytes_removed: i64,
+    /// The number of old versions removed
+    pub old_versions_removed: i64,
+}
+
+/// Statistics about an optimize operation
+#[napi(object)]
+#[derive(Clone, Debug)]
+pub struct OptimizeStats {
+    /// Statistics about the compaction operation
+    pub compaction: CompactionStats,
+    /// Statistics about the removal operation
+    pub prune: RemovalStats,
+}
+
 /// A definition of a column alteration. The alteration changes the column at
 /// `path` to have the new name `name`, to be nullable if `nullable` is true,
 /// and to have the data type `data_type`. At least one of `rename` or `nullable`
python/.bumpversion.cfg (deleted, 8 lines)

@@ -1,8 +0,0 @@
-[bumpversion]
-current_version = 0.6.13
-commit = True
-message = [python] Bump version: {current_version} → {new_version}
-tag = True
-tag_name = python-v{new_version}
-
-[bumpversion:file:pyproject.toml]
python/.bumpversion.toml (new file, 34 lines)

@@ -0,0 +1,34 @@
+[tool.bumpversion]
+current_version = "0.8.0"
+parse = """(?x)
+    (?P<major>0|[1-9]\\d*)\\.
+    (?P<minor>0|[1-9]\\d*)\\.
+    (?P<patch>0|[1-9]\\d*)
+    (?:-(?P<pre_l>[a-zA-Z-]+)\\.(?P<pre_n>0|[1-9]\\d*))?
+"""
+serialize = [
+    "{major}.{minor}.{patch}-{pre_l}.{pre_n}",
+    "{major}.{minor}.{patch}",
+]
+search = "{current_version}"
+replace = "{new_version}"
+regex = false
+ignore_missing_version = false
+ignore_missing_files = false
+tag = true
+sign_tags = false
+tag_name = "python-v{new_version}"
+tag_message = "Bump version: {current_version} → {new_version}"
+allow_dirty = true
+commit = true
+message = "Bump version: {current_version} → {new_version}"
+commit_args = ""
+
+[tool.bumpversion.parts.pre_l]
+values = ["beta", "final"]
+optional_value = "final"
+
+[[tool.bumpversion.files]]
+filename = "Cargo.toml"
+search = "\nversion = \"{current_version}\""
+replace = "\nversion = \"{new_version}\""
python/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.4.10"
+version = "0.8.0"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true
python/pyproject.toml

@@ -1,9 +1,9 @@
 [project]
 name = "lancedb"
-version = "0.6.13"
+# version in Cargo.toml
 dependencies = [
     "deprecation",
-    "pylance==0.10.12",
+    "pylance==0.11.0",
     "ratelimiter~=1.0",
     "requests>=2.31.0",
     "retry>=0.9.2",
@@ -86,3 +86,17 @@
     def refine_factor(self, refine_factor: int): ...
     def nprobes(self, nprobes: int): ...
     def bypass_vector_index(self): ...
+
+class CompactionStats:
+    fragments_removed: int
+    fragments_added: int
+    files_removed: int
+    files_added: int
+
+class RemovalStats:
+    bytes_removed: int
+    old_versions_removed: int
+
+class OptimizeStats:
+    compaction: CompactionStats
+    prune: RemovalStats
@@ -58,7 +58,7 @@ if TYPE_CHECKING:
     import PIL
     from lance.dataset import CleanupStats, ReaderLike
 
-    from ._lancedb import Table as LanceDBTable
+    from ._lancedb import Table as LanceDBTable, OptimizeStats
     from .db import LanceDBConnection
     from .index import BTree, IndexConfig, IvfPq
@@ -2377,6 +2377,49 @@ class AsyncTable:
         """
         await self._inner.restore()
 
+    async def optimize(
+        self, *, cleanup_older_than: Optional[timedelta] = None
+    ) -> OptimizeStats:
+        """
+        Optimize the on-disk data and indices for better performance.
+
+        Modeled after ``VACUUM`` in PostgreSQL.
+
+        Optimization covers three operations:
+
+        * Compaction: Merges small files into larger ones
+        * Prune: Removes old versions of the dataset
+        * Index: Optimizes the indices, adding new data to existing indices
+
+        Parameters
+        ----------
+        cleanup_older_than: timedelta, optional default 7 days
+            All files belonging to versions older than this will be removed. Set
+            to 0 days to remove all versions except the latest. The latest version
+            is never removed.
+
+        Experimental API
+        ----------------
+
+        The optimization process is undergoing active development and may change.
+        Our goal with these changes is to improve the performance of optimization
+        and reduce the complexity.
+
+        That being said, it is essential today to run optimize if you want the
+        best performance. It should be stable and safe to use in production, but
+        it is our hope that the API may be simplified (or not even need to be
+        called) in the future.
+
+        How frequently an application should call optimize depends on how often
+        data is modified. If data is frequently added, deleted, or updated then
+        optimize should be run frequently. A good rule of thumb is to run
+        optimize if you have added or modified 100,000 or more records or run
+        more than 20 data modification operations.
+        """
+        if cleanup_older_than is not None:
+            cleanup_older_than = round(cleanup_older_than.total_seconds() * 1000)
+        return await self._inner.optimize(cleanup_older_than)
+
     async def list_indices(self) -> IndexConfig:
         """
         List all indices that have been created with Self::create_index

@@ -1025,3 +1025,29 @@ async def test_time_travel(db_async: AsyncConnection):
     # Can't use restore if not checked out
     with pytest.raises(ValueError, match="checkout before running restore"):
         await table.restore()
+
+
+@pytest.mark.asyncio
+async def test_optimize(db_async: AsyncConnection):
+    table = await db_async.create_table(
+        "test",
+        data=[{"x": [1]}],
+    )
+    await table.add(
+        data=[
+            {"x": [2]},
+        ],
+    )
+    stats = await table.optimize()
+    assert stats.compaction.files_removed == 2
+    assert stats.compaction.files_added == 1
+    assert stats.compaction.fragments_added == 1
+    assert stats.compaction.fragments_removed == 2
+    assert stats.prune.bytes_removed == 0
+    assert stats.prune.old_versions_removed == 0
+
+    stats = await table.optimize(cleanup_older_than=timedelta(seconds=0))
+    assert stats.prune.bytes_removed > 0
+    assert stats.prune.old_versions_removed == 3
+
+    assert await table.query().to_arrow() == pa.table({"x": [[1], [2]]})

@@ -2,7 +2,9 @@ use arrow::{
     ffi_stream::ArrowArrayStreamReader,
     pyarrow::{FromPyArrow, ToPyArrow},
 };
-use lancedb::table::{AddDataMode, Table as LanceDbTable};
+use lancedb::table::{
+    AddDataMode, Duration, OptimizeAction, OptimizeOptions, Table as LanceDbTable,
+};
 use pyo3::{
     exceptions::{PyRuntimeError, PyValueError},
     pyclass, pymethods,

@@ -17,6 +19,40 @@ use crate::{
     query::Query,
 };

+/// Statistics about a compaction operation.
+#[pyclass(get_all)]
+#[derive(Clone, Debug)]
+pub struct CompactionStats {
+    /// The number of fragments removed
+    pub fragments_removed: u64,
+    /// The number of new, compacted fragments added
+    pub fragments_added: u64,
+    /// The number of data files removed
+    pub files_removed: u64,
+    /// The number of new, compacted data files added
+    pub files_added: u64,
+}
+
+/// Statistics about a cleanup operation
+#[pyclass(get_all)]
+#[derive(Clone, Debug)]
+pub struct RemovalStats {
+    /// The number of bytes removed
+    pub bytes_removed: u64,
+    /// The number of old versions removed
+    pub old_versions_removed: u64,
+}
+
+/// Statistics about an optimize operation
+#[pyclass(get_all)]
+#[derive(Clone, Debug)]
+pub struct OptimizeStats {
+    /// Statistics about the compaction operation
+    pub compaction: CompactionStats,
+    /// Statistics about the removal operation
+    pub prune: RemovalStats,
+}
+
 #[pyclass]
 pub struct Table {
     // We keep a copy of the name to use if the inner table is dropped

@@ -191,4 +227,58 @@ impl Table {
     pub fn query(&self) -> Query {
         Query::new(self.inner_ref().unwrap().query())
     }
+
+    pub fn optimize(self_: PyRef<'_, Self>, cleanup_since_ms: Option<u64>) -> PyResult<&PyAny> {
+        let inner = self_.inner_ref()?.clone();
+        let older_than = if let Some(ms) = cleanup_since_ms {
+            if ms > i64::MAX as u64 {
+                return Err(PyValueError::new_err(format!(
+                    "cleanup_since_ms must be between {} and -{}",
+                    i32::MAX,
+                    i32::MAX
+                )));
+            }
+            Duration::try_milliseconds(ms as i64)
+        } else {
+            None
+        };
+        future_into_py(self_.py(), async move {
+            let compaction_stats = inner
+                .optimize(OptimizeAction::Compact {
+                    options: lancedb::table::CompactionOptions::default(),
+                    remap_options: None,
+                })
+                .await
+                .infer_error()?
+                .compaction
+                .unwrap();
+            let prune_stats = inner
+                .optimize(OptimizeAction::Prune {
+                    older_than,
+                    delete_unverified: None,
+                })
+                .await
+                .infer_error()?
+                .prune
+                .unwrap();
+            inner
+                .optimize(lancedb::table::OptimizeAction::Index(
+                    OptimizeOptions::default(),
+                ))
+                .await
+                .infer_error()?;
+            Ok(OptimizeStats {
+                compaction: CompactionStats {
+                    files_added: compaction_stats.files_added as u64,
+                    files_removed: compaction_stats.files_removed as u64,
+                    fragments_added: compaction_stats.fragments_added as u64,
+                    fragments_removed: compaction_stats.fragments_removed as u64,
+                },
+                prune: RemovalStats {
+                    bytes_removed: prune_stats.bytes_removed,
+                    old_versions_removed: prune_stats.old_versions,
+                },
+            })
+        })
+    }
 }

@@ -8,6 +8,51 @@ The Python package is versioned and released separately from the Rust and Node.js
 ones. For Rust and Node.js, the release process is shared between `lancedb` and
 `vectordb` for now.

+## Preview releases
+
+LanceDB has full releases about every 2 weeks, but in between we make frequent
+preview releases. These are released as `0.x.y.betaN` versions. They receive the
+same level of testing as normal releases and let you get access to the latest
+features. However, we do not guarantee that preview releases will be available
+more than 6 months after they are released. We may delete the preview releases
+from the packaging index after a while. Once your application is stable, we
+recommend switching to full releases, which will never be removed from package
+indexes.
+
+## Making releases
+
+The release process uses a handful of GitHub actions to automate the process.
+
+```text
+┌─────────────────────┐
+│Create Release Commit│
+└─┬───────────────────┘
+  │                          ┌────────────┐  ┌──►Python GH Release
+  ├──►(tag) python-vX.Y.Z ───►│PyPI Publish├─┤
+  │                          └────────────┘  └──►Python Wheels
+  │
+  │                          ┌───────────┐
+  └──►(tag) vX.Y.Z ───┬──────►│NPM Publish├──┬──►Rust/Node GH Release
+                      │      └───────────┘  │
+                      │                     └──►NPM Packages
+                      │      ┌─────────────┐
+                      └──────►│Cargo Publish├───►Cargo Release
+                             └─────────────┘
+```
+
+To start a release, trigger a `Create Release Commit` action from
+[the workflows page](https://github.com/lancedb/lancedb/actions/workflows/make-release-commit.yml)
+(Click on "Run workflow").
+
+* **For a preview release**, leave the default parameters.
+* **For a stable release**, set the `release_type` input to `stable`.
+
+> [!IMPORTANT]
+> If there was a breaking change since the last stable release, and we haven't
+> done so yet, we should increment the minor version. The CI will detect if this
+> is needed and fail the `Create Release Commit` job. To fix, select the
+> "bump minor version" option.
+
 ## Breaking changes

 We try to avoid breaking changes, but sometimes they are necessary. When there

@@ -21,12 +66,10 @@ body of the PR. A CI job will add a `breaking-change` label to the PR, which is
 what will ultimately be used by CI to determine if the minor version should be
 incremented.

-A CI job will validate that if a `breaking-change` label is added, the minor
-version is incremented in the `Cargo.toml` and `pyproject.toml` files. The only
-exception is if it has already been incremented since the last stable release.
-
-**It is the responsibility of the PR author to increment the minor version when
-appropriate.**
+> [!IMPORTANT]
+> Reviewers should check that PRs with breaking changes receive the `breaking-change`
+> label. If a PR is missing the label, please add it, even after it was merged.
+> This label is used in the release process.

 Some things that are considered breaking changes:


@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-node"
-version = "0.4.20"
+version = "0.5.0"
 description = "Serverless, low-latency vector database for AI applications"
 license.workspace = true
 edition.workspace = true

@@ -19,10 +19,12 @@ use snafu::Snafu;

 #[derive(Debug, Snafu)]
 pub enum Error {
+    #[allow(dead_code)]
     #[snafu(display("column '{name}' is missing"))]
     MissingColumn { name: String },
     #[snafu(display("{name}: {message}"))]
     OutOfRange { name: String, message: String },
+    #[allow(dead_code)]
     #[snafu(display("{index_type} is not a valid index type"))]
     InvalidIndexType { index_type: String },


@@ -19,6 +19,7 @@ use neon::prelude::*;
 pub trait JsObjectExt {
     fn get_opt_u32(&self, cx: &mut FunctionContext, key: &str) -> Result<Option<u32>>;
     fn get_usize(&self, cx: &mut FunctionContext, key: &str) -> Result<usize>;
+    #[allow(dead_code)]
     fn get_opt_usize(&self, cx: &mut FunctionContext, key: &str) -> Result<Option<usize>>;
 }


@@ -324,7 +324,7 @@ impl JsTable {
         rt.spawn(async move {
             let stats = table
                 .optimize(OptimizeAction::Prune {
-                    older_than,
+                    older_than: Some(older_than),
                     delete_unverified,
                 })
                 .await;

@@ -1,6 +1,6 @@
 [package]
 name = "lancedb"
-version = "0.4.20"
+version = "0.5.0"
 edition.workspace = true
 description = "LanceDB: A serverless, low-latency vector database for AI applications"
 license.workspace = true

@@ -40,8 +40,8 @@ serde = { version = "^1" }
 serde_json = { version = "1" }
 # For remote feature
 reqwest = { version = "0.11.24", features = ["gzip", "json"], optional = true }
-polars-arrow = { version = ">=0.37", optional = true }
-polars = { version = ">=0.37", optional = true}
+polars-arrow = { version = ">=0.37,<0.40.0", optional = true }
+polars = { version = ">=0.37,<0.40.0", optional = true}

 [dev-dependencies]
 tempfile = "3.5.0"

@@ -49,9 +49,12 @@ rand = { version = "0.8.3", features = ["small_rng"] }
 uuid = { version = "1.7.0", features = ["v4"] }
 walkdir = "2"
 # For s3 integration tests (dev deps aren't allowed to be optional atm)
-aws-sdk-s3 = { version = "1.0" }
-aws-sdk-kms = { version = "1.0" }
+# We pin these because the content-length check breaks with localstack
+# https://github.com/smithy-lang/smithy-rs/releases/tag/release-2024-05-21
+aws-sdk-s3 = { version = "=1.23.0" }
+aws-sdk-kms = { version = "=1.21.0" }
 aws-config = { version = "1.0" }
+aws-smithy-runtime = { version = "=1.3.0" }

 [features]
 default = []

@@ -195,7 +195,7 @@ impl<T: IntoArrow> CreateTableBuilder<true, T> {
             .embedding_registry()
             .get(&definition.embedding_name)
             .ok_or_else(|| Error::EmbeddingFunctionNotFound {
-                name: definition.embedding_name.to_string(),
+                name: definition.embedding_name.clone(),
                 reason: "No embedding function found in the connection's embedding_registry"
                     .to_string(),
             })?;

@@ -155,7 +155,7 @@ impl<R: RecordBatchReader> MaybeEmbedded<R> {
             }
             None => {
                 return Err(Error::EmbeddingFunctionNotFound {
-                    name: embedding_def.embedding_name.to_string(),
+                    name: embedding_def.embedding_name.clone(),
                     reason: format!(
                         "Table was defined with an embedding column `{}` but no embedding function was found with that name within the registry.",
                         embedding_def.embedding_name

@@ -16,7 +16,10 @@ use std::sync::Arc;

 use crate::{table::TableInternal, Result};

-use self::{scalar::BTreeIndexBuilder, vector::IvfPqIndexBuilder};
+use self::{
+    scalar::BTreeIndexBuilder,
+    vector::{IvfHnswSqIndexBuilder, IvfPqIndexBuilder},
+};

 pub mod scalar;
 pub mod vector;

@@ -25,6 +28,7 @@ pub enum Index {
     Auto,
     BTree(BTreeIndexBuilder),
     IvfPq(IvfPqIndexBuilder),
+    IvfHnswSq(IvfHnswSqIndexBuilder),
 }

 /// Builder for the create_index operation

@@ -65,6 +69,7 @@ impl IndexBuilder {
 #[derive(Debug, Clone, PartialEq)]
 pub enum IndexType {
     IvfPq,
+    IvfHnswSq,
     BTree,
 }


@@ -83,10 +83,14 @@ pub struct VectorIndexStatistics {
 #[derive(Debug, Clone)]
 pub struct IvfPqIndexBuilder {
     pub(crate) distance_type: DistanceType,
+
+    // IVF
     pub(crate) num_partitions: Option<u32>,
-    pub(crate) num_sub_vectors: Option<u32>,
     pub(crate) sample_rate: u32,
     pub(crate) max_iterations: u32,
+
+    // PQ
+    pub(crate) num_sub_vectors: Option<u32>,
 }

 impl Default for IvfPqIndexBuilder {

@@ -201,3 +205,124 @@ pub(crate) fn suggested_num_sub_vectors(dim: u32) -> u32 {
         1
     }
 }
+
+/// Builder for an IVF_HNSW_SQ index.
+///
+/// This index is a combination of IVF and HNSW.
+/// The IVF part is the same as the IVF PQ index.
+/// For each IVF partition, this builds an HNSW graph; the graph is used to
+/// quickly find the closest vectors to a query vector.
+///
+/// The SQ (scalar quantizer) is used to compress the vectors:
+/// each vector is mapped to an 8-bit integer vector, a 4x compression ratio for float32 vectors.
+#[derive(Debug, Clone)]
+pub struct IvfHnswSqIndexBuilder {
+    // IVF
+    pub(crate) distance_type: DistanceType,
+    pub(crate) num_partitions: Option<u32>,
+    pub(crate) sample_rate: u32,
+    pub(crate) max_iterations: u32,
+
+    // HNSW
+    pub(crate) m: u32,
+    pub(crate) ef_construction: u32,
+    // SQ
+    // TODO add num_bits for SQ after it supports another num_bits besides 8
+}
+
+impl Default for IvfHnswSqIndexBuilder {
+    fn default() -> Self {
+        Self {
+            distance_type: DistanceType::L2,
+            num_partitions: None,
+            sample_rate: 256,
+            max_iterations: 50,
+            m: 20,
+            ef_construction: 300,
+        }
+    }
+}
+
+impl IvfHnswSqIndexBuilder {
+    /// [DistanceType] to use to build the index.
+    ///
+    /// Default value is [DistanceType::L2].
+    ///
+    /// This is used when training the index to calculate the IVF partitions (vectors are
+    /// grouped in partitions with similar vectors according to this distance type)
+    ///
+    /// The metric type used to train an index MUST match the metric type used to search the
+    /// index. Failure to do so will yield inaccurate results.
+    ///
+    /// Currently IVF_HNSW_SQ only supports the L2 and Cosine distance types.
+    pub fn distance_type(mut self, distance_type: DistanceType) -> Self {
+        self.distance_type = distance_type;
+        self
+    }
+
+    /// The number of IVF partitions to create.
+    ///
+    /// This value should generally scale with the number of rows in the dataset. By default
+    /// the number of partitions is the square root of the number of rows.
+    ///
+    /// If this value is too large then the first part of the search (picking the right partition)
+    /// will be slow. If this value is too small then the second part of the search (searching
+    /// within a partition) will be slow.
+    pub fn num_partitions(mut self, num_partitions: u32) -> Self {
+        self.num_partitions = Some(num_partitions);
+        self
+    }
+
+    /// The rate used to calculate the number of training vectors for kmeans and SQ.
+    ///
+    /// When an IVF_HNSW_SQ index is trained, we need to calculate partitions and the min/max
+    /// values of vectors. Partitions are groups of vectors that are similar to each other.
+    /// To find them we use an algorithm called kmeans.
+    ///
+    /// Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
+    /// random sample of the data. This parameter controls the size of the sample. The total
+    /// number of vectors used to train the IVF is `sample_rate * num_partitions`.
+    ///
+    /// The total number of vectors used to train the SQ is `sample_rate * 2^{num_bits}`.
+    ///
+    /// Increasing this value might improve the quality of the index but in most cases the
+    /// default should be sufficient.
+    ///
+    /// The default value is 256.
+    pub fn sample_rate(mut self, sample_rate: u32) -> Self {
+        self.sample_rate = sample_rate;
+        self
+    }
+
+    /// Max iterations to train kmeans.
+    ///
+    /// When training an IVF index we use kmeans to calculate the partitions. This parameter
+    /// controls how many iterations of kmeans to run.
+    ///
+    /// Increasing this might improve the quality of the index but in most cases the parameter
+    /// is unused because kmeans will converge with fewer iterations. The parameter is only
+    /// used in cases where kmeans does not appear to converge. In those cases it is unlikely
+    /// that setting this larger will lead to the index converging anyway.
+    ///
+    /// The default value is 50.
+    pub fn max_iterations(mut self, max_iterations: u32) -> Self {
+        self.max_iterations = max_iterations;
+        self
+    }
+
+    /// The number of neighbors to select for each vector in the HNSW graph.
+    /// Bumping this number will increase the recall of the search but also increase the build/search time.
+    /// The default value is 20.
+    pub fn m(mut self, m: u32) -> Self {
+        self.m = m;
+        self
+    }
+
+    /// The number of candidates to evaluate during the construction of the HNSW graph.
+    /// Bumping this number will increase the recall of the search but also increase the build/search time.
+    /// This value should be not less than `ef` in the search phase.
+    /// The default value is 300.
+    pub fn ef_construction(mut self, ef_construction: u32) -> Self {
+        self.ef_construction = ef_construction;
+        self
+    }
+}
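
Editor's note: a minimal usage sketch of the new builder, for orientation while reviewing. It is not part of the diff; the `table` handle and the `embeddings` column are assumed (they mirror the test added near the end of this commit), and the import paths follow the `crate::index` layout shown above.

```rust
use lancedb::error::Result;
use lancedb::index::{vector::IvfHnswSqIndexBuilder, Index};
use lancedb::table::Table;

// Sketch: create an IVF_HNSW_SQ index on an assumed "embeddings" column,
// overriding a few of the defaults shown in the builder above.
async fn build_hnsw_index(table: &Table) -> Result<()> {
    let index = IvfHnswSqIndexBuilder::default()
        .num_partitions(256) // defaults to sqrt(row count) when unset
        .m(20) // neighbors per node in the HNSW graph
        .ef_construction(300); // candidates evaluated while building the graph
    table
        .create_index(&["embeddings"], Index::IvfHnswSq(index))
        .execute()
        .await
}
```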

@@ -23,12 +23,9 @@ use arrow::datatypes::Float32Type;
 use arrow_array::{RecordBatchIterator, RecordBatchReader};
 use arrow_schema::{DataType, Field, Schema, SchemaRef};
 use async_trait::async_trait;
-use chrono::Duration;
 use lance::dataset::builder::DatasetBuilder;
 use lance::dataset::cleanup::RemovalStats;
-use lance::dataset::optimize::{
-    compact_files, CompactionMetrics, CompactionOptions, IndexRemapperOptions,
-};
+use lance::dataset::optimize::{compact_files, CompactionMetrics, IndexRemapperOptions};
 use lance::dataset::scanner::{DatasetRecordBatchStream, Scanner};
 pub use lance::dataset::ColumnAlteration;
 pub use lance::dataset::NewColumnTransform;

@@ -38,8 +35,11 @@ use lance::dataset::{
 };
 use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatchedBySource};
 use lance::io::WrappingObjectStore;
+use lance_index::vector::hnsw::builder::HnswBuildParams;
+use lance_index::vector::ivf::IvfBuildParams;
+use lance_index::vector::sq::builder::SQBuildParams;
+use lance_index::DatasetIndexExt;
 use lance_index::IndexType;
-use lance_index::{optimize::OptimizeOptions, DatasetIndexExt};
 use log::info;
 use serde::{Deserialize, Serialize};
 use snafu::whatever;

@@ -48,7 +48,9 @@ use crate::arrow::IntoArrow;
 use crate::connection::NoData;
 use crate::embeddings::{EmbeddingDefinition, EmbeddingRegistry, MaybeEmbedded, MemoryRegistry};
 use crate::error::{Error, Result};
-use crate::index::vector::{IvfPqIndexBuilder, VectorIndex, VectorIndexStatistics};
+use crate::index::vector::{
+    IvfHnswSqIndexBuilder, IvfPqIndexBuilder, VectorIndex, VectorIndexStatistics,
+};
 use crate::index::IndexConfig;
 use crate::index::{
     vector::{suggested_num_partitions, suggested_num_sub_vectors},

@@ -65,6 +67,10 @@ use self::merge::MergeInsertBuilder;
 pub(crate) mod dataset;
 pub mod merge;

+pub use chrono::Duration;
+pub use lance::dataset::optimize::CompactionOptions;
+pub use lance_index::optimize::OptimizeOptions;
+
 /// Defines the type of column
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub enum ColumnKind {

@@ -145,22 +151,58 @@ impl TableDefinition {
 ///
 /// By default, it optimizes everything, as [`OptimizeAction::All`].
 pub enum OptimizeAction {
-    /// Run optimization on every, with default options.
+    /// Run all optimizations with default values
     All,
-    /// Compact files in the dataset
+    /// Compacts files in the dataset
+    ///
+    /// LanceDb uses a readonly filesystem for performance and safe concurrency. Every time
+    /// new data is added it will be added into new files. Small files
+    /// can hurt both read and write performance. Compaction will merge small files
+    /// into larger ones.
+    ///
+    /// All operations that modify data (add, delete, update, merge insert, etc.) will create
+    /// new files. If these operations are run frequently then compaction should run frequently.
+    ///
+    /// If these operations are never run (search only) then compaction is not necessary.
     Compact {
         options: CompactionOptions,
        remap_options: Option<Arc<dyn IndexRemapperOptions>>,
     },
-    /// Prune old version of datasets.
+    /// Prune old versions of the dataset
+    ///
+    /// Every change in LanceDb is additive. When data is removed from a dataset a new version is
+    /// created that doesn't contain the removed data. However, the old version, which does contain
+    /// the removed data, is left in place. This is necessary for consistency and concurrency and
+    /// also enables time travel functionality like the ability to checkout an older version of the
+    /// dataset to undo changes.
+    ///
+    /// Over time, these old versions can consume a lot of disk space. The prune operation will
+    /// remove versions of the dataset that are older than a certain age. This will free up the
+    /// space used by that old data.
+    ///
+    /// Once a version is pruned it can no longer be checked out.
     Prune {
         /// The duration of time to keep versions of the dataset.
-        older_than: Duration,
+        older_than: Option<Duration>,
         /// Because they may be part of an in-progress transaction, files newer than 7 days old are not deleted by default.
         /// If you are sure that there are no in-progress transactions, then you can set this to True to delete all files older than `older_than`.
         delete_unverified: Option<bool>,
     },
-    /// Optimize index.
+    /// Optimize the indices
+    ///
+    /// This operation optimizes all indices in the table. When new data is added to LanceDb
+    /// it is not added to the indices. However, it can still turn up in searches because the search
+    /// function will scan both the indexed data and the unindexed data in parallel. Over time, the
+    /// unindexed data can become large enough that search performance degrades. This operation
+    /// will add the unindexed data to the indices without rerunning the full index creation process.
+    ///
+    /// Optimizing an index is faster than re-training the index but it does not typically adjust the
+    /// underlying model relied upon by the index. This can eventually lead to poor search accuracy
+    /// and so users may still want to occasionally retrain the index after adding a large amount of
+    /// data.
+    ///
+    /// For example, when using IVF, an index will create clusters. Optimizing an index assigns unindexed
+    /// data to the existing clusters, but it does not move the clusters or create new clusters.
     Index(OptimizeOptions),
 }

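
Editor's note: an illustrative sketch (not part of the diff) of driving these variants through `Table::optimize`, using the `Option<Duration>` form of `Prune` introduced here; the `table` handle is assumed, and the import paths follow the re-exports added above.

```rust
use lancedb::error::Result;
use lancedb::table::{CompactionOptions, Duration, OptimizeAction, Table};

// Sketch: run the optimize steps explicitly rather than via OptimizeAction::All.
async fn optimize_table(table: &Table) -> Result<()> {
    // Merge small data files into larger ones.
    let stats = table
        .optimize(OptimizeAction::Compact {
            options: CompactionOptions::default(),
            remap_options: None,
        })
        .await?;
    assert!(stats.compaction.is_some());

    // Remove versions older than 30 days; `older_than: None` falls back to
    // the 7-day default shown further down in this diff.
    table
        .optimize(OptimizeAction::Prune {
            older_than: Duration::try_days(30),
            delete_unverified: None,
        })
        .await?;
    Ok(())
}
```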

@@ -312,6 +354,7 @@ impl UpdateBuilder {

 #[async_trait]
 pub(crate) trait TableInternal: std::fmt::Display + std::fmt::Debug + Send + Sync {
+    #[allow(dead_code)]
     fn as_any(&self) -> &dyn std::any::Any;
     /// Cast as [`NativeTable`], or return None if it is not a [`NativeTable`].
     fn as_native(&self) -> Option<&NativeTable>;

@@ -751,10 +794,30 @@ impl Table {

     /// Optimize the on-disk data and indices for better performance.
     ///
+    /// Modeled after ``VACUUM`` in PostgreSQL.
+    ///
+    /// Optimization is discussed in more detail in the [OptimizeAction] documentation
+    /// and covers three operations:
+    ///
+    /// * Compaction: Merges small files into larger ones
+    /// * Prune: Removes old versions of the dataset
+    /// * Index: Optimizes the indices, adding new data to existing indices
+    ///
     /// <section class="warning">Experimental API</section>
     ///
-    /// Modeled after ``VACUUM`` in PostgreSQL.
-    /// Not all implementations support explicit optimization.
+    /// The optimization process is undergoing active development and may change.
+    /// Our goal with these changes is to improve the performance of optimization and
+    /// reduce the complexity.
+    ///
+    /// That being said, it is essential today to run optimize if you want the best
+    /// performance. It should be stable and safe to use in production, but it is our
+    /// hope that the API may be simplified (or not even need to be called) in the future.
+    ///
+    /// The frequency with which an application should call optimize is based on the
+    /// frequency of data modifications. If data is frequently added, deleted, or updated
+    /// then optimize should be run frequently. A good rule of thumb is to run optimize
+    /// if you have added or modified 100,000 or more records or run more than 20 data
+    /// modification operations.
     pub async fn optimize(&self, action: OptimizeAction) -> Result<OptimizeStats> {
         self.inner.optimize(action).await
     }

@@ -1238,7 +1301,6 @@ impl NativeTable {
             num_partitions as usize,
             /*num_bits=*/ 8,
             num_sub_vectors as usize,
-            false,
             index.distance_type.into(),
             index.max_iterations as usize,
         );

@@ -1254,6 +1316,57 @@ impl NativeTable {
         Ok(())
     }

+    async fn create_ivf_hnsw_sq_index(
+        &self,
+        index: IvfHnswSqIndexBuilder,
+        field: &Field,
+        replace: bool,
+    ) -> Result<()> {
+        if !Self::supported_vector_data_type(field.data_type()) {
+            return Err(Error::InvalidInput {
+                message: format!(
+                    "An IVF HNSW SQ index cannot be created on the column `{}` which has data type {}",
+                    field.name(),
+                    field.data_type()
+                ),
+            });
+        }
+
+        let num_partitions = if let Some(n) = index.num_partitions {
+            n
+        } else {
+            suggested_num_partitions(self.count_rows(None).await?)
+        };
+
+        let mut dataset = self.dataset.get_mut().await?;
+        let mut ivf_params = IvfBuildParams::new(num_partitions as usize);
+        ivf_params.sample_rate = index.sample_rate as usize;
+        ivf_params.max_iters = index.max_iterations as usize;
+        let hnsw_params = HnswBuildParams::default()
+            .num_edges(index.m as usize)
+            .ef_construction(index.ef_construction as usize);
+        let sq_params = SQBuildParams {
+            sample_rate: index.sample_rate as usize,
+            ..Default::default()
+        };
+        let lance_idx_params = lance::index::vector::VectorIndexParams::with_ivf_hnsw_sq_params(
+            index.distance_type.into(),
+            ivf_params,
+            hnsw_params,
+            sq_params,
+        );
+        dataset
+            .create_index(
+                &[field.name()],
+                IndexType::Vector,
+                None,
+                &lance_idx_params,
+                replace,
+            )
+            .await?;
+        Ok(())
+    }
+
     async fn create_auto_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> {
         if Self::supported_vector_data_type(field.data_type()) {
             self.create_ivf_pq_index(IvfPqIndexBuilder::default(), field, opts.replace)

@@ -1497,6 +1610,10 @@ impl TableInternal for NativeTable {
             Index::Auto => self.create_auto_index(field, opts).await,
             Index::BTree(_) => self.create_btree_index(field, opts).await,
             Index::IvfPq(ivf_pq) => self.create_ivf_pq_index(ivf_pq, field, opts.replace).await,
+            Index::IvfHnswSq(ivf_hnsw_sq) => {
+                self.create_ivf_hnsw_sq_index(ivf_hnsw_sq, field, opts.replace)
+                    .await
+            }
         }
     }


@@ -1592,7 +1709,7 @@ impl TableInternal for NativeTable {
             .compaction;
         stats.prune = self
             .optimize(OptimizeAction::Prune {
-                older_than: Duration::try_days(7).unwrap(),
+                older_than: None,
                 delete_unverified: None,
             })
             .await?

@@ -1611,8 +1728,11 @@ impl TableInternal for NativeTable {
                 delete_unverified,
             } => {
                 stats.prune = Some(
-                    self.cleanup_old_versions(older_than, delete_unverified)
-                        .await?,
+                    self.cleanup_old_versions(
+                        older_than.unwrap_or(Duration::try_days(7).expect("valid delta")),
+                        delete_unverified,
+                    )
+                    .await?,
                 );
             }
             OptimizeAction::Index(options) => {

@@ -2357,6 +2477,102 @@ mod tests {
         );
     }

+    #[tokio::test]
+    async fn test_create_index_ivf_hnsw_sq() {
+        use arrow_array::RecordBatch;
+        use arrow_schema::{DataType, Field, Schema as ArrowSchema};
+        use rand;
+        use std::iter::repeat_with;
+
+        use arrow_array::Float32Array;
+
+        let tmp_dir = tempdir().unwrap();
+        let uri = tmp_dir.path().to_str().unwrap();
+        let conn = connect(uri).execute().await.unwrap();
+
+        let dimension = 16;
+        let schema = Arc::new(ArrowSchema::new(vec![Field::new(
+            "embeddings",
+            DataType::FixedSizeList(
+                Arc::new(Field::new("item", DataType::Float32, true)),
+                dimension,
+            ),
+            false,
+        )]));
+
+        let mut rng = rand::thread_rng();
+        let float_arr = Float32Array::from(
+            repeat_with(|| rng.gen::<f32>())
+                .take(512 * dimension as usize)
+                .collect::<Vec<f32>>(),
+        );
+
+        let vectors = Arc::new(create_fixed_size_list(float_arr, dimension).unwrap());
+        let batches = RecordBatchIterator::new(
+            vec![RecordBatch::try_new(schema.clone(), vec![vectors.clone()]).unwrap()]
+                .into_iter()
+                .map(Ok),
+            schema,
+        );
+
+        let table = conn.create_table("test", batches).execute().await.unwrap();
+
+        assert_eq!(
+            table
+                .as_native()
+                .unwrap()
+                .count_indexed_rows("my_index")
+                .await
+                .unwrap(),
+            None
+        );
+        assert_eq!(
+            table
+                .as_native()
+                .unwrap()
+                .count_unindexed_rows("my_index")
+                .await
+                .unwrap(),
+            None
+        );
+
+        let index = IvfHnswSqIndexBuilder::default();
+        table
+            .create_index(&["embeddings"], Index::IvfHnswSq(index))
+            .execute()
+            .await
+            .unwrap();
+
+        let index_configs = table.list_indices().await.unwrap();
+        assert_eq!(index_configs.len(), 1);
+        let index = index_configs.into_iter().next().unwrap();
+        assert_eq!(index.index_type, crate::index::IndexType::IvfPq);
+        assert_eq!(index.columns, vec!["embeddings".to_string()]);
+        assert_eq!(table.count_rows(None).await.unwrap(), 512);
+        assert_eq!(table.name(), "test");
+
+        let indices = table.as_native().unwrap().load_indices().await.unwrap();
+        let index_uuid = &indices[0].index_uuid;
+        assert_eq!(
+            table
+                .as_native()
+                .unwrap()
+                .count_indexed_rows(index_uuid)
+                .await
+                .unwrap(),
+            Some(512)
+        );
+        assert_eq!(
+            table
+                .as_native()
+                .unwrap()
+                .count_unindexed_rows(index_uuid)
+                .await
+                .unwrap(),
+            Some(0)
+        );
+    }
+
     fn create_fixed_size_list<T: Array>(values: T, list_size: i32) -> Result<FixedSizeListArray> {
         let list_type = DataType::FixedSizeList(
             Arc::new(Field::new("item", values.data_type().clone(), true)),