Mirror of https://github.com/lancedb/lancedb.git

Compare commits: python-v0. ... rmeng/patc (1 commit, 24526bda4c)
.bumpversion.cfg (new file, 22 lines)
@@ -0,0 +1,22 @@
+[bumpversion]
+current_version = 0.4.20
+commit = True
+message = Bump version: {current_version} → {new_version}
+tag = True
+tag_name = v{new_version}
+
+[bumpversion:file:node/package.json]
+
+[bumpversion:file:nodejs/package.json]
+
+[bumpversion:file:nodejs/npm/darwin-x64/package.json]
+
+[bumpversion:file:nodejs/npm/darwin-arm64/package.json]
+
+[bumpversion:file:nodejs/npm/linux-x64-gnu/package.json]
+
+[bumpversion:file:nodejs/npm/linux-arm64-gnu/package.json]
+
+[bumpversion:file:rust/ffi/node/Cargo.toml]
+
+[bumpversion:file:rust/lancedb/Cargo.toml]
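For context (not part of the diff): this .cfg drives the classic bump2version tool, which rewrites `current_version` in each listed file, commits, and tags `v{new_version}`. A minimal sketch of the invocation the release workflow later in this diff runs:

```shell
# Sketch only, assuming the .bumpversion.cfg above sits in the working directory.
pip install bump2version
bumpversion --verbose patch   # e.g. 0.4.20 -> 0.4.21; commits and tags v0.4.21
```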
@@ -1,57 +0,0 @@
-[tool.bumpversion]
-current_version = "0.5.0"
-parse = """(?x)
-    (?P<major>0|[1-9]\\d*)\\.
-    (?P<minor>0|[1-9]\\d*)\\.
-    (?P<patch>0|[1-9]\\d*)
-    (?:-(?P<pre_l>[a-zA-Z-]+)\\.(?P<pre_n>0|[1-9]\\d*))?
-"""
-serialize = [
-    "{major}.{minor}.{patch}-{pre_l}.{pre_n}",
-    "{major}.{minor}.{patch}",
-]
-search = "{current_version}"
-replace = "{new_version}"
-regex = false
-ignore_missing_version = false
-ignore_missing_files = false
-tag = true
-sign_tags = false
-tag_name = "v{new_version}"
-tag_message = "Bump version: {current_version} → {new_version}"
-allow_dirty = true
-commit = true
-message = "Bump version: {current_version} → {new_version}"
-commit_args = ""
-
-[tool.bumpversion.parts.pre_l]
-values = ["beta", "final"]
-optional_value = "final"
-
-[[tool.bumpversion.files]]
-filename = "node/package.json"
-search = "\"version\": \"{current_version}\","
-replace = "\"version\": \"{new_version}\","
-
-[[tool.bumpversion.files]]
-filename = "nodejs/package.json"
-search = "\"version\": \"{current_version}\","
-replace = "\"version\": \"{new_version}\","
-
-# nodejs binary packages
-[[tool.bumpversion.files]]
-glob = "nodejs/npm/*/package.json"
-search = "\"version\": \"{current_version}\","
-replace = "\"version\": \"{new_version}\","
-
-# Cargo files
-# ------------
-[[tool.bumpversion.files]]
-filename = "rust/ffi/node/Cargo.toml"
-search = "\nversion = \"{current_version}\""
-replace = "\nversion = \"{new_version}\""
-
-[[tool.bumpversion.files]]
-filename = "rust/lancedb/Cargo.toml"
-search = "\nversion = \"{current_version}\""
-replace = "\nversion = \"{new_version}\""
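For comparison, the removed TOML above configured the newer bump-my-version tool, whose beta-first scheme the bump script removed later in this diff (invoked as ci/bump_version.sh in the old workflow) drove roughly like this (hedged sketch):

```shell
pip install bump-my-version
bump-my-version bump -vv patch   # under this config: X.Y.Z -> X.Y.(Z+1)-beta.0
bump-my-version bump -vv pre_l   # promote X.Y.Z-beta.N -> X.Y.Z
```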
.github/release.yml (new file, 25 lines)
@@ -0,0 +1,25 @@
+# TODO: create separate templates for Python and other releases.
+changelog:
+  exclude:
+    labels:
+      - ci
+      - chore
+  categories:
+    - title: Breaking Changes 🛠
+      labels:
+        - breaking-change
+    - title: New Features 🎉
+      labels:
+        - enhancement
+    - title: Bug Fixes 🐛
+      labels:
+        - bug
+    - title: Documentation 📚
+      labels:
+        - documentation
+    - title: Performance Improvements 🚀
+      labels:
+        - performance
+    - title: Other Changes
+      labels:
+        - "*"
.github/release_notes.json (deleted, 41 lines)
@@ -1,41 +0,0 @@
-{
-  "ignore_labels": ["chore"],
-  "pr_template": "- ${{TITLE}} by @${{AUTHOR}} in ${{URL}}",
-  "categories": [
-    {
-      "title": "## 🏆 Highlights",
-      "labels": ["highlight"]
-    },
-    {
-      "title": "## 🛠 Breaking Changes",
-      "labels": ["breaking-change"]
-    },
-    {
-      "title": "## ⚠️ Deprecations ",
-      "labels": ["deprecation"]
-    },
-    {
-      "title": "## 🎉 New Features",
-      "labels": ["enhancement"]
-    },
-    {
-      "title": "## 🐛 Bug Fixes",
-      "labels": ["bug"]
-    },
-    {
-      "title": "## 📚 Documentation",
-      "labels": ["documentation"]
-    },
-    {
-      "title": "## 🚀 Performance Improvements",
-      "labels": ["performance"]
-    },
-    {
-      "title": "## Other Changes"
-    },
-    {
-      "title": "## 🔧 Build and CI",
-      "labels": ["ci"]
-    }
-  ]
-}
.github/workflows/cargo-publish.yml (8 lines changed)
@@ -1,12 +1,8 @@
 name: Cargo Publish
 
 on:
-  push:
-    tags-ignore:
-      # We don't publish pre-releases for Rust. Crates.io is just a source
-      # distribution, so we don't need to publish pre-releases.
-      - 'v*-beta*'
-      - '*-v*' # for example, python-vX.Y.Z
+  release:
+    types: [ published ]
 
 env:
   # This env var is used by Swatinem/rust-cache@v2 for the cache
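For context: with the trigger switched from tag pushes to published releases, crates.io publishing now starts when a GitHub release is published, which can also be done from the CLI (hedged example with a hypothetical tag, not from this repo):

```shell
gh release create v0.4.20 --generate-notes   # the "published" event then kicks off Cargo Publish
```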
.github/workflows/make-release-commit.yml (88 lines changed)
@@ -1,62 +1,37 @@
 name: Create release commit
 
-# This workflow increments versions, tags the version, and pushes it.
-# When a tag is pushed, another workflow is triggered that creates a GH release
-# and uploads the binaries. This workflow is only for creating the tag.
-
-# This script will enforce that a minor version is incremented if there are any
-# breaking changes since the last minor increment. However, it isn't able to
-# differentiate between breaking changes in Node versus Python. If you wish to
-# bypass this check, you can manually increment the version and push the tag.
 on:
   workflow_dispatch:
     inputs:
       dry_run:
         description: 'Dry run (create the local commit/tags but do not push it)'
         required: true
-        default: false
-        type: boolean
-      type:
-        description: 'What kind of release is this?'
-        required: true
-        default: 'preview'
+        default: "false"
         type: choice
         options:
-          - preview
-          - stable
-      python:
-        description: 'Make a Python release'
+          - "true"
+          - "false"
+      part:
+        description: 'What kind of release is this?'
         required: true
-        default: true
-        type: boolean
-      other:
-        description: 'Make a Node/Rust release'
-        required: true
-        default: true
-        type: boolean
-      bump-minor:
-        description: 'Bump minor version'
-        required: true
-        default: false
-        type: boolean
+        default: 'patch'
+        type: choice
+        options:
+          - patch
+          - minor
+          - major
 
 jobs:
-  make-release:
-    # Creates tag and GH release. The GH release will trigger the build and release jobs.
+  bump-version:
    runs-on: ubuntu-latest
-    permissions:
-      contents: write
    steps:
-      - name: Output Inputs
-        run: echo "${{ toJSON(github.event.inputs) }}"
-      - uses: actions/checkout@v4
+      - name: Check out main
+        uses: actions/checkout@v4
        with:
+          ref: main
+          persist-credentials: false
          fetch-depth: 0
          lfs: true
-          # It's important we use our token here, as the default token will NOT
-          # trigger any workflows watching for new tags. See:
-          # https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow
-          token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
      - name: Set git configs for bumpversion
        shell: bash
        run: |
@@ -66,34 +41,19 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
-      - name: Bump Python version
-        if: ${{ inputs.python }}
-        working-directory: python
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Bump version, create tag and commit
        run: |
-          # Need to get the commit before bumping the version, so we can
-          # determine if there are breaking changes in the next step as well.
-          echo "COMMIT_BEFORE_BUMP=$(git rev-parse HEAD)" >> $GITHUB_ENV
-          pip install bump-my-version PyGithub packaging
-          bash ../ci/bump_version.sh ${{ inputs.type }} ${{ inputs.bump-minor }} python-v
-      - name: Bump Node/Rust version
-        if: ${{ inputs.other }}
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          pip install bump-my-version PyGithub packaging
-          bash ci/bump_version.sh ${{ inputs.type }} ${{ inputs.bump-minor }} v $COMMIT_BEFORE_BUMP
-      - name: Push new version tag
-        if: ${{ !inputs.dry_run }}
+          pip install bump2version
+          bumpversion --verbose ${{ inputs.part }}
+      - name: Push new version and tag
+        if: ${{ inputs.dry_run }} == "false"
        uses: ad-m/github-push-action@master
        with:
-          # Need to use PAT here too to trigger next workflow. See comment above.
          github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
-          branch: ${{ github.ref }}
+          branch: main
          tags: true
      - uses: ./.github/workflows/update_package_lock
        if: ${{ inputs.dry_run }} == "false"
        with:
-          github_token: ${{ secrets.GITHUB_TOKEN }}
+          github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
 
.github/workflows/npm-publish.yml (99 lines changed)
@@ -1,9 +1,8 @@
 name: NPM Publish
 
 on:
-  push:
-    tags:
-      - 'v*'
+  release:
+    types: [published]
 
 jobs:
   node:
@@ -275,15 +274,9 @@
        env:
          NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
        run: |
-          # Tag beta as "preview" instead of default "latest". See lancedb
-          # npm publish step for more info.
-          if [[ $GITHUB_REF =~ refs/tags/v(.*)-beta.* ]]; then
-            PUBLISH_ARGS="--tag preview"
-          fi
-
          mv */*.tgz .
          for filename in *.tgz; do
-            npm publish $PUBLISH_ARGS $filename
+            npm publish $filename
          done
 
  release-nodejs:
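The shell removed above is what routed beta builds to npm's `preview` dist-tag; the mechanism, roughly (a hedged sketch of standard npm behavior, mirroring lines removed elsewhere in this diff):

```shell
npm publish --access public --tag preview   # keeps pre-releases off the default "latest" tag
npm install vectordb@preview                # users opt in to previews explicitly
```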
@@ -323,23 +316,11 @@
      - name: Publish to NPM
        env:
          NODE_AUTH_TOKEN: ${{ secrets.LANCEDB_NPM_REGISTRY_TOKEN }}
-        # By default, things are published to the latest tag. This is what is
-        # installed by default if the user does not specify a version. This is
-        # good for stable releases, but for pre-releases, we want to publish to
-        # the "preview" tag so they can install with `npm install lancedb@preview`.
-        # See: https://medium.com/@mbostock/prereleases-and-npm-e778fc5e2420
-        run: |
-          if [[ $GITHUB_REF =~ refs/tags/v(.*)-beta.* ]]; then
-            npm publish --access public --tag preview
-          else
-            npm publish --access public
-          fi
+        run: npm publish --access public
 
  update-package-lock:
    needs: [release]
    runs-on: ubuntu-latest
-    permissions:
-      contents: write
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -350,13 +331,11 @@
          lfs: true
      - uses: ./.github/workflows/update_package_lock
        with:
-          github_token: ${{ secrets.GITHUB_TOKEN }}
+          github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
 
  update-package-lock-nodejs:
    needs: [release-nodejs]
    runs-on: ubuntu-latest
-    permissions:
-      contents: write
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -367,70 +346,4 @@
          lfs: true
      - uses: ./.github/workflows/update_package_lock_nodejs
        with:
-          github_token: ${{ secrets.GITHUB_TOKEN }}
+          github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
-
-  gh-release:
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          lfs: true
-      - name: Extract version
-        id: extract_version
-        env:
-          GITHUB_REF: ${{ github.ref }}
-        run: |
-          set -e
-          echo "Extracting tag and version from $GITHUB_REF"
-          if [[ $GITHUB_REF =~ refs/tags/v(.*) ]]; then
-            VERSION=${BASH_REMATCH[1]}
-            TAG=v$VERSION
-            echo "tag=$TAG" >> $GITHUB_OUTPUT
-            echo "version=$VERSION" >> $GITHUB_OUTPUT
-          else
-            echo "Failed to extract version from $GITHUB_REF"
-            exit 1
-          fi
-          echo "Extracted version $VERSION from $GITHUB_REF"
-          if [[ $VERSION =~ beta ]]; then
-            echo "This is a beta release"
-
-            # Get last release (that is not this one)
-            FROM_TAG=$(git tag --sort='version:refname' \
-              | grep ^v \
-              | grep -vF "$TAG" \
-              | python ci/semver_sort.py v \
-              | tail -n 1)
-          else
-            echo "This is a stable release"
-            # Get last stable tag (ignore betas)
-            FROM_TAG=$(git tag --sort='version:refname' \
-              | grep ^v \
-              | grep -vF "$TAG" \
-              | grep -v beta \
-              | python ci/semver_sort.py v \
-              | tail -n 1)
-          fi
-          echo "Found from tag $FROM_TAG"
-          echo "from_tag=$FROM_TAG" >> $GITHUB_OUTPUT
-      - name: Create Release Notes
-        id: release_notes
-        uses: mikepenz/release-changelog-builder-action@v4
-        with:
-          configuration: .github/release_notes.json
-          toTag: ${{ steps.extract_version.outputs.tag }}
-          fromTag: ${{ steps.extract_version.outputs.from_tag }}
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      - name: Create GH release
-        uses: softprops/action-gh-release@v2
-        with:
-          prerelease: ${{ contains('beta', github.ref) }}
-          tag_name: ${{ steps.extract_version.outputs.tag }}
-          token: ${{ secrets.GITHUB_TOKEN }}
-          generate_release_notes: false
-          name: Node/Rust LanceDB v${{ steps.extract_version.outputs.version }}
-          body: ${{ steps.release_notes.outputs.changelog }}
.github/workflows/pypi-publish.yml (107 lines changed)
@@ -1,16 +1,18 @@
 name: PyPI Publish
 
 on:
-  push:
-    tags:
-      - 'python-v*'
+  release:
+    types: [published]
 
 jobs:
  linux:
+    # Only runs on tags that matches the python-make-release action
+    if: startsWith(github.ref, 'refs/tags/python-v')
    name: Python ${{ matrix.config.platform }} manylinux${{ matrix.config.manylinux }}
    timeout-minutes: 60
    strategy:
      matrix:
+        python-minor-version: ["8"]
        config:
          - platform: x86_64
            manylinux: "2_17"
@@ -32,22 +34,25 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
-          python-version: 3.8
+          python-version: 3.${{ matrix.python-minor-version }}
      - uses: ./.github/workflows/build_linux_wheel
        with:
-          python-minor-version: 8
+          python-minor-version: ${{ matrix.python-minor-version }}
          args: "--release --strip ${{ matrix.config.extra_args }}"
          arm-build: ${{ matrix.config.platform == 'aarch64' }}
          manylinux: ${{ matrix.config.manylinux }}
      - uses: ./.github/workflows/upload_wheel
        with:
-          pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
-          fury_token: ${{ secrets.FURY_TOKEN }}
+          token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
+          repo: "pypi"
  mac:
+    # Only runs on tags that matches the python-make-release action
+    if: startsWith(github.ref, 'refs/tags/python-v')
    timeout-minutes: 60
    runs-on: ${{ matrix.config.runner }}
    strategy:
      matrix:
+        python-minor-version: ["8"]
        config:
          - target: x86_64-apple-darwin
            runner: macos-13
@@ -58,6 +63,7 @@ jobs:
    steps:
      - uses: actions/checkout@v4
        with:
+          ref: ${{ inputs.ref }}
          fetch-depth: 0
          lfs: true
      - name: Set up Python
@@ -66,95 +72,38 @@ jobs:
          python-version: 3.12
      - uses: ./.github/workflows/build_mac_wheel
        with:
-          python-minor-version: 8
+          python-minor-version: ${{ matrix.python-minor-version }}
          args: "--release --strip --target ${{ matrix.config.target }} --features fp16kernels"
      - uses: ./.github/workflows/upload_wheel
        with:
-          pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
-          fury_token: ${{ secrets.FURY_TOKEN }}
+          python-minor-version: ${{ matrix.python-minor-version }}
+          token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
+          repo: "pypi"
  windows:
+    # Only runs on tags that matches the python-make-release action
+    if: startsWith(github.ref, 'refs/tags/python-v')
    timeout-minutes: 60
    runs-on: windows-latest
+    strategy:
+      matrix:
+        python-minor-version: ["8"]
    steps:
      - uses: actions/checkout@v4
        with:
+          ref: ${{ inputs.ref }}
          fetch-depth: 0
          lfs: true
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
-          python-version: 3.8
+          python-version: 3.${{ matrix.python-minor-version }}
      - uses: ./.github/workflows/build_windows_wheel
        with:
-          python-minor-version: 8
+          python-minor-version: ${{ matrix.python-minor-version }}
          args: "--release --strip"
          vcpkg_token: ${{ secrets.VCPKG_GITHUB_PACKAGES }}
      - uses: ./.github/workflows/upload_wheel
        with:
-          pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
-          fury_token: ${{ secrets.FURY_TOKEN }}
-  gh-release:
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          lfs: true
-      - name: Extract version
-        id: extract_version
-        env:
-          GITHUB_REF: ${{ github.ref }}
-        run: |
-          set -e
-          echo "Extracting tag and version from $GITHUB_REF"
-          if [[ $GITHUB_REF =~ refs/tags/python-v(.*) ]]; then
-            VERSION=${BASH_REMATCH[1]}
-            TAG=python-v$VERSION
-            echo "tag=$TAG" >> $GITHUB_OUTPUT
-            echo "version=$VERSION" >> $GITHUB_OUTPUT
-          else
-            echo "Failed to extract version from $GITHUB_REF"
-            exit 1
-          fi
-          echo "Extracted version $VERSION from $GITHUB_REF"
-          if [[ $VERSION =~ beta ]]; then
-            echo "This is a beta release"
-
-            # Get last release (that is not this one)
-            FROM_TAG=$(git tag --sort='version:refname' \
-              | grep ^python-v \
-              | grep -vF "$TAG" \
-              | python ci/semver_sort.py python-v \
-              | tail -n 1)
-          else
-            echo "This is a stable release"
-            # Get last stable tag (ignore betas)
-            FROM_TAG=$(git tag --sort='version:refname' \
-              | grep ^python-v \
-              | grep -vF "$TAG" \
-              | grep -v beta \
-              | python ci/semver_sort.py python-v \
-              | tail -n 1)
-          fi
-          echo "Found from tag $FROM_TAG"
-          echo "from_tag=$FROM_TAG" >> $GITHUB_OUTPUT
-      - name: Create Python Release Notes
-        id: python_release_notes
-        uses: mikepenz/release-changelog-builder-action@v4
-        with:
-          configuration: .github/release_notes.json
-          toTag: ${{ steps.extract_version.outputs.tag }}
-          fromTag: ${{ steps.extract_version.outputs.from_tag }}
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      - name: Create Python GH release
-        uses: softprops/action-gh-release@v2
-        with:
-          prerelease: ${{ contains('beta', github.ref) }}
-          tag_name: ${{ steps.extract_version.outputs.tag }}
-          token: ${{ secrets.GITHUB_TOKEN }}
-          generate_release_notes: false
-          name: Python LanceDB v${{ steps.extract_version.outputs.version }}
-          body: ${{ steps.python_release_notes.outputs.changelog }}
.github/workflows/python-make-release-commit.yml (new file, 56 lines)
@@ -0,0 +1,56 @@
+name: Python - Create release commit
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: 'Dry run (create the local commit/tags but do not push it)'
+        required: true
+        default: "false"
+        type: choice
+        options:
+          - "true"
+          - "false"
+      part:
+        description: 'What kind of release is this?'
+        required: true
+        default: 'patch'
+        type: choice
+        options:
+          - patch
+          - minor
+          - major
+
+jobs:
+  bump-version:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out main
+        uses: actions/checkout@v4
+        with:
+          ref: main
+          persist-credentials: false
+          fetch-depth: 0
+          lfs: true
+      - name: Set git configs for bumpversion
+        shell: bash
+        run: |
+          git config user.name 'Lance Release'
+          git config user.email 'lance-dev@lancedb.com'
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Bump version, create tag and commit
+        working-directory: python
+        run: |
+          pip install bump2version
+          bumpversion --verbose ${{ inputs.part }}
+      - name: Push new version and tag
+        if: ${{ inputs.dry_run }} == "false"
+        uses: ad-m/github-push-action@master
+        with:
+          github_token: ${{ secrets.LANCEDB_RELEASE_TOKEN }}
+          branch: main
+          tags: true
+
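For context (hedged, not part of the diff): a workflow_dispatch-only job like this is normally started from the Actions UI, or with the gh CLI using the inputs defined above:

```shell
# Workflow file name assumed from this diff.
gh workflow run python-make-release-commit.yml -f part=patch -f dry_run=true
```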
.github/workflows/python.yml (2 lines changed)
@@ -75,7 +75,7 @@ jobs:
    timeout-minutes: 30
    strategy:
      matrix:
-        python-minor-version: ["9", "11"]
+        python-minor-version: ["8", "11"]
    runs-on: "ubuntu-22.04"
    defaults:
      run:
.github/workflows/rust.yml (4 lines changed)
@@ -74,11 +74,11 @@ jobs:
        run: |
          sudo apt update
          sudo apt install -y protobuf-compiler libssl-dev
+      - name: Build
+        run: cargo build --all-features
      - name: Start S3 integration test environment
        working-directory: .
        run: docker compose up --detach --wait
-      - name: Build
-        run: cargo build --all-features
      - name: Run tests
        run: cargo test --all-features
      - name: Run examples
.github/workflows/upload_wheel/action.yml (41 lines changed)
@@ -2,12 +2,16 @@ name: upload-wheel
 
 description: "Upload wheels to Pypi"
 inputs:
-  pypi_token:
+  os:
+    required: true
+    description: "ubuntu-22.04 or macos-13"
+  repo:
+    required: false
+    description: "pypi or testpypi"
+    default: "pypi"
+  token:
    required: true
    description: "release token for the repo"
-  fury_token:
-    required: true
-    description: "release token for the fury repo"
 
 runs:
  using: "composite"
@@ -17,28 +21,9 @@
      run: |
        python -m pip install --upgrade pip
        pip install twine
-    - name: Choose repo
-      shell: bash
-      id: choose_repo
-      run: |
-        if [ ${{ github.ref }} == "*beta*" ]; then
-          echo "repo=fury" >> $GITHUB_OUTPUT
-        else
-          echo "repo=pypi" >> $GITHUB_OUTPUT
-        fi
-    - name: Publish to PyPI
-      shell: bash
+    - name: Publish wheel
      env:
-        FURY_TOKEN: ${{ inputs.fury_token }}
-        PYPI_TOKEN: ${{ inputs.pypi_token }}
-      run: |
-        if [ ${{ steps.choose_repo.outputs.repo }} == "fury" ]; then
-          WHEEL=$(ls target/wheels/lancedb-*.whl 2> /dev/null | head -n 1)
-          echo "Uploading $WHEEL to Fury"
-          curl -f -F package=@$WHEEL https://$FURY_TOKEN@push.fury.io/lancedb/
-        else
-          twine upload --repository ${{ steps.choose_repo.outputs.repo }} \
-            --username __token__ \
-            --password $PYPI_TOKEN \
-            target/wheels/lancedb-*.whl
-        fi
+        TWINE_USERNAME: __token__
+        TWINE_PASSWORD: ${{ inputs.token }}
+      shell: bash
+      run: twine upload --repository ${{ inputs.repo }} target/wheels/lancedb-*.whl
@@ -14,10 +14,10 @@ keywords = ["lancedb", "lance", "database", "vector", "search"]
 categories = ["database-implementations"]
 
 [workspace.dependencies]
-lance = { "version" = "=0.11.0", "features" = ["dynamodb"] }
-lance-index = { "version" = "=0.11.0" }
-lance-linalg = { "version" = "=0.11.0" }
-lance-testing = { "version" = "=0.11.0" }
+lance = { "version" = "=0.10.18", "features" = ["dynamodb"] }
+lance-index = { "version" = "=0.10.18" }
+lance-linalg = { "version" = "=0.10.18" }
+lance-testing = { "version" = "=0.10.18" }
 # Note that this one does not include pyarrow
 arrow = { version = "51.0", optional = false }
 arrow-array = "51.0"
@@ -1,51 +0,0 @@
-set -e
-
-RELEASE_TYPE=${1:-"stable"}
-BUMP_MINOR=${2:-false}
-TAG_PREFIX=${3:-"v"} # Such as "python-v"
-HEAD_SHA=${4:-$(git rev-parse HEAD)}
-
-readonly SELF_DIR=$(cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
-
-PREV_TAG=$(git tag --sort='version:refname' | grep ^$TAG_PREFIX | python $SELF_DIR/semver_sort.py $TAG_PREFIX | tail -n 1)
-echo "Found previous tag $PREV_TAG"
-
-# Initially, we don't want to tag if we are doing stable, because we will bump
-# again later. See comment at end for why.
-if [[ "$RELEASE_TYPE" == 'stable' ]]; then
-  BUMP_ARGS="--no-tag"
-fi
-
-# If last is stable and not bumping minor
-if [[ $PREV_TAG != *beta* ]]; then
-  if [[ "$BUMP_MINOR" != "false" ]]; then
-    # X.Y.Z -> X.(Y+1).0-beta.0
-    bump-my-version bump -vv $BUMP_ARGS minor
-  else
-    # X.Y.Z -> X.Y.(Z+1)-beta.0
-    bump-my-version bump -vv $BUMP_ARGS patch
-  fi
-else
-  if [[ "$BUMP_MINOR" != "false" ]]; then
-    # X.Y.Z-beta.N -> X.(Y+1).0-beta.0
-    bump-my-version bump -vv $BUMP_ARGS minor
-  else
-    # X.Y.Z-beta.N -> X.Y.Z-beta.(N+1)
-    bump-my-version bump -vv $BUMP_ARGS pre_n
-  fi
-fi
-
-# The above bump will always bump to a pre-release version. If we are releasing
-# a stable version, bump the pre-release level ("pre_l") to make it stable.
-if [[ $RELEASE_TYPE == 'stable' ]]; then
-  # X.Y.Z-beta.N -> X.Y.Z
-  bump-my-version bump -vv pre_l
-fi
-
-# Validate that we have incremented version appropriately for breaking changes
-NEW_TAG=$(git describe --tags --exact-match HEAD)
-NEW_VERSION=$(echo $NEW_TAG | sed "s/^$TAG_PREFIX//")
-LAST_STABLE_RELEASE=$(git tag --sort='version:refname' | grep ^$TAG_PREFIX | grep -v beta | grep -vF "$NEW_TAG" | python $SELF_DIR/semver_sort.py $TAG_PREFIX | tail -n 1)
-LAST_STABLE_VERSION=$(echo $LAST_STABLE_RELEASE | sed "s/^$TAG_PREFIX//")
-
-python $SELF_DIR/check_breaking_changes.py $LAST_STABLE_RELEASE $HEAD_SHA $LAST_STABLE_VERSION $NEW_VERSION
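A hedged sketch of how the removed workflow steps invoked this script (arguments: release type, bump-minor flag, tag prefix, optional head SHA):

```shell
pip install bump-my-version PyGithub packaging
bash ci/bump_version.sh preview false v          # X.Y.Z -> X.Y.(Z+1)-beta.0, tagged immediately
bash ci/bump_version.sh stable false python-v    # bump, then promote the pre-release to stable
```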
@@ -1,35 +0,0 @@
-"""
-Check whether there are any breaking changes in the PRs between the base and head commits.
-If there are, assert that we have incremented the minor version.
-"""
-import argparse
-import os
-from packaging.version import parse
-
-from github import Github
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("base")
-    parser.add_argument("head")
-    parser.add_argument("last_stable_version")
-    parser.add_argument("current_version")
-    args = parser.parse_args()
-
-    repo = Github(os.environ["GITHUB_TOKEN"]).get_repo(os.environ["GITHUB_REPOSITORY"])
-    commits = repo.compare(args.base, args.head).commits
-    prs = (pr for commit in commits for pr in commit.get_pulls())
-
-    for pr in prs:
-        if any(label.name == "breaking-change" for label in pr.labels):
-            print(f"Breaking change in PR: {pr.html_url}")
-            break
-    else:
-        print("No breaking changes found.")
-        exit(0)
-
-    last_stable_version = parse(args.last_stable_version)
-    current_version = parse(args.current_version)
-    if current_version.minor <= last_stable_version.minor:
-        print("Minor version is not greater than the last stable version.")
-        exit(1)
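For context, a hedged sketch of how bump_version.sh drove this check (the versions here are hypothetical; the real ones were computed from git tags, and the token comes from the Actions environment):

```shell
export GITHUB_TOKEN=... GITHUB_REPOSITORY=lancedb/lancedb
python ci/check_breaking_changes.py v0.4.19 "$(git rev-parse HEAD)" 0.4.19 0.4.20
```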
@@ -1,35 +0,0 @@
-"""
-Takes a list of semver strings and sorts them in ascending order.
-"""
-
-import sys
-from packaging.version import parse, InvalidVersion
-
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument("prefix", default="v")
-    args = parser.parse_args()
-
-    # Read the input from stdin
-    lines = sys.stdin.readlines()
-
-    # Parse the versions
-    versions = []
-    for line in lines:
-        line = line.strip()
-        try:
-            version_str = line.removeprefix(args.prefix)
-            version = parse(version_str)
-        except InvalidVersion:
-            # There are old tags that don't follow the semver format
-            print(f"Invalid version: {line}", file=sys.stderr)
-            continue
-        versions.append((line, version))
-
-    # Sort the versions
-    versions.sort(key=lambda x: x[1])
-
-    # Print the sorted versions as original strings
-    for line, _ in versions:
-        print(line)
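A sketch of the pipeline the removed workflow steps built around this helper, as seen in the gh-release jobs earlier in this diff:

```shell
# Latest python-v* tag, sorted semantically rather than lexically.
git tag --sort='version:refname' \
  | grep ^python-v \
  | python ci/semver_sort.py python-v \
  | tail -n 1
```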
@@ -44,36 +44,6 @@
 
    !!! info "Please also make sure you're using the same version of Arrow as in the [lancedb crate](https://github.com/lancedb/lancedb/blob/main/Cargo.toml)"
 
-### Preview releases
-
-Stable releases are created about every 2 weeks. For the latest features and bug
-fixes, you can install the preview release. These releases receive the same
-level of testing as stable releases, but are not guaranteed to be available for
-more than 6 months after they are released. Once your application is stable, we
-recommend switching to stable releases.
-
-=== "Python"
-
-    ```shell
-    pip install --pre --extra-index-url https://pypi.fury.io/lancedb/ lancedb
-    ```
-
-=== "Typescript"
-
-    ```shell
-    npm install vectordb@preview
-    ```
-
-=== "Rust"
-
-    We don't push preview releases to crates.io, but you can referent the tag
-    in GitHub within your Cargo dependencies:
-
-    ```toml
-    [dependencies]
-    lancedb = { git = "https://github.com/lancedb/lancedb.git", tag = "vX.Y.Z-beta.N" }
-    ```
-
 ## Connect to a database
 
 === "Python"
node/package-lock.json (generated, 4 lines changed)
@@ -1,12 +1,12 @@
 {
  "name": "vectordb",
-  "version": "0.5.0",
+  "version": "0.4.20",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "vectordb",
-      "version": "0.5.0",
+      "version": "0.4.20",
      "cpu": [
        "x64",
        "arm64"
@@ -1,6 +1,6 @@
 {
  "name": "vectordb",
-  "version": "0.5.0",
+  "version": "0.4.20",
  "description": " Serverless, low-latency vector database for AI applications",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
@@ -624,6 +624,8 @@ function validateSchemaEmbeddings(
  }
 
  if (missingEmbeddingFields.length > 0 && embeddings === undefined) {
+    console.log({ missingEmbeddingFields, embeddings });
+
    throw new Error(
      `Table has embeddings: "${missingEmbeddingFields
        .map((f) => f.name)
@@ -631,5 +633,5 @@ function validateSchemaEmbeddings(
    );
  }
 
-  return new Schema(fields, schema.metadata);
+  return new Schema(fields);
 }
@@ -419,31 +419,3 @@ describe("when dealing with versioning", () => {
    );
  });
});
-
-describe("when optimizing a dataset", () => {
-  let tmpDir: tmp.DirResult;
-  let table: Table;
-  beforeEach(async () => {
-    tmpDir = tmp.dirSync({ unsafeCleanup: true });
-    const con = await connect(tmpDir.name);
-    table = await con.createTable("vectors", [{ id: 1 }]);
-    await table.add([{ id: 2 }]);
-  });
-  afterEach(() => {
-    tmpDir.removeCallback();
-  });
-
-  it("compacts files", async () => {
-    const stats = await table.optimize();
-    expect(stats.compaction.filesAdded).toBe(1);
-    expect(stats.compaction.filesRemoved).toBe(2);
-    expect(stats.compaction.fragmentsAdded).toBe(1);
-    expect(stats.compaction.fragmentsRemoved).toBe(2);
-  });
-
-  it("cleanups old versions", async () => {
-    const stats = await table.optimize({ cleanupOlderThan: new Date() });
-    expect(stats.prune.bytesRemoved).toBeGreaterThan(0);
-    expect(stats.prune.oldVersionsRemoved).toBe(3);
-  });
-});
@@ -677,6 +677,8 @@ function validateSchemaEmbeddings(
  }
 
  if (missingEmbeddingFields.length > 0 && embeddings === undefined) {
+    console.log({ missingEmbeddingFields, embeddings });
+
    throw new Error(
      `Table has embeddings: "${missingEmbeddingFields
        .map((f) => f.name)
@@ -684,5 +686,5 @@ function validateSchemaEmbeddings(
    );
  }
 
-  return new Schema(fields, schema.metadata);
+  return new Schema(fields);
 }
@@ -19,7 +19,6 @@ import {
  AddColumnsSql,
  ColumnAlteration,
  IndexConfig,
-  OptimizeStats,
  Table as _NativeTable,
} from "./native";
import { Query, VectorQuery } from "./query";
@@ -51,23 +50,6 @@ export interface UpdateOptions {
  where: string;
}
 
-export interface OptimizeOptions {
-  /**
-   * If set then all versions older than the given date
-   * be removed. The current version will never be removed.
-   * The default is 7 days
-   * @example
-   * // Delete all versions older than 1 day
-   * const olderThan = new Date();
-   * olderThan.setDate(olderThan.getDate() - 1));
-   * tbl.cleanupOlderVersions(olderThan);
-   *
-   * // Delete all versions except the current version
-   * tbl.cleanupOlderVersions(new Date());
-   */
-  cleanupOlderThan: Date;
-}
-
/**
 * A Table is a collection of Records in a LanceDB Database.
 *
@@ -370,48 +352,6 @@ export class Table {
    await this.inner.restore();
  }
 
-  /**
-   * Optimize the on-disk data and indices for better performance.
-   *
-   * Modeled after ``VACUUM`` in PostgreSQL.
-   *
-   * Optimization covers three operations:
-   *
-   * - Compaction: Merges small files into larger ones
-   * - Prune: Removes old versions of the dataset
-   * - Index: Optimizes the indices, adding new data to existing indices
-   *
-   *
-   * Experimental API
-   * ----------------
-   *
-   * The optimization process is undergoing active development and may change.
-   * Our goal with these changes is to improve the performance of optimization and
-   * reduce the complexity.
-   *
-   * That being said, it is essential today to run optimize if you want the best
-   * performance. It should be stable and safe to use in production, but it our
-   * hope that the API may be simplified (or not even need to be called) in the
-   * future.
-   *
-   * The frequency an application shoudl call optimize is based on the frequency of
-   * data modifications. If data is frequently added, deleted, or updated then
-   * optimize should be run frequently. A good rule of thumb is to run optimize if
-   * you have added or modified 100,000 or more records or run more than 20 data
-   * modification operations.
-   */
-  async optimize(options?: Partial<OptimizeOptions>): Promise<OptimizeStats> {
-    let cleanupOlderThanMs;
-    if (
-      options?.cleanupOlderThan !== undefined &&
-      options?.cleanupOlderThan !== null
-    ) {
-      cleanupOlderThanMs =
-        new Date().getTime() - options.cleanupOlderThan.getTime();
-    }
-    return await this.inner.optimize(cleanupOlderThanMs);
-  }
-
  /** List all indices that have been created with {@link Table.createIndex} */
  async listIndices(): Promise<IndexConfig[]> {
    return await this.inner.listIndices();
@@ -1,6 +1,6 @@
 {
  "name": "@lancedb/lancedb-darwin-arm64",
-  "version": "0.5.0",
+  "version": "0.4.20",
  "os": ["darwin"],
  "cpu": ["arm64"],
  "main": "lancedb.darwin-arm64.node",
@@ -1,6 +1,6 @@
 {
  "name": "@lancedb/lancedb-darwin-x64",
-  "version": "0.5.0",
+  "version": "0.4.20",
  "os": ["darwin"],
  "cpu": ["x64"],
  "main": "lancedb.darwin-x64.node",
@@ -1,6 +1,6 @@
 {
  "name": "@lancedb/lancedb-linux-arm64-gnu",
-  "version": "0.5.0",
+  "version": "0.4.20",
  "os": ["linux"],
  "cpu": ["arm64"],
  "main": "lancedb.linux-arm64-gnu.node",
@@ -1,6 +1,6 @@
 {
  "name": "@lancedb/lancedb-linux-x64-gnu",
-  "version": "0.5.0",
+  "version": "0.4.20",
  "os": ["linux"],
  "cpu": ["x64"],
  "main": "lancedb.linux-x64-gnu.node",
@@ -1,6 +1,6 @@
 {
  "name": "@lancedb/lancedb-win32-x64-msvc",
-  "version": "0.5.0",
+  "version": "0.4.14",
  "os": ["win32"],
  "cpu": ["x64"],
  "main": "lancedb.win32-x64-msvc.node",
@@ -1,6 +1,6 @@
 {
  "name": "@lancedb/lancedb",
-  "version": "0.5.0",
+  "version": "0.4.20",
  "main": "./dist/index.js",
  "types": "./dist/index.d.ts",
  "napi": {
@@ -15,8 +15,8 @@
 use arrow_ipc::writer::FileWriter;
 use lancedb::ipc::ipc_file_to_batches;
 use lancedb::table::{
-    AddDataMode, ColumnAlteration as LanceColumnAlteration, Duration, NewColumnTransform,
-    OptimizeAction, OptimizeOptions, Table as LanceDbTable,
+    AddDataMode, ColumnAlteration as LanceColumnAlteration, NewColumnTransform,
+    Table as LanceDbTable,
 };
 use napi::bindgen_prelude::*;
 use napi_derive::napi;
@@ -263,60 +263,6 @@ impl Table {
        self.inner_ref()?.restore().await.default_error()
    }
 
-    #[napi]
-    pub async fn optimize(&self, older_than_ms: Option<i64>) -> napi::Result<OptimizeStats> {
-        let inner = self.inner_ref()?;
-
-        let older_than = if let Some(ms) = older_than_ms {
-            if ms == i64::MIN {
-                return Err(napi::Error::from_reason(format!(
-                    "older_than_ms can not be {}",
-                    i32::MIN,
-                )));
-            }
-            Duration::try_milliseconds(ms)
-        } else {
-            None
-        };
-
-        let compaction_stats = inner
-            .optimize(OptimizeAction::Compact {
-                options: lancedb::table::CompactionOptions::default(),
-                remap_options: None,
-            })
-            .await
-            .default_error()?
-            .compaction
-            .unwrap();
-        let prune_stats = inner
-            .optimize(OptimizeAction::Prune {
-                older_than,
-                delete_unverified: None,
-            })
-            .await
-            .default_error()?
-            .prune
-            .unwrap();
-        inner
-            .optimize(lancedb::table::OptimizeAction::Index(
-                OptimizeOptions::default(),
-            ))
-            .await
-            .default_error()?;
-        Ok(OptimizeStats {
-            compaction: CompactionStats {
-                files_added: compaction_stats.files_added as i64,
-                files_removed: compaction_stats.files_removed as i64,
-                fragments_added: compaction_stats.fragments_added as i64,
-                fragments_removed: compaction_stats.fragments_removed as i64,
-            },
-            prune: RemovalStats {
-                bytes_removed: prune_stats.bytes_removed as i64,
-                old_versions_removed: prune_stats.old_versions as i64,
-            },
-        })
-    }
-
    #[napi]
    pub async fn list_indices(&self) -> napi::Result<Vec<IndexConfig>> {
        Ok(self
@@ -352,40 +298,6 @@ impl From<lancedb::index::IndexConfig> for IndexConfig {
    }
}
 
-/// Statistics about a compaction operation.
-#[napi(object)]
-#[derive(Clone, Debug)]
-pub struct CompactionStats {
-    /// The number of fragments removed
-    pub fragments_removed: i64,
-    /// The number of new, compacted fragments added
-    pub fragments_added: i64,
-    /// The number of data files removed
-    pub files_removed: i64,
-    /// The number of new, compacted data files added
-    pub files_added: i64,
-}
-
-/// Statistics about a cleanup operation
-#[napi(object)]
-#[derive(Clone, Debug)]
-pub struct RemovalStats {
-    /// The number of bytes removed
-    pub bytes_removed: i64,
-    /// The number of old versions removed
-    pub old_versions_removed: i64,
-}
-
-/// Statistics about an optimize operation
-#[napi(object)]
-#[derive(Clone, Debug)]
-pub struct OptimizeStats {
-    /// Statistics about the compaction operation
-    pub compaction: CompactionStats,
-    /// Statistics about the removal operation
-    pub prune: RemovalStats,
-}
-
/// A definition of a column alteration. The alteration changes the column at
/// `path` to have the new name `name`, to be nullable if `nullable` is true,
/// and to have the data type `data_type`. At least one of `rename` or `nullable`
python/.bumpversion.cfg (new file, 8 lines)
@@ -0,0 +1,8 @@
+[bumpversion]
+current_version = 0.6.13
+commit = True
+message = [python] Bump version: {current_version} → {new_version}
+tag = True
+tag_name = python-v{new_version}
+
+[bumpversion:file:pyproject.toml]
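For context (sketch): the Python release-commit workflow earlier in this diff runs this config from the python/ directory, so a local dry run would look roughly like:

```shell
cd python
pip install bump2version
bumpversion --verbose minor   # e.g. 0.6.13 -> 0.7.0; commits and tags python-v0.7.0
```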
@@ -1,34 +0,0 @@
-[tool.bumpversion]
-current_version = "0.8.0"
-parse = """(?x)
-    (?P<major>0|[1-9]\\d*)\\.
-    (?P<minor>0|[1-9]\\d*)\\.
-    (?P<patch>0|[1-9]\\d*)
-    (?:-(?P<pre_l>[a-zA-Z-]+)\\.(?P<pre_n>0|[1-9]\\d*))?
-"""
-serialize = [
-    "{major}.{minor}.{patch}-{pre_l}.{pre_n}",
-    "{major}.{minor}.{patch}",
-]
-search = "{current_version}"
-replace = "{new_version}"
-regex = false
-ignore_missing_version = false
-ignore_missing_files = false
-tag = true
-sign_tags = false
-tag_name = "python-v{new_version}"
-tag_message = "Bump version: {current_version} → {new_version}"
-allow_dirty = true
-commit = true
-message = "Bump version: {current_version} → {new_version}"
-commit_args = ""
-
-[tool.bumpversion.parts.pre_l]
-values = ["beta", "final"]
-optional_value = "final"
-
-[[tool.bumpversion.files]]
-filename = "Cargo.toml"
-search = "\nversion = \"{current_version}\""
-replace = "\nversion = \"{new_version}\""
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-python"
-version = "0.8.0"
+version = "0.4.10"
 edition.workspace = true
 description = "Python bindings for LanceDB"
 license.workspace = true
@@ -1,9 +1,9 @@
 [project]
 name = "lancedb"
-# version in Cargo.toml
+version = "0.6.13"
 dependencies = [
    "deprecation",
-    "pylance==0.11.0",
+    "pylance==0.10.12",
    "ratelimiter~=1.0",
    "requests>=2.31.0",
    "retry>=0.9.2",
@@ -86,17 +86,3 @@ class VectorQuery:
    def refine_factor(self, refine_factor: int): ...
    def nprobes(self, nprobes: int): ...
    def bypass_vector_index(self): ...
-
-class CompactionStats:
-    fragments_removed: int
-    fragments_added: int
-    files_removed: int
-    files_added: int
-
-class RemovalStats:
-    bytes_removed: int
-    old_versions_removed: int
-
-class OptimizeStats:
-    compaction: CompactionStats
-    prune: RemovalStats
@@ -58,7 +58,7 @@ if TYPE_CHECKING:
    import PIL
    from lance.dataset import CleanupStats, ReaderLike
 
-    from ._lancedb import Table as LanceDBTable, OptimizeStats
+    from ._lancedb import Table as LanceDBTable
    from .db import LanceDBConnection
    from .index import BTree, IndexConfig, IvfPq
 
@@ -2377,49 +2377,6 @@ class AsyncTable:
         """
         await self._inner.restore()
 
-    async def optimize(
-        self, *, cleanup_older_than: Optional[timedelta] = None
-    ) -> OptimizeStats:
-        """
-        Optimize the on-disk data and indices for better performance.
-
-        Modeled after ``VACUUM`` in PostgreSQL.
-
-        Optimization covers three operations:
-
-        * Compaction: Merges small files into larger ones
-        * Prune: Removes old versions of the dataset
-        * Index: Optimizes the indices, adding new data to existing indices
-
-        Parameters
-        ----------
-        cleanup_older_than: timedelta, optional default 7 days
-            All files belonging to versions older than this will be removed. Set
-            to 0 days to remove all versions except the latest. The latest version
-            is never removed.
-
-        Experimental API
-        ----------------
-
-        The optimization process is undergoing active development and may change.
-        Our goal with these changes is to improve the performance of optimization and
-        reduce the complexity.
-
-        That being said, it is essential today to run optimize if you want the best
-        performance. It should be stable and safe to use in production, but it is our
-        hope that the API may be simplified (or not even need to be called) in the
-        future.
-
-        The frequency an application should call optimize is based on the frequency of
-        data modifications. If data is frequently added, deleted, or updated then
-        optimize should be run frequently. A good rule of thumb is to run optimize if
-        you have added or modified 100,000 or more records or run more than 20 data
-        modification operations.
-        """
-        if cleanup_older_than is not None:
-            cleanup_older_than = round(cleanup_older_than.total_seconds() * 1000)
-        return await self._inner.optimize(cleanup_older_than)
-
     async def list_indices(self) -> IndexConfig:
         """
         List all indices that have been created with Self::create_index
@@ -1025,29 +1025,3 @@ async def test_time_travel(db_async: AsyncConnection):
     # Can't use restore if not checked out
     with pytest.raises(ValueError, match="checkout before running restore"):
         await table.restore()
-
-
-@pytest.mark.asyncio
-async def test_optimize(db_async: AsyncConnection):
-    table = await db_async.create_table(
-        "test",
-        data=[{"x": [1]}],
-    )
-    await table.add(
-        data=[
-            {"x": [2]},
-        ],
-    )
-    stats = await table.optimize()
-    assert stats.compaction.files_removed == 2
-    assert stats.compaction.files_added == 1
-    assert stats.compaction.fragments_added == 1
-    assert stats.compaction.fragments_removed == 2
-    assert stats.prune.bytes_removed == 0
-    assert stats.prune.old_versions_removed == 0
-
-    stats = await table.optimize(cleanup_older_than=timedelta(seconds=0))
-    assert stats.prune.bytes_removed > 0
-    assert stats.prune.old_versions_removed == 3
-
-    assert await table.query().to_arrow() == pa.table({"x": [[1], [2]]})
@@ -2,9 +2,7 @@ use arrow::{
     ffi_stream::ArrowArrayStreamReader,
     pyarrow::{FromPyArrow, ToPyArrow},
 };
-use lancedb::table::{
-    AddDataMode, Duration, OptimizeAction, OptimizeOptions, Table as LanceDbTable,
-};
+use lancedb::table::{AddDataMode, Table as LanceDbTable};
 use pyo3::{
     exceptions::{PyRuntimeError, PyValueError},
     pyclass, pymethods,
@@ -19,40 +17,6 @@ use crate::{
     query::Query,
 };
 
-/// Statistics about a compaction operation.
-#[pyclass(get_all)]
-#[derive(Clone, Debug)]
-pub struct CompactionStats {
-    /// The number of fragments removed
-    pub fragments_removed: u64,
-    /// The number of new, compacted fragments added
-    pub fragments_added: u64,
-    /// The number of data files removed
-    pub files_removed: u64,
-    /// The number of new, compacted data files added
-    pub files_added: u64,
-}
-
-/// Statistics about a cleanup operation
-#[pyclass(get_all)]
-#[derive(Clone, Debug)]
-pub struct RemovalStats {
-    /// The number of bytes removed
-    pub bytes_removed: u64,
-    /// The number of old versions removed
-    pub old_versions_removed: u64,
-}
-
-/// Statistics about an optimize operation
-#[pyclass(get_all)]
-#[derive(Clone, Debug)]
-pub struct OptimizeStats {
-    /// Statistics about the compaction operation
-    pub compaction: CompactionStats,
-    /// Statistics about the removal operation
-    pub prune: RemovalStats,
-}
-
 #[pyclass]
 pub struct Table {
     // We keep a copy of the name to use if the inner table is dropped
@@ -227,58 +191,4 @@ impl Table {
     pub fn query(&self) -> Query {
         Query::new(self.inner_ref().unwrap().query())
     }
-
-    pub fn optimize(self_: PyRef<'_, Self>, cleanup_since_ms: Option<u64>) -> PyResult<&PyAny> {
-        let inner = self_.inner_ref()?.clone();
-        let older_than = if let Some(ms) = cleanup_since_ms {
-            if ms > i64::MAX as u64 {
-                return Err(PyValueError::new_err(format!(
-                    "cleanup_since_ms must be between {} and -{}",
-                    i32::MAX,
-                    i32::MAX
-                )));
-            }
-            Duration::try_milliseconds(ms as i64)
-        } else {
-            None
-        };
-        future_into_py(self_.py(), async move {
-            let compaction_stats = inner
-                .optimize(OptimizeAction::Compact {
-                    options: lancedb::table::CompactionOptions::default(),
-                    remap_options: None,
-                })
-                .await
-                .infer_error()?
-                .compaction
-                .unwrap();
-            let prune_stats = inner
-                .optimize(OptimizeAction::Prune {
-                    older_than,
-                    delete_unverified: None,
-                })
-                .await
-                .infer_error()?
-                .prune
-                .unwrap();
-            inner
-                .optimize(lancedb::table::OptimizeAction::Index(
-                    OptimizeOptions::default(),
-                ))
-                .await
-                .infer_error()?;
-            Ok(OptimizeStats {
-                compaction: CompactionStats {
-                    files_added: compaction_stats.files_added as u64,
-                    files_removed: compaction_stats.files_removed as u64,
-                    fragments_added: compaction_stats.fragments_added as u64,
-                    fragments_removed: compaction_stats.fragments_removed as u64,
-                },
-                prune: RemovalStats {
-                    bytes_removed: prune_stats.bytes_removed,
-                    old_versions_removed: prune_stats.old_versions,
-                },
-            })
-        })
-    }
 }
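A note on the binding removed above: the Python-facing `optimize()` call was a thin wrapper that ran three separate `OptimizeAction` passes (compact, prune, index) against the underlying Rust table and copied the resulting metrics into the `OptimizeStats` pyclass. Below is a rough, hedged sketch of that same sequence against the public Rust API, using the pre-change (left-hand) signature in which `Prune::older_than` is an `Option<Duration>`; the `optimize_everything` helper name and the `lancedb::Result` alias are illustrative assumptions, not part of this diff.

```rust
use chrono::Duration;
use lancedb::table::{CompactionOptions, OptimizeAction, OptimizeOptions, Table};

// Sketch only: mirrors the order of operations performed by the removed binding.
async fn optimize_everything(table: &Table, older_than: Option<Duration>) -> lancedb::Result<()> {
    // 1. Compaction: merge small data files into larger ones.
    let compaction = table
        .optimize(OptimizeAction::Compact {
            options: CompactionOptions::default(),
            remap_options: None,
        })
        .await?
        .compaction;

    // 2. Prune: drop dataset versions older than the cutoff (the binding
    //    converted milliseconds from Python into a chrono Duration here).
    let prune = table
        .optimize(OptimizeAction::Prune {
            older_than,
            delete_unverified: None,
        })
        .await?
        .prune;

    // 3. Index: fold newly written rows into the existing indices.
    table
        .optimize(OptimizeAction::Index(OptimizeOptions::default()))
        .await?;

    let _ = (compaction, prune); // the binding copied these into its OptimizeStats pyclass
    Ok(())
}
```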
@@ -8,51 +8,6 @@ The Python package is versioned and released separately from the Rust and Node.js
 ones. For Rust and Node.js, the release process is shared between `lancedb` and
 `vectordb` for now.
 
-## Preview releases
-
-LanceDB has full releases about every 2 weeks, but in between we make frequent
-preview releases. These are released as `0.x.y.betaN` versions. They receive the
-same level of testing as normal releases and let you get access to the latest
-features. However, we do not guarantee that preview releases will be available
-more than 6 months after they are released. We may delete the preview releases
-from the packaging index after a while. Once your application is stable, we
-recommend switching to full releases, which will never be removed from package
-indexes.
-
-## Making releases
-
-The release process uses a handful of GitHub actions to automate the process.
-
-```text
-┌─────────────────────┐
-│Create Release Commit│
-└─┬───────────────────┘
-│ ┌────────────┐ ┌──►Python GH Release
-├──►(tag) python-vX.Y.Z ───►│PyPI Publish├─┤
-│ └────────────┘ └──►Python Wheels
-│
-│ ┌───────────┐
-└──►(tag) vX.Y.Z ───┬──────►│NPM Publish├──┬──►Rust/Node GH Release
-│ └───────────┘ │
-│ └──►NPM Packages
-│ ┌─────────────┐
-└──────►│Cargo Publish├───►Cargo Release
-└─────────────┘
-```
-
-To start a release, trigger a `Create Release Commit` action from
-[the workflows page](https://github.com/lancedb/lancedb/actions/workflows/make-release-commit.yml)
-(Click on "Run workflow").
-
-* **For a preview release**, leave the default parameters.
-* **For a stable release**, set the `release_type` input to `stable`.
-
-> [!IMPORTANT]
-> If there was a breaking change since the last stable release, and we haven't
-> done so yet, we should increment the minor version. The CI will detect if this
-> is needed and fail the `Create Release Commit` job. To fix, select the
-> "bump minor version" option.
-
 ## Breaking changes
 
 We try to avoid breaking changes, but sometimes they are necessary. When there
@@ -66,10 +21,12 @@ body of the PR. A CI job will add a `breaking-change` label to the PR, which is
 what will ultimately be used to CI to determine if the minor version should be
 incremented.
 
-> [!IMPORTANT]
-> Reviewers should check that PRs with breaking changes receive the `breaking-change`
-> label. If a PR is missing the label, please add it, even if after it was merged.
-> This label is used in the release process.
+A CI job will validate that if a `breaking-change` label is added, the minor
+version is incremented in the `Cargo.toml` and `pyproject.toml` files. The only
+exception is if it has already been incremented since the last stable release.
+
+**It is the responsibility of the PR author to increment the minor version when
+appropriate.**
 
 Some things that are considered breaking changes:
 
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb-node"
-version = "0.5.0"
+version = "0.4.20"
 description = "Serverless, low-latency vector database for AI applications"
 license.workspace = true
 edition.workspace = true
@@ -19,12 +19,10 @@ use snafu::Snafu;
 
 #[derive(Debug, Snafu)]
 pub enum Error {
-    #[allow(dead_code)]
     #[snafu(display("column '{name}' is missing"))]
     MissingColumn { name: String },
     #[snafu(display("{name}: {message}"))]
     OutOfRange { name: String, message: String },
-    #[allow(dead_code)]
     #[snafu(display("{index_type} is not a valid index type"))]
     InvalidIndexType { index_type: String },
 
@@ -19,7 +19,6 @@ use neon::prelude::*;
 pub trait JsObjectExt {
     fn get_opt_u32(&self, cx: &mut FunctionContext, key: &str) -> Result<Option<u32>>;
     fn get_usize(&self, cx: &mut FunctionContext, key: &str) -> Result<usize>;
-    #[allow(dead_code)]
     fn get_opt_usize(&self, cx: &mut FunctionContext, key: &str) -> Result<Option<usize>>;
 }
 
@@ -324,7 +324,7 @@ impl JsTable {
         rt.spawn(async move {
            let stats = table
                .optimize(OptimizeAction::Prune {
-                    older_than: Some(older_than),
+                    older_than,
                    delete_unverified,
                })
                .await;
@@ -1,6 +1,6 @@
 [package]
 name = "lancedb"
-version = "0.5.0"
+version = "0.4.20"
 edition.workspace = true
 description = "LanceDB: A serverless, low-latency vector database for AI applications"
 license.workspace = true
@@ -40,8 +40,8 @@ serde = { version = "^1" }
 serde_json = { version = "1" }
 # For remote feature
 reqwest = { version = "0.11.24", features = ["gzip", "json"], optional = true }
-polars-arrow = { version = ">=0.37,<0.40.0", optional = true }
-polars = { version = ">=0.37,<0.40.0", optional = true}
+polars-arrow = { version = ">=0.37", optional = true }
+polars = { version = ">=0.37", optional = true}
 
 [dev-dependencies]
 tempfile = "3.5.0"
@@ -49,12 +49,9 @@ rand = { version = "0.8.3", features = ["small_rng"] }
 uuid = { version = "1.7.0", features = ["v4"] }
 walkdir = "2"
 # For s3 integration tests (dev deps aren't allowed to be optional atm)
-# We pin these because the content-length check breaks with localstack
-# https://github.com/smithy-lang/smithy-rs/releases/tag/release-2024-05-21
-aws-sdk-s3 = { version = "=1.23.0" }
-aws-sdk-kms = { version = "=1.21.0" }
+aws-sdk-s3 = { version = "1.0" }
+aws-sdk-kms = { version = "1.0" }
 aws-config = { version = "1.0" }
-aws-smithy-runtime = { version = "=1.3.0" }
 
 [features]
 default = []
@@ -195,7 +195,7 @@ impl<T: IntoArrow> CreateTableBuilder<true, T> {
             .embedding_registry()
             .get(&definition.embedding_name)
             .ok_or_else(|| Error::EmbeddingFunctionNotFound {
-                name: definition.embedding_name.clone(),
+                name: definition.embedding_name.to_string(),
                 reason: "No embedding function found in the connection's embedding_registry"
                     .to_string(),
             })?;
@@ -155,7 +155,7 @@ impl<R: RecordBatchReader> MaybeEmbedded<R> {
             }
             None => {
                 return Err(Error::EmbeddingFunctionNotFound {
-                    name: embedding_def.embedding_name.clone(),
+                    name: embedding_def.embedding_name.to_string(),
                     reason: format!(
                         "Table was defined with an embedding column `{}` but no embedding function was found with that name within the registry.",
                         embedding_def.embedding_name
@@ -16,10 +16,7 @@ use std::sync::Arc;
 
 use crate::{table::TableInternal, Result};
 
-use self::{
-    scalar::BTreeIndexBuilder,
-    vector::{IvfHnswSqIndexBuilder, IvfPqIndexBuilder},
-};
+use self::{scalar::BTreeIndexBuilder, vector::IvfPqIndexBuilder};
 
 pub mod scalar;
 pub mod vector;
@@ -28,7 +25,6 @@ pub enum Index {
     Auto,
     BTree(BTreeIndexBuilder),
     IvfPq(IvfPqIndexBuilder),
-    IvfHnswSq(IvfHnswSqIndexBuilder),
 }
 
 /// Builder for the create_index operation
@@ -69,7 +65,6 @@ impl IndexBuilder {
 #[derive(Debug, Clone, PartialEq)]
 pub enum IndexType {
     IvfPq,
-    IvfHnswSq,
     BTree,
 }
 
@@ -83,14 +83,10 @@ pub struct VectorIndexStatistics {
 #[derive(Debug, Clone)]
 pub struct IvfPqIndexBuilder {
     pub(crate) distance_type: DistanceType,
-
-    // IVF
     pub(crate) num_partitions: Option<u32>,
+    pub(crate) num_sub_vectors: Option<u32>,
     pub(crate) sample_rate: u32,
     pub(crate) max_iterations: u32,
-
-    // PQ
-    pub(crate) num_sub_vectors: Option<u32>,
 }
 
 impl Default for IvfPqIndexBuilder {
|
|||||||
1
|
1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Builder for an IVF_HNSW_SQ index.
|
|
||||||
///
|
|
||||||
/// This index is a combination of IVF and HNSW.
|
|
||||||
/// The IVF part is the same as the IVF PQ index.
|
|
||||||
/// For each IVF partition, this builds a HNSW graph, the graph is used to
|
|
||||||
/// quickly find the closest vectors to a query vector.
|
|
||||||
///
|
|
||||||
/// The SQ (scalar quantizer) is used to compress the vectors,
|
|
||||||
/// each vector is mapped to a 8-bit integer vector, 4x compression ratio for float32 vector.
|
|
||||||
#[derive(Debug, Clone)]
|
|
||||||
pub struct IvfHnswSqIndexBuilder {
|
|
||||||
// IVF
|
|
||||||
pub(crate) distance_type: DistanceType,
|
|
||||||
pub(crate) num_partitions: Option<u32>,
|
|
||||||
pub(crate) sample_rate: u32,
|
|
||||||
pub(crate) max_iterations: u32,
|
|
||||||
|
|
||||||
// HNSW
|
|
||||||
pub(crate) m: u32,
|
|
||||||
pub(crate) ef_construction: u32,
|
|
||||||
// SQ
|
|
||||||
// TODO add num_bits for SQ after it supports another num_bits besides 8
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for IvfHnswSqIndexBuilder {
|
|
||||||
fn default() -> Self {
|
|
||||||
Self {
|
|
||||||
distance_type: DistanceType::L2,
|
|
||||||
num_partitions: None,
|
|
||||||
sample_rate: 256,
|
|
||||||
max_iterations: 50,
|
|
||||||
m: 20,
|
|
||||||
ef_construction: 300,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl IvfHnswSqIndexBuilder {
|
|
||||||
/// [DistanceType] to use to build the index.
|
|
||||||
///
|
|
||||||
/// Default value is [DistanceType::L2].
|
|
||||||
///
|
|
||||||
/// This is used when training the index to calculate the IVF partitions (vectors are
|
|
||||||
/// grouped in partitions with similar vectors according to this distance type)
|
|
||||||
///
|
|
||||||
/// The metric type used to train an index MUST match the metric type used to search the
|
|
||||||
/// index. Failure to do so will yield inaccurate results.
|
|
||||||
///
|
|
||||||
/// Now IVF_HNSW_SQ only supports L2 and Cosine distance types.
|
|
||||||
pub fn distance_type(mut self, distance_type: DistanceType) -> Self {
|
|
||||||
self.distance_type = distance_type;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The number of IVF partitions to create.
|
|
||||||
///
|
|
||||||
/// This value should generally scale with the number of rows in the dataset. By default
|
|
||||||
/// the number of partitions is the square root of the number of rows.
|
|
||||||
///
|
|
||||||
/// If this value is too large then the first part of the search (picking the right partition)
|
|
||||||
/// will be slow. If this value is too small then the second part of the search (searching
|
|
||||||
/// within a partition) will be slow.
|
|
||||||
pub fn num_partitions(mut self, num_partitions: u32) -> Self {
|
|
||||||
self.num_partitions = Some(num_partitions);
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The rate used to calculate the number of training vectors for kmeans and SQ.
|
|
||||||
///
|
|
||||||
/// When an IVF_HNSW_SQ index is trained, we need to calculate partitions and min/max value of vectors. These are groups
|
|
||||||
/// of vectors that are similar to each other. To do this we use an algorithm called kmeans.
|
|
||||||
///
|
|
||||||
/// Running kmeans on a large dataset can be slow. To speed this up we run kmeans on a
|
|
||||||
/// random sample of the data. This parameter controls the size of the sample. The total
|
|
||||||
/// number of vectors used to train the IVF is `sample_rate * num_partitions`.
|
|
||||||
///
|
|
||||||
/// The total number of vectors used to train the SQ is `sample_rate * 2^{num_bits}`.
|
|
||||||
///
|
|
||||||
/// Increasing this value might improve the quality of the index but in most cases the
|
|
||||||
/// default should be sufficient.
|
|
||||||
///
|
|
||||||
/// The default value is 256.
|
|
||||||
pub fn sample_rate(mut self, sample_rate: u32) -> Self {
|
|
||||||
self.sample_rate = sample_rate;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Max iterations to train kmeans.
|
|
||||||
///
|
|
||||||
/// When training an IVF index we use kmeans to calculate the partitions. This parameter
|
|
||||||
/// controls how many iterations of kmeans to run.
|
|
||||||
///
|
|
||||||
/// Increasing this might improve the quality of the index but in most cases the parameter
|
|
||||||
/// is unused because kmeans will converge with fewer iterations. The parameter is only
|
|
||||||
/// used in cases where kmeans does not appear to converge. In those cases it is unlikely
|
|
||||||
/// that setting this larger will lead to the index converging anyways.
|
|
||||||
///
|
|
||||||
/// The default value is 50.
|
|
||||||
pub fn max_iterations(mut self, max_iterations: u32) -> Self {
|
|
||||||
self.max_iterations = max_iterations;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The number of neighbors to select for each vector in the HNSW graph.
|
|
||||||
/// Bumping this number will increase the recall of the search but also increase the build/search time.
|
|
||||||
/// The default value is 20.
|
|
||||||
pub fn m(mut self, m: u32) -> Self {
|
|
||||||
self.m = m;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The number of candidates to evaluate during the construction of the HNSW graph.
|
|
||||||
/// Bumping this number will increase the recall of the search but also increase the build/search time.
|
|
||||||
/// This value should be not less than `ef` in the search phase.
|
|
||||||
/// The default value is 300.
|
|
||||||
pub fn ef_construction(mut self, ef_construction: u32) -> Self {
|
|
||||||
self.ef_construction = ef_construction;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
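For context on the builder removed above: it was configured through chained setters and then handed to `create_index` as `Index::IvfHnswSq`, which is exactly what the deleted `test_create_index_ivf_hnsw_sq` test further down in this diff does. A hedged sketch of that usage follows; the table handle, column name, parameter values, and the `lancedb::Result` alias are illustrative assumptions, not part of this diff.

```rust
use lancedb::index::{vector::IvfHnswSqIndexBuilder, Index};
use lancedb::table::Table;

// Sketch only: build an IVF_HNSW_SQ index on an existing "embeddings" column.
async fn build_hnsw_index(table: &Table) -> lancedb::Result<()> {
    let index = IvfHnswSqIndexBuilder::default()
        .num_partitions(256)    // roughly sqrt(row count) is the usual guidance
        .sample_rate(256)       // training sample size per partition
        .max_iterations(50)     // kmeans iteration cap
        .m(20)                  // HNSW graph degree
        .ef_construction(300);  // HNSW build-time candidate list size
    table
        .create_index(&["embeddings"], Index::IvfHnswSq(index))
        .execute()
        .await
}
```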
@@ -238,9 +238,6 @@ pub enum DistanceType {
     /// distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
     /// L2 norm is 1), then dot distance is equivalent to the cosine distance.
     Dot,
-    /// Hamming distance. Hamming distance is a distance metric that measures
-    /// the number of positions at which the corresponding elements are different.
-    Hamming,
 }
 
 impl From<DistanceType> for LanceDistanceType {
@@ -249,7 +246,6 @@ impl From<DistanceType> for LanceDistanceType {
             DistanceType::L2 => Self::L2,
             DistanceType::Cosine => Self::Cosine,
             DistanceType::Dot => Self::Dot,
-            DistanceType::Hamming => Self::Hamming,
         }
     }
 }
@@ -260,7 +256,6 @@ impl From<LanceDistanceType> for DistanceType {
             LanceDistanceType::L2 => Self::L2,
             LanceDistanceType::Cosine => Self::Cosine,
             LanceDistanceType::Dot => Self::Dot,
-            LanceDistanceType::Hamming => Self::Hamming,
         }
     }
 }
@@ -23,9 +23,12 @@ use arrow::datatypes::Float32Type;
 use arrow_array::{RecordBatchIterator, RecordBatchReader};
 use arrow_schema::{DataType, Field, Schema, SchemaRef};
 use async_trait::async_trait;
+use chrono::Duration;
 use lance::dataset::builder::DatasetBuilder;
 use lance::dataset::cleanup::RemovalStats;
-use lance::dataset::optimize::{compact_files, CompactionMetrics, IndexRemapperOptions};
+use lance::dataset::optimize::{
+    compact_files, CompactionMetrics, CompactionOptions, IndexRemapperOptions,
+};
 use lance::dataset::scanner::{DatasetRecordBatchStream, Scanner};
 pub use lance::dataset::ColumnAlteration;
 pub use lance::dataset::NewColumnTransform;
@@ -35,11 +38,8 @@ use lance::dataset::{
 };
 use lance::dataset::{MergeInsertBuilder as LanceMergeInsertBuilder, WhenNotMatchedBySource};
 use lance::io::WrappingObjectStore;
-use lance_index::vector::hnsw::builder::HnswBuildParams;
-use lance_index::vector::ivf::IvfBuildParams;
-use lance_index::vector::sq::builder::SQBuildParams;
-use lance_index::DatasetIndexExt;
 use lance_index::IndexType;
+use lance_index::{optimize::OptimizeOptions, DatasetIndexExt};
 use log::info;
 use serde::{Deserialize, Serialize};
 use snafu::whatever;
@@ -48,9 +48,7 @@ use crate::arrow::IntoArrow;
 use crate::connection::NoData;
 use crate::embeddings::{EmbeddingDefinition, EmbeddingRegistry, MaybeEmbedded, MemoryRegistry};
 use crate::error::{Error, Result};
-use crate::index::vector::{
-    IvfHnswSqIndexBuilder, IvfPqIndexBuilder, VectorIndex, VectorIndexStatistics,
-};
+use crate::index::vector::{IvfPqIndexBuilder, VectorIndex, VectorIndexStatistics};
 use crate::index::IndexConfig;
 use crate::index::{
     vector::{suggested_num_partitions, suggested_num_sub_vectors},
@@ -67,10 +65,6 @@ use self::merge::MergeInsertBuilder;
 pub(crate) mod dataset;
 pub mod merge;
-
-pub use chrono::Duration;
-pub use lance::dataset::optimize::CompactionOptions;
-pub use lance_index::optimize::OptimizeOptions;
 
 /// Defines the type of column
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub enum ColumnKind {
@@ -151,58 +145,22 @@ impl TableDefinition {
 ///
 /// By default, it optimizes everything, as [`OptimizeAction::All`].
 pub enum OptimizeAction {
-    /// Run all optimizations with default values
+    /// Run optimization on every, with default options.
     All,
-    /// Compacts files in the dataset
-    ///
-    /// LanceDb uses a readonly filesystem for performance and safe concurrency. Every time
-    /// new data is added it will be added into new files. Small files
-    /// can hurt both read and write performance. Compaction will merge small files
-    /// into larger ones.
-    ///
-    /// All operations that modify data (add, delete, update, merge insert, etc.) will create
-    /// new files. If these operations are run frequently then compaction should run frequently.
-    ///
-    /// If these operations are never run (search only) then compaction is not necessary.
+    /// Compact files in the dataset
    Compact {
        options: CompactionOptions,
        remap_options: Option<Arc<dyn IndexRemapperOptions>>,
    },
-    /// Prune old version of datasets
-    ///
-    /// Every change in LanceDb is additive. When data is removed from a dataset a new version is
-    /// created that doesn't contain the removed data. However, the old version, which does contain
-    /// the removed data, is left in place. This is necessary for consistency and concurrency and
-    /// also enables time travel functionality like the ability to checkout an older version of the
-    /// dataset to undo changes.
-    ///
-    /// Over time, these old versions can consume a lot of disk space. The prune operation will
-    /// remove versions of the dataset that are older than a certain age. This will free up the
-    /// space used by that old data.
-    ///
-    /// Once a version is pruned it can no longer be checked out.
+    /// Prune old version of datasets.
    Prune {
        /// The duration of time to keep versions of the dataset.
-        older_than: Option<Duration>,
+        older_than: Duration,
        /// Because they may be part of an in-progress transaction, files newer than 7 days old are not deleted by default.
        /// If you are sure that there are no in-progress transactions, then you can set this to True to delete all files older than `older_than`.
        delete_unverified: Option<bool>,
    },
-    /// Optimize the indices
-    ///
-    /// This operation optimizes all indices in the table. When new data is added to LanceDb
-    /// it is not added to the indices. However, it can still turn up in searches because the search
-    /// function will scan both the indexed data and the unindexed data in parallel. Over time, the
-    /// unindexed data can become large enough that the search performance is slow. This operation
-    /// will add the unindexed data to the indices without rerunning the full index creation process.
-    ///
-    /// Optimizing an index is faster than re-training the index but it does not typically adjust the
-    /// underlying model relied upon by the index. This can eventually lead to poor search accuracy
-    /// and so users may still want to occasionally retrain the index after adding a large amount of
-    /// data.
-    ///
-    /// For example, when using IVF, an index will create clusters. Optimizing an index assigns unindexed
-    /// data to the existing clusters, but it does not move the clusters or create new clusters.
+    /// Optimize index.
    Index(OptimizeOptions),
 }
 
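To make the `Prune` variant documented above concrete, the sketch below removes versions older than 30 days and reads back the removal statistics. It follows the pre-change (left-hand) signature of this diff, where `older_than` is an `Option<Duration>`; on the right-hand side it is a plain `Duration`. The helper name, the 30-day cutoff, and the `lancedb::Result` alias are illustrative assumptions.

```rust
use chrono::Duration;
use lancedb::table::{OptimizeAction, Table};

// Sketch only: prune old dataset versions while keeping the last 30 days for
// time travel, then report what was reclaimed.
async fn prune_old_versions(table: &Table) -> lancedb::Result<()> {
    let stats = table
        .optimize(OptimizeAction::Prune {
            older_than: Some(Duration::try_days(30).expect("valid duration")),
            delete_unverified: None, // keep files < 7 days old, the safe default
        })
        .await?;
    if let Some(prune) = stats.prune {
        println!(
            "removed {} bytes across {} old versions",
            prune.bytes_removed, prune.old_versions
        );
    }
    Ok(())
}
```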
@@ -354,7 +312,6 @@ impl UpdateBuilder {
 
 #[async_trait]
 pub(crate) trait TableInternal: std::fmt::Display + std::fmt::Debug + Send + Sync {
-    #[allow(dead_code)]
     fn as_any(&self) -> &dyn std::any::Any;
     /// Cast as [`NativeTable`], or return None if it is not a [`NativeTable`].
     fn as_native(&self) -> Option<&NativeTable>;
@@ -794,30 +751,10 @@ impl Table {
 
     /// Optimize the on-disk data and indices for better performance.
     ///
-    /// Modeled after ``VACUUM`` in PostgreSQL.
-    ///
-    /// Optimization is discussed in more detail in the [OptimizeAction] documentation
-    /// and covers three operations:
-    ///
-    /// * Compaction: Merges small files into larger ones
-    /// * Prune: Removes old versions of the dataset
-    /// * Index: Optimizes the indices, adding new data to existing indices
-    ///
     /// <section class="warning">Experimental API</section>
     ///
-    /// The optimization process is undergoing active development and may change.
-    /// Our goal with these changes is to improve the performance of optimization and
-    /// reduce the complexity.
-    ///
-    /// That being said, it is essential today to run optimize if you want the best
-    /// performance. It should be stable and safe to use in production, but it is our
-    /// hope that the API may be simplified (or not even need to be called) in the future.
-    ///
-    /// The frequency an application should call optimize is based on the frequency of
-    /// data modifications. If data is frequently added, deleted, or updated then
-    /// optimize should be run frequently. A good rule of thumb is to run optimize if
-    /// you have added or modified 100,000 or more records or run more than 20 data
-    /// modification operations.
+    /// Modeled after ``VACUUM`` in PostgreSQL.
+    /// Not all implementations support explicit optimization.
     pub async fn optimize(&self, action: OptimizeAction) -> Result<OptimizeStats> {
         self.inner.optimize(action).await
     }
@@ -1301,6 +1238,7 @@ impl NativeTable {
             num_partitions as usize,
             /*num_bits=*/ 8,
             num_sub_vectors as usize,
+            false,
             index.distance_type.into(),
             index.max_iterations as usize,
         );
@@ -1316,57 +1254,6 @@ impl NativeTable {
         Ok(())
     }
 
-    async fn create_ivf_hnsw_sq_index(
-        &self,
-        index: IvfHnswSqIndexBuilder,
-        field: &Field,
-        replace: bool,
-    ) -> Result<()> {
-        if !Self::supported_vector_data_type(field.data_type()) {
-            return Err(Error::InvalidInput {
-                message: format!(
-                    "An IVF HNSW SQ index cannot be created on the column `{}` which has data type {}",
-                    field.name(),
-                    field.data_type()
-                ),
-            });
-        }
-
-        let num_partitions = if let Some(n) = index.num_partitions {
-            n
-        } else {
-            suggested_num_partitions(self.count_rows(None).await?)
-        };
-
-        let mut dataset = self.dataset.get_mut().await?;
-        let mut ivf_params = IvfBuildParams::new(num_partitions as usize);
-        ivf_params.sample_rate = index.sample_rate as usize;
-        ivf_params.max_iters = index.max_iterations as usize;
-        let hnsw_params = HnswBuildParams::default()
-            .num_edges(index.m as usize)
-            .ef_construction(index.ef_construction as usize);
-        let sq_params = SQBuildParams {
-            sample_rate: index.sample_rate as usize,
-            ..Default::default()
-        };
-        let lance_idx_params = lance::index::vector::VectorIndexParams::with_ivf_hnsw_sq_params(
-            index.distance_type.into(),
-            ivf_params,
-            hnsw_params,
-            sq_params,
-        );
-        dataset
-            .create_index(
-                &[field.name()],
-                IndexType::Vector,
-                None,
-                &lance_idx_params,
-                replace,
-            )
-            .await?;
-        Ok(())
-    }
-
     async fn create_auto_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> {
         if Self::supported_vector_data_type(field.data_type()) {
             self.create_ivf_pq_index(IvfPqIndexBuilder::default(), field, opts.replace)
@@ -1610,10 +1497,6 @@ impl TableInternal for NativeTable {
             Index::Auto => self.create_auto_index(field, opts).await,
             Index::BTree(_) => self.create_btree_index(field, opts).await,
             Index::IvfPq(ivf_pq) => self.create_ivf_pq_index(ivf_pq, field, opts.replace).await,
-            Index::IvfHnswSq(ivf_hnsw_sq) => {
-                self.create_ivf_hnsw_sq_index(ivf_hnsw_sq, field, opts.replace)
-                    .await
-            }
         }
     }
 
@@ -1709,7 +1592,7 @@ impl TableInternal for NativeTable {
                     .compaction;
                 stats.prune = self
                     .optimize(OptimizeAction::Prune {
-                        older_than: None,
+                        older_than: Duration::try_days(7).unwrap(),
                        delete_unverified: None,
                    })
                    .await?
@@ -1728,10 +1611,7 @@ impl TableInternal for NativeTable {
                 delete_unverified,
             } => {
                 stats.prune = Some(
-                    self.cleanup_old_versions(
-                        older_than.unwrap_or(Duration::try_days(7).expect("valid delta")),
-                        delete_unverified,
-                    )
+                    self.cleanup_old_versions(older_than, delete_unverified)
                        .await?,
                );
            }
@@ -2477,102 +2357,6 @@ mod tests {
         );
     }
-
-    #[tokio::test]
-    async fn test_create_index_ivf_hnsw_sq() {
-        use arrow_array::RecordBatch;
-        use arrow_schema::{DataType, Field, Schema as ArrowSchema};
-        use rand;
-        use std::iter::repeat_with;
-
-        use arrow_array::Float32Array;
-
-        let tmp_dir = tempdir().unwrap();
-        let uri = tmp_dir.path().to_str().unwrap();
-        let conn = connect(uri).execute().await.unwrap();
-
-        let dimension = 16;
-        let schema = Arc::new(ArrowSchema::new(vec![Field::new(
-            "embeddings",
-            DataType::FixedSizeList(
-                Arc::new(Field::new("item", DataType::Float32, true)),
-                dimension,
-            ),
-            false,
-        )]));
-
-        let mut rng = rand::thread_rng();
-        let float_arr = Float32Array::from(
-            repeat_with(|| rng.gen::<f32>())
-                .take(512 * dimension as usize)
-                .collect::<Vec<f32>>(),
-        );
-
-        let vectors = Arc::new(create_fixed_size_list(float_arr, dimension).unwrap());
-        let batches = RecordBatchIterator::new(
-            vec![RecordBatch::try_new(schema.clone(), vec![vectors.clone()]).unwrap()]
-                .into_iter()
-                .map(Ok),
-            schema,
-        );
-
-        let table = conn.create_table("test", batches).execute().await.unwrap();
-
-        assert_eq!(
-            table
-                .as_native()
-                .unwrap()
-                .count_indexed_rows("my_index")
-                .await
-                .unwrap(),
-            None
-        );
-        assert_eq!(
-            table
-                .as_native()
-                .unwrap()
-                .count_unindexed_rows("my_index")
-                .await
-                .unwrap(),
-            None
-        );
-
-        let index = IvfHnswSqIndexBuilder::default();
-        table
-            .create_index(&["embeddings"], Index::IvfHnswSq(index))
-            .execute()
-            .await
-            .unwrap();
-
-        let index_configs = table.list_indices().await.unwrap();
-        assert_eq!(index_configs.len(), 1);
-        let index = index_configs.into_iter().next().unwrap();
-        assert_eq!(index.index_type, crate::index::IndexType::IvfPq);
-        assert_eq!(index.columns, vec!["embeddings".to_string()]);
-        assert_eq!(table.count_rows(None).await.unwrap(), 512);
-        assert_eq!(table.name(), "test");
-
-        let indices = table.as_native().unwrap().load_indices().await.unwrap();
-        let index_uuid = &indices[0].index_uuid;
-        assert_eq!(
-            table
-                .as_native()
-                .unwrap()
-                .count_indexed_rows(index_uuid)
-                .await
-                .unwrap(),
-            Some(512)
-        );
-        assert_eq!(
-            table
-                .as_native()
-                .unwrap()
-                .count_unindexed_rows(index_uuid)
-                .await
-                .unwrap(),
-            Some(0)
-        );
-    }
 
     fn create_fixed_size_list<T: Array>(values: T, list_size: i32) -> Result<FixedSizeListArray> {
         let list_type = DataType::FixedSizeList(
             Arc::new(Field::new("item", values.data_type().clone(), true)),