mirror of
https://github.com/lancedb/lancedb.git
synced 2025-12-24 05:49:57 +00:00
Compare commits
30 Commits
python-v0.
...
python-v0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
79a1cd60ee | ||
|
|
88807a59a4 | ||
|
|
e0e7e01ea8 | ||
|
|
a416ebc11d | ||
|
|
f941054baf | ||
|
|
1a81c46505 | ||
|
|
82b25a71e9 | ||
|
|
13c613d45f | ||
|
|
e07389a36c | ||
|
|
e7e9e80b1d | ||
|
|
247fb58400 | ||
|
|
504bdc471c | ||
|
|
d617cdef4a | ||
|
|
356d7046fd | ||
|
|
48e5caabda | ||
|
|
d6cc68f671 | ||
|
|
55eacfa685 | ||
|
|
222e3264ab | ||
|
|
13505026cb | ||
|
|
b0800b4b71 | ||
|
|
1befebf614 | ||
|
|
1ab60fae7f | ||
|
|
e921c90c1b | ||
|
|
05a4ea646a | ||
|
|
ebbeeff4e0 | ||
|
|
407ca53f92 | ||
|
|
ff71d7e552 | ||
|
|
2261eb95a0 | ||
|
|
5b397e410b | ||
|
|
b5a39bffec |
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.22.1-beta.1"
|
||||
current_version = "0.22.2-beta.0"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
3
.github/workflows/docs.yml
vendored
3
.github/workflows/docs.yml
vendored
@@ -56,8 +56,9 @@ jobs:
|
||||
with:
|
||||
node-version: 20
|
||||
cache: 'npm'
|
||||
cache-dependency-path: docs/package-lock.json
|
||||
- name: Install node dependencies
|
||||
working-directory: node
|
||||
working-directory: nodejs
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y protobuf-compiler libssl-dev
|
||||
|
||||
2
.github/workflows/nodejs.yml
vendored
2
.github/workflows/nodejs.yml
vendored
@@ -116,7 +116,7 @@ jobs:
|
||||
set -e
|
||||
npm ci
|
||||
npm run docs
|
||||
if ! git diff --exit-code -- . ':(exclude)Cargo.lock'; then
|
||||
if ! git diff --exit-code -- ../ ':(exclude)Cargo.lock'; then
|
||||
echo "Docs need to be updated"
|
||||
echo "Run 'npm run docs', fix any warnings, and commit the changes."
|
||||
exit 1
|
||||
|
||||
4
.github/workflows/pypi-publish.yml
vendored
4
.github/workflows/pypi-publish.yml
vendored
@@ -56,7 +56,7 @@ jobs:
|
||||
pypi_token: ${{ secrets.LANCEDB_PYPI_API_TOKEN }}
|
||||
fury_token: ${{ secrets.FURY_TOKEN }}
|
||||
mac:
|
||||
timeout-minutes: 60
|
||||
timeout-minutes: 90
|
||||
runs-on: ${{ matrix.config.runner }}
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -64,7 +64,7 @@ jobs:
|
||||
- target: x86_64-apple-darwin
|
||||
runner: macos-13
|
||||
- target: aarch64-apple-darwin
|
||||
runner: macos-14
|
||||
runner: warp-macos-14-arm64-6x
|
||||
env:
|
||||
MACOSX_DEPLOYMENT_TARGET: 10.15
|
||||
steps:
|
||||
|
||||
10
.github/workflows/rust.yml
vendored
10
.github/workflows/rust.yml
vendored
@@ -96,6 +96,7 @@ jobs:
|
||||
# Need up-to-date compilers for kernels
|
||||
CC: clang-18
|
||||
CXX: clang++-18
|
||||
GH_TOKEN: ${{ secrets.SOPHON_READ_TOKEN }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
@@ -117,15 +118,14 @@ jobs:
|
||||
sudo chmod 600 /swapfile
|
||||
sudo mkswap /swapfile
|
||||
sudo swapon /swapfile
|
||||
- name: Start S3 integration test environment
|
||||
working-directory: .
|
||||
run: docker compose up --detach --wait
|
||||
- name: Build
|
||||
run: cargo build --all-features --tests --locked --examples
|
||||
- name: Run tests
|
||||
run: cargo test --all-features --locked
|
||||
- name: Run feature tests
|
||||
run: make -C ./lancedb feature-tests
|
||||
- name: Run examples
|
||||
run: cargo run --example simple --locked
|
||||
- name: Run remote tests
|
||||
run: make -C ./lancedb remote-tests
|
||||
|
||||
macos:
|
||||
timeout-minutes: 30
|
||||
|
||||
26
.github/workflows/trigger-vectordb-recipes.yml
vendored
26
.github/workflows/trigger-vectordb-recipes.yml
vendored
@@ -1,26 +0,0 @@
|
||||
name: Trigger vectordb-recipers workflow
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
pull_request:
|
||||
paths:
|
||||
- .github/workflows/trigger-vectordb-recipes.yml
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Trigger vectordb-recipes workflow
|
||||
uses: actions/github-script@v6
|
||||
with:
|
||||
github-token: ${{ secrets.VECTORDB_RECIPES_ACTION_TOKEN }}
|
||||
script: |
|
||||
const result = await github.rest.actions.createWorkflowDispatch({
|
||||
owner: 'lancedb',
|
||||
repo: 'vectordb-recipes',
|
||||
workflow_id: 'examples-test.yml',
|
||||
ref: 'main'
|
||||
});
|
||||
console.log(result);
|
||||
771
Cargo.lock
generated
771
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
32
Cargo.toml
32
Cargo.toml
@@ -15,14 +15,15 @@ categories = ["database-implementations"]
|
||||
rust-version = "1.78.0"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = { "version" = "=0.35.0", default-features = false, "features" = ["dynamodb"], "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-io = { "version" = "=0.35.0", default-features = false, "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-index = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-linalg = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-table = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-testing = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-datafusion = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-encoding = { "version" = "=0.35.0", "tag" = "v0.35.0-beta.4", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance = { "version" = "=0.37.0", default-features = false, "features" = ["dynamodb"], "tag" = "v0.37.1-beta.1", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-io = { "version" = "=0.37.0", default-features = false, "tag" = "v0.37.1-beta.1", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-index = { "version" = "=0.37.0", "tag" = "v0.37.1-beta.1", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-linalg = { "version" = "=0.37.0", "tag" = "v0.37.1-beta.1", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-table = { "version" = "=0.37.0", "tag" = "v0.37.1-beta.1", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-testing = { "version" = "=0.37.0", "tag" = "v0.37.1-beta.1", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-datafusion = { "version" = "=0.37.0", "tag" = "v0.37.1-beta.1", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-encoding = { "version" = "=0.37.0", "tag" = "v0.37.1-beta.1", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-namespace = "0.0.16"
|
||||
# Note that this one does not include pyarrow
|
||||
arrow = { version = "55.1", optional = false }
|
||||
arrow-array = "55.1"
|
||||
@@ -30,7 +31,6 @@ arrow-data = "55.1"
|
||||
arrow-ipc = "55.1"
|
||||
arrow-ord = "55.1"
|
||||
arrow-schema = "55.1"
|
||||
arrow-arith = "55.1"
|
||||
arrow-cast = "55.1"
|
||||
async-trait = "0"
|
||||
datafusion = { version = "49.0", default-features = false }
|
||||
@@ -51,7 +51,6 @@ pin-project = "1.0.7"
|
||||
snafu = "0.8"
|
||||
url = "2"
|
||||
num-traits = "0.2"
|
||||
rand = "0.9"
|
||||
regex = "1.10"
|
||||
lazy_static = "1"
|
||||
semver = "1.0.25"
|
||||
@@ -59,7 +58,16 @@ crunchy = "0.2.4"
|
||||
# Temporary pins to work around downstream issues
|
||||
# https://github.com/apache/arrow-rs/commit/2fddf85afcd20110ce783ed5b4cdeb82293da30b
|
||||
chrono = "=0.4.41"
|
||||
# https://github.com/RustCrypto/formats/issues/1684
|
||||
base64ct = "=1.6.0"
|
||||
# Workaround for: https://github.com/Lokathor/bytemuck/issues/306
|
||||
bytemuck_derive = ">=1.8.1, <1.9.0"
|
||||
|
||||
[patch.crates-io]
|
||||
# Force to use the same lance version as the rest of the project to avoid duplicate dependencies
|
||||
lance = { "version" = "=0.37.0", "tag" = "v0.37.1-beta.1", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-io = { "version" = "=0.37.0", "tag" = "v0.37.1-beta.1", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-index = { "version" = "=0.37.0", "tag" = "v0.37.1-beta.1", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-linalg = { "version" = "=0.37.0", "tag" = "v0.37.1-beta.1", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-table = { "version" = "=0.37.0", "tag" = "v0.37.1-beta.1", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-testing = { "version" = "=0.37.0", "tag" = "v0.37.1-beta.1", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-datafusion = { "version" = "=0.37.0", "tag" = "v0.37.1-beta.1", "git" = "https://github.com/lancedb/lance.git" }
|
||||
lance-encoding = { "version" = "=0.37.0", "tag" = "v0.37.1-beta.1", "git" = "https://github.com/lancedb/lance.git" }
|
||||
|
||||
4
ci/create_lancedb_test_connection.sh
Executable file
4
ci/create_lancedb_test_connection.sh
Executable file
@@ -0,0 +1,4 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
export RUST_LOG=info
|
||||
exec ./lancedb server --port 0 --sql-port 0 --data-dir "${1}"
|
||||
18
ci/run_with_docker_compose.sh
Executable file
18
ci/run_with_docker_compose.sh
Executable file
@@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
#
|
||||
# A script for running the given command together with a docker compose environment.
|
||||
#
|
||||
|
||||
# Bring down the docker setup once the command is done running.
|
||||
tear_down() {
|
||||
docker compose -p fixture down
|
||||
}
|
||||
trap tear_down EXIT
|
||||
|
||||
set +xe
|
||||
|
||||
# Clean up any existing docker setup and bring up a new one.
|
||||
docker compose -p fixture up --detach --wait || exit 1
|
||||
|
||||
"${@}"
|
||||
68
ci/run_with_test_connection.sh
Executable file
68
ci/run_with_test_connection.sh
Executable file
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
#
|
||||
# A script for running the given command together with the lancedb cli.
|
||||
#
|
||||
|
||||
die() {
|
||||
echo $?
|
||||
exit 1
|
||||
}
|
||||
|
||||
check_command_exists() {
|
||||
command="${1}"
|
||||
which ${command} &> /dev/null || \
|
||||
die "Unable to locate command: ${command}. Did you install it?"
|
||||
}
|
||||
|
||||
if [[ ! -e ./lancedb ]]; then
|
||||
if [[ -v SOPHON_READ_TOKEN ]]; then
|
||||
INPUT="lancedb-linux-x64"
|
||||
gh release \
|
||||
--repo lancedb/lancedb \
|
||||
download ci-support-binaries \
|
||||
--pattern "${INPUT}" \
|
||||
|| die "failed to fetch cli."
|
||||
check_command_exists openssl
|
||||
openssl enc -aes-256-cbc \
|
||||
-d -pbkdf2 \
|
||||
-pass "env:SOPHON_READ_TOKEN" \
|
||||
-in "${INPUT}" \
|
||||
-out ./lancedb-linux-x64.tar.gz \
|
||||
|| die "openssl failed"
|
||||
TARGET="${INPUT}.tar.gz"
|
||||
else
|
||||
ARCH="x64"
|
||||
if [[ $OSTYPE == 'darwin'* ]]; then
|
||||
UNAME=$(uname -m)
|
||||
if [[ $UNAME == 'arm64' ]]; then
|
||||
ARCH='arm64'
|
||||
fi
|
||||
OSTYPE="macos"
|
||||
elif [[ $OSTYPE == 'linux'* ]]; then
|
||||
if [[ $UNAME == 'aarch64' ]]; then
|
||||
ARCH='arm64'
|
||||
fi
|
||||
OSTYPE="linux"
|
||||
else
|
||||
die "unknown OSTYPE: $OSTYPE"
|
||||
fi
|
||||
|
||||
check_command_exists gh
|
||||
TARGET="lancedb-${OSTYPE}-${ARCH}.tar.gz"
|
||||
gh release \
|
||||
--repo lancedb/sophon \
|
||||
download lancedb-cli-v0.0.3 \
|
||||
--pattern "${TARGET}" \
|
||||
|| die "failed to fetch cli."
|
||||
fi
|
||||
|
||||
check_command_exists tar
|
||||
tar xvf "${TARGET}" || die "tar failed."
|
||||
[[ -e ./lancedb ]] || die "failed to extract lancedb."
|
||||
fi
|
||||
|
||||
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
|
||||
export CREATE_LANCEDB_TEST_CONNECTION_SCRIPT="${SCRIPT_DIR}/create_lancedb_test_connection.sh"
|
||||
|
||||
"${@}"
|
||||
@@ -1,4 +1,5 @@
|
||||
import argparse
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
|
||||
@@ -18,8 +19,12 @@ def run_command(command: str) -> str:
|
||||
|
||||
def get_latest_stable_version() -> str:
|
||||
version_line = run_command("cargo info lance | grep '^version:'")
|
||||
version = version_line.split(" ")[1].strip()
|
||||
return version
|
||||
# Example output: "version: 0.35.0 (latest 0.37.0)"
|
||||
match = re.search(r'\(latest ([0-9.]+)\)', version_line)
|
||||
if match:
|
||||
return match.group(1)
|
||||
# Fallback: use the first version after 'version:'
|
||||
return version_line.split("version:")[1].split()[0].strip()
|
||||
|
||||
|
||||
def get_latest_preview_version() -> str:
|
||||
@@ -112,7 +117,7 @@ def update_cargo_toml(line_updater):
|
||||
lance_line = ""
|
||||
is_parsing_lance_line = False
|
||||
for line in lines:
|
||||
if line.startswith("lance"):
|
||||
if line.startswith("lance") and not line.startswith("lance-namespace"):
|
||||
# Check if this is a single-line or multi-line entry
|
||||
# Single-line entries either:
|
||||
# 1. End with } (complete inline table)
|
||||
|
||||
@@ -70,6 +70,22 @@ plugins:
|
||||
- mkdocs-jupyter
|
||||
- render_swagger:
|
||||
allow_arbitrary_locations: true
|
||||
- redirects:
|
||||
redirect_maps:
|
||||
# Redirect the home page and other top-level markdown files. This enables maximum SEO benefit
|
||||
# other sub-pages are handled by the ingected js in overrides/partials/header.html
|
||||
'index.md': 'https://lancedb.com/docs/'
|
||||
'guides/tables.md': 'https://lancedb.com/docs/tables/'
|
||||
'ann_indexes.md': 'https://lancedb.com/docs/indexing/'
|
||||
'basic.md': 'https://lancedb.com/docs/quickstart/'
|
||||
'faq.md': 'https://lancedb.com/docs/faq/'
|
||||
'embeddings/understanding_embeddings.md': 'https://lancedb.com/docs/embedding/'
|
||||
'integrations.md': 'https://lancedb.com/docs/integrations/'
|
||||
'examples.md': 'https://lancedb.com/docs/tutorials/'
|
||||
'concepts/vector_search.md': 'https://lancedb.com/docs/search/vector-search/'
|
||||
'troubleshooting.md': 'https://lancedb.com/docs/troubleshooting/'
|
||||
|
||||
|
||||
|
||||
markdown_extensions:
|
||||
- admonition
|
||||
@@ -386,4 +402,4 @@ extra:
|
||||
- icon: fontawesome/brands/x-twitter
|
||||
link: https://twitter.com/lancedb
|
||||
- icon: fontawesome/brands/linkedin
|
||||
link: https://www.linkedin.com/company/lancedb
|
||||
link: https://www.linkedin.com/company/lancedb
|
||||
@@ -19,7 +19,13 @@
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
IN THE SOFTWARE.
|
||||
-->
|
||||
|
||||
<div id="deprecation-banner" style="background-color: #f8d7da; color: #721c24; padding: 1em; text-align: center;">
|
||||
<p style="margin: 0; font-size: 1.1em;">
|
||||
<strong>This documentation site is deprecated.</strong>
|
||||
Please visit our new documentation site at <a href="https://lancedb.com/docs" style="color: #721c24; text-decoration: underline;">
|
||||
lancedb.com/docs</a> for the latest information.
|
||||
</p>
|
||||
</div>
|
||||
{% set class = "md-header" %}
|
||||
{% if "navigation.tabs.sticky" in features %}
|
||||
{% set class = class ~ " md-header--shadow md-header--lifted" %}
|
||||
@@ -150,9 +156,9 @@
|
||||
|
||||
<div style="margin-left: 10px; margin-right: 5px;">
|
||||
<a href="https://discord.com/invite/zMM32dvNtd" target="_blank" rel="noopener noreferrer">
|
||||
<svg fill="#FFFFFF" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 50 50" width="25px" height="25px"><path d="M 41.625 10.769531 C 37.644531 7.566406 31.347656 7.023438 31.078125 7.003906 C 30.660156 6.96875 30.261719 7.203125 30.089844 7.589844 C 30.074219 7.613281 29.9375 7.929688 29.785156 8.421875 C 32.417969 8.867188 35.652344 9.761719 38.578125 11.578125 C 39.046875 11.867188 39.191406 12.484375 38.902344 12.953125 C 38.710938 13.261719 38.386719 13.429688 38.050781 13.429688 C 37.871094 13.429688 37.6875 13.378906 37.523438 13.277344 C 32.492188 10.15625 26.210938 10 25 10 C 23.789063 10 17.503906 10.15625 12.476563 13.277344 C 12.007813 13.570313 11.390625 13.425781 11.101563 12.957031 C 10.808594 12.484375 10.953125 11.871094 11.421875 11.578125 C 14.347656 9.765625 17.582031 8.867188 20.214844 8.425781 C 20.0625 7.929688 19.925781 7.617188 19.914063 7.589844 C 19.738281 7.203125 19.34375 6.960938 18.921875 7.003906 C 18.652344 7.023438 12.355469 7.566406 8.320313 10.8125 C 6.214844 12.761719 2 24.152344 2 34 C 2 34.175781 2.046875 34.34375 2.132813 34.496094 C 5.039063 39.605469 12.972656 40.941406 14.78125 41 C 14.789063 41 14.800781 41 14.8125 41 C 15.132813 41 15.433594 40.847656 15.621094 40.589844 L 17.449219 38.074219 C 12.515625 36.800781 9.996094 34.636719 9.851563 34.507813 C 9.4375 34.144531 9.398438 33.511719 9.765625 33.097656 C 10.128906 32.683594 10.761719 32.644531 11.175781 33.007813 C 11.234375 33.0625 15.875 37 25 37 C 34.140625 37 38.78125 33.046875 38.828125 33.007813 C 39.242188 32.648438 39.871094 32.683594 40.238281 33.101563 C 40.601563 33.515625 40.5625 34.144531 40.148438 34.507813 C 40.003906 34.636719 37.484375 36.800781 32.550781 38.074219 L 34.378906 40.589844 C 34.566406 40.847656 34.867188 41 35.1875 41 C 35.199219 41 35.210938 41 35.21875 41 C 37.027344 40.941406 44.960938 39.605469 47.867188 34.496094 C 47.953125 34.34375 48 34.175781 48 34 C 48 24.152344 43.785156 12.761719 41.625 10.769531 Z M 18.5 30 C 16.566406 30 15 28.210938 15 26 C 15 23.789063 16.566406 22 18.5 22 C 20.433594 22 22 23.789063 22 26 C 22 28.210938 20.433594 30 18.5 30 Z M 31.5 30 C 29.566406 30 28 28.210938 28 26 C 28 23.789063 29.566406 22 31.5 22 C 33.433594 22 35 23.789063 35 26 C 35 28.210938 33.433594 30 31.5 30 Z"/></svg>
|
||||
</a>
|
||||
</div>
|
||||
<svg fill="#FFFFFF" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 50 50" width="25px" height="25px"><path d="M 41.625 10.769531 C 37.644531 7.566406 31.347656 7.023438 31.078125 7.003906 C 30.660156 6.96875 30.261719 7.203125 30.089844 7.589844 C 30.074219 7.613281 29.9375 7.929688 29.785156 8.421875 C 32.417969 8.867188 35.652344 9.761719 38.578125 11.578125 C 39.046875 11.867188 39.191406 12.484375 38.902344 12.953125 C 38.710938 13.261719 38.386719 13.429688 38.050781 13.429688 C 37.871094 13.429688 37.6875 13.378906 37.523438 13.277344 C 32.492188 10.15625 26.210938 10 25 10 C 23.789063 10 17.503906 10.15625 12.476563 13.277344 C 12.007813 13.570313 11.390625 13.425781 11.101563 12.957031 C 10.808594 12.484375 10.953125 11.871094 11.421875 11.578125 C 14.347656 9.765625 17.582031 8.867188 20.214844 8.425781 C 20.0625 7.929688 19.925781 7.617188 19.914063 7.589844 C 19.738281 7.203125 19.34375 6.960938 18.921875 7.003906 C 18.652344 7.023438 12.355469 7.566406 8.320313 10.8125 C 6.214844 12.761719 2 24.152344 2 34 C 2 34.175781 2.046875 34.34375 2.132813 34.496094 C 5.039063 39.605469 12.972656 40.941406 14.78125 41 C 14.789063 41 14.800781 41 14.8125 41 C 15.132813 41 15.433594 40.847656 15.621094 40.589844 L 17.449219 38.074219 C 12.515625 36.800781 9.996094 34.636719 9.851563 34.507813 C 9.4375 34.144531 9.398438 33.511719 9.765625 33.097656 C 10.128906 32.683594 10.761719 32.644531 11.175781 33.007813 C 11.234375 33.0625 15.875 37 25 37 C 34.140625 37 38.78125 33.046875 38.828125 33.007813 C 39.242188 32.648438 39.871094 32.683594 40.238281 33.101563 C 40.601563 33.515625 40.5625 34.144531 40.148438 34.507813 C 40.003906 34.636719 37.484375 36.800781 32.550781 38.074219 L 34.378906 40.589844 C 34.566406 40.847656 34.867188 41 35.1875 41 C 35.199219 41 35.210938 41 35.21875 41 C 37.027344 40.941406 44.960938 39.605469 47.867188 34.496094 C 47.953125 34.34375 48 34.175781 48 34 C 48 24.152344 43.785156 12.761719 41.625 10.769531 Z M 18.5 30 C 16.566406 30 15 28.210938 15 26 C 15 23.789063 16.566406 22 18.5 22 C 20.433594 22 22 23.789063 22 26 C 22 28.210938 20.433594 30 18.5 30 Z M 31.5 30 C 29.566406 30 28 28.210938 28 26 C 28 23.789063 29.566406 22 31.5 22 C 33.433594 22 35 23.789063 35 26 C 35 28.210938 33.433594 30 31.5 30 Z"/></svg>
|
||||
</a>
|
||||
</div>
|
||||
<div style="margin-left: 5px; margin-right: 5px;">
|
||||
<a href="https://twitter.com/lancedb" target="_blank" rel="noopener noreferrer">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0,0,256,256" width="25px" height="25px" fill-rule="nonzero"><g fill-opacity="0" fill="#ffffff" fill-rule="nonzero" stroke="none" stroke-width="1" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="10" stroke-dasharray="" stroke-dashoffset="0" font-family="none" font-weight="none" font-size="none" text-anchor="none" style="mix-blend-mode: normal"><path d="M0,256v-256h256v256z" id="bgRectangle"></path></g><g fill="#ffffff" fill-rule="nonzero" stroke="none" stroke-width="1" stroke-linecap="butt" stroke-linejoin="miter" stroke-miterlimit="10" stroke-dasharray="" stroke-dashoffset="0" font-family="none" font-weight="none" font-size="none" text-anchor="none" style="mix-blend-mode: normal"><g transform="scale(4,4)"><path d="M57,17.114c-1.32,1.973 -2.991,3.707 -4.916,5.097c0.018,0.423 0.028,0.847 0.028,1.274c0,13.013 -9.902,28.018 -28.016,28.018c-5.562,0 -12.81,-1.948 -15.095,-4.423c0.772,0.092 1.556,0.138 2.35,0.138c4.615,0 8.861,-1.575 12.23,-4.216c-4.309,-0.079 -7.946,-2.928 -9.199,-6.84c1.96,0.308 4.447,-0.17 4.447,-0.17c0,0 -7.7,-1.322 -7.899,-9.779c2.226,1.291 4.46,1.231 4.46,1.231c0,0 -4.441,-2.734 -4.379,-8.195c0.037,-3.221 1.331,-4.953 1.331,-4.953c8.414,10.361 20.298,10.29 20.298,10.29c0,0 -0.255,-1.471 -0.255,-2.243c0,-5.437 4.408,-9.847 9.847,-9.847c2.832,0 5.391,1.196 7.187,3.111c2.245,-0.443 4.353,-1.263 6.255,-2.391c-0.859,3.44 -4.329,5.448 -4.329,5.448c0,0 2.969,-0.329 5.655,-1.55z"></path></g></g></svg>
|
||||
@@ -173,4 +179,77 @@
|
||||
{% include "partials/tabs.html" %}
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
</header>
|
||||
</header>
|
||||
|
||||
<script>
|
||||
(function() {
|
||||
function checkPathAndRedirect() {
|
||||
var banner = document.getElementById('deprecation-banner');
|
||||
|
||||
if (document.querySelector('meta[http-equiv="refresh"]')) {
|
||||
return; // The redirects plugin is already handling this page.
|
||||
}
|
||||
|
||||
var currentPath = window.location.pathname;
|
||||
|
||||
var cleanPath = currentPath.endsWith('/') && currentPath.length > 1
|
||||
? currentPath.slice(0, -1)
|
||||
: currentPath;
|
||||
|
||||
// These are the ONLY paths that should remain on the old site
|
||||
var apiPaths = [
|
||||
'/lancedb/python',
|
||||
'/lancedb/javascript',
|
||||
'/lancedb/js',
|
||||
'/lancedb/api_reference'
|
||||
];
|
||||
|
||||
var isApiPage = apiPaths.some(function(apiPath) {
|
||||
return cleanPath.startsWith(apiPath);
|
||||
});
|
||||
|
||||
if (isApiPage) {
|
||||
if (banner) {
|
||||
banner.style.display = 'none';
|
||||
}
|
||||
} else {
|
||||
if (banner) {
|
||||
banner.style.display = 'block';
|
||||
}
|
||||
|
||||
// Add noindex meta tag to prevent indexing of old docs for seo
|
||||
var noindexMeta = document.createElement('meta');
|
||||
noindexMeta.setAttribute('name', 'robots');
|
||||
noindexMeta.setAttribute('content', 'noindex, follow');
|
||||
document.head.appendChild(noindexMeta);
|
||||
|
||||
// Add canonical link to point to the new docs to reward new site for seo
|
||||
var canonicalLink = document.createElement('link');
|
||||
canonicalLink.setAttribute('rel', 'canonical');
|
||||
canonicalLink.setAttribute('href', 'https://lancedb.com/docs');
|
||||
document.head.appendChild(canonicalLink);
|
||||
|
||||
window.location.replace('https://lancedb.com/docs');
|
||||
}
|
||||
}
|
||||
|
||||
// Run the check only if doc is ready. This makes sure we catch the initial load
|
||||
// and redirect.
|
||||
if (document.readyState === 'loading') {
|
||||
document.addEventListener('DOMContentLoaded', checkPathAndRedirect);
|
||||
} else {
|
||||
checkPathAndRedirect();
|
||||
}
|
||||
|
||||
// Use an interval to handle subsequent navigation clicks.
|
||||
var lastPath = window.location.pathname;
|
||||
setInterval(function() {
|
||||
if (window.location.pathname !== lastPath) {
|
||||
lastPath = window.location.pathname;
|
||||
checkPathAndRedirect();
|
||||
}
|
||||
}, 2000); // keeping it 2 second to make it easy for user to understand
|
||||
// what's happening
|
||||
|
||||
})();
|
||||
</script>
|
||||
@@ -5,3 +5,4 @@ mkdocstrings[python]==0.25.2
|
||||
griffe
|
||||
mkdocs-render-swagger-plugin
|
||||
pydantic
|
||||
mkdocs-redirects
|
||||
|
||||
@@ -25,6 +25,51 @@ the underlying connection has been closed.
|
||||
|
||||
## Methods
|
||||
|
||||
### cloneTable()
|
||||
|
||||
```ts
|
||||
abstract cloneTable(
|
||||
targetTableName,
|
||||
sourceUri,
|
||||
options?): Promise<Table>
|
||||
```
|
||||
|
||||
Clone a table from a source table.
|
||||
|
||||
A shallow clone creates a new table that shares the underlying data files
|
||||
with the source table but has its own independent manifest. This allows
|
||||
both the source and cloned tables to evolve independently while initially
|
||||
sharing the same data, deletion, and index files.
|
||||
|
||||
#### Parameters
|
||||
|
||||
* **targetTableName**: `string`
|
||||
The name of the target table to create.
|
||||
|
||||
* **sourceUri**: `string`
|
||||
The URI of the source table to clone from.
|
||||
|
||||
* **options?**
|
||||
Clone options.
|
||||
|
||||
* **options.isShallow?**: `boolean`
|
||||
Whether to perform a shallow clone (defaults to true).
|
||||
|
||||
* **options.sourceTag?**: `string`
|
||||
The tag of the source table to clone.
|
||||
|
||||
* **options.sourceVersion?**: `number`
|
||||
The version of the source table to clone.
|
||||
|
||||
* **options.targetNamespace?**: `string`[]
|
||||
The namespace for the target table (defaults to root namespace).
|
||||
|
||||
#### Returns
|
||||
|
||||
`Promise`<[`Table`](Table.md)>
|
||||
|
||||
***
|
||||
|
||||
### close()
|
||||
|
||||
```ts
|
||||
|
||||
@@ -13,7 +13,7 @@ function makeArrowTable(
|
||||
metadata?): ArrowTable
|
||||
```
|
||||
|
||||
An enhanced version of the makeTable function from Apache Arrow
|
||||
An enhanced version of the apache-arrow makeTable function from Apache Arrow
|
||||
that supports nested fields and embeddings columns.
|
||||
|
||||
(typically you do not need to call this function. It will be called automatically
|
||||
|
||||
@@ -78,6 +78,7 @@
|
||||
- [TableNamesOptions](interfaces/TableNamesOptions.md)
|
||||
- [TableStatistics](interfaces/TableStatistics.md)
|
||||
- [TimeoutConfig](interfaces/TimeoutConfig.md)
|
||||
- [TlsConfig](interfaces/TlsConfig.md)
|
||||
- [TokenResponse](interfaces/TokenResponse.md)
|
||||
- [UpdateOptions](interfaces/UpdateOptions.md)
|
||||
- [UpdateResult](interfaces/UpdateResult.md)
|
||||
|
||||
@@ -40,6 +40,14 @@ optional timeoutConfig: TimeoutConfig;
|
||||
|
||||
***
|
||||
|
||||
### tlsConfig?
|
||||
|
||||
```ts
|
||||
optional tlsConfig: TlsConfig;
|
||||
```
|
||||
|
||||
***
|
||||
|
||||
### userAgent?
|
||||
|
||||
```ts
|
||||
|
||||
49
docs/src/js/interfaces/TlsConfig.md
Normal file
49
docs/src/js/interfaces/TlsConfig.md
Normal file
@@ -0,0 +1,49 @@
|
||||
[**@lancedb/lancedb**](../README.md) • **Docs**
|
||||
|
||||
***
|
||||
|
||||
[@lancedb/lancedb](../globals.md) / TlsConfig
|
||||
|
||||
# Interface: TlsConfig
|
||||
|
||||
TLS/mTLS configuration for the remote HTTP client.
|
||||
|
||||
## Properties
|
||||
|
||||
### assertHostname?
|
||||
|
||||
```ts
|
||||
optional assertHostname: boolean;
|
||||
```
|
||||
|
||||
Whether to verify the hostname in the server's certificate.
|
||||
|
||||
***
|
||||
|
||||
### certFile?
|
||||
|
||||
```ts
|
||||
optional certFile: string;
|
||||
```
|
||||
|
||||
Path to the client certificate file (PEM format) for mTLS authentication.
|
||||
|
||||
***
|
||||
|
||||
### keyFile?
|
||||
|
||||
```ts
|
||||
optional keyFile: string;
|
||||
```
|
||||
|
||||
Path to the client private key file (PEM format) for mTLS authentication.
|
||||
|
||||
***
|
||||
|
||||
### sslCaCert?
|
||||
|
||||
```ts
|
||||
optional sslCaCert: string;
|
||||
```
|
||||
|
||||
Path to the CA certificate file (PEM format) for server verification.
|
||||
@@ -8,7 +8,7 @@
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.22.1-beta.1</version>
|
||||
<version>0.22.2-beta.0</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
<parent>
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.22.1-beta.1</version>
|
||||
<version>0.22.2-beta.0</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.lancedb</groupId>
|
||||
<artifactId>lancedb-parent</artifactId>
|
||||
<version>0.22.1-beta.1</version>
|
||||
<version>0.22.2-beta.0</version>
|
||||
<packaging>pom</packaging>
|
||||
<name>${project.artifactId}</name>
|
||||
<description>LanceDB Java SDK Parent POM</description>
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
name = "lancedb-nodejs"
|
||||
edition.workspace = true
|
||||
version = "0.22.1-beta.1"
|
||||
version = "0.22.2-beta.0"
|
||||
license.workspace = true
|
||||
description.workspace = true
|
||||
repository.workspace = true
|
||||
|
||||
@@ -1,17 +1,5 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
import {
|
||||
Bool,
|
||||
Field,
|
||||
Int32,
|
||||
List,
|
||||
Schema,
|
||||
Struct,
|
||||
Uint8,
|
||||
Utf8,
|
||||
} from "apache-arrow";
|
||||
|
||||
import * as arrow15 from "apache-arrow-15";
|
||||
import * as arrow16 from "apache-arrow-16";
|
||||
import * as arrow17 from "apache-arrow-17";
|
||||
@@ -25,11 +13,9 @@ import {
|
||||
fromTableToBuffer,
|
||||
makeArrowTable,
|
||||
makeEmptyTable,
|
||||
tableFromIPC,
|
||||
} from "../lancedb/arrow";
|
||||
import {
|
||||
EmbeddingFunction,
|
||||
FieldOptions,
|
||||
FunctionOptions,
|
||||
} from "../lancedb/embedding/embedding_function";
|
||||
import { EmbeddingFunctionConfig } from "../lancedb/embedding/registry";
|
||||
@@ -1008,5 +994,64 @@ describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
expect(result).toEqual(null);
|
||||
});
|
||||
});
|
||||
|
||||
describe("boolean null handling", function () {
|
||||
it("should handle null values in nullable boolean fields", () => {
|
||||
const { makeArrowTable } = require("../lancedb/arrow");
|
||||
const schema = new Schema([new Field("test", new arrow.Bool(), true)]);
|
||||
|
||||
// Test with all null values
|
||||
const data = [{ test: null }];
|
||||
const table = makeArrowTable(data, { schema });
|
||||
|
||||
expect(table.numRows).toBe(1);
|
||||
expect(table.schema.names).toEqual(["test"]);
|
||||
expect(table.getChild("test")!.get(0)).toBeNull();
|
||||
});
|
||||
|
||||
it("should handle mixed null and non-null boolean values", () => {
|
||||
const { makeArrowTable } = require("../lancedb/arrow");
|
||||
const schema = new Schema([new Field("test", new Bool(), true)]);
|
||||
|
||||
// Test with mixed values
|
||||
const data = [{ test: true }, { test: null }, { test: false }];
|
||||
const table = makeArrowTable(data, { schema });
|
||||
|
||||
expect(table.numRows).toBe(3);
|
||||
expect(table.getChild("test")!.get(0)).toBe(true);
|
||||
expect(table.getChild("test")!.get(1)).toBeNull();
|
||||
expect(table.getChild("test")!.get(2)).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
// Test for the undefined values bug fix
|
||||
describe("undefined values handling", () => {
|
||||
it("should handle mixed undefined and actual values", () => {
|
||||
const schema = new Schema([
|
||||
new Field("text", new Utf8(), true), // nullable
|
||||
new Field("number", new Int32(), true), // nullable
|
||||
new Field("bool", new Bool(), true), // nullable
|
||||
]);
|
||||
|
||||
const data = [
|
||||
{ text: undefined, number: 42, bool: true },
|
||||
{ text: "hello", number: undefined, bool: false },
|
||||
{ text: "world", number: 123, bool: undefined },
|
||||
];
|
||||
const table = makeArrowTable(data, { schema });
|
||||
|
||||
const result = table.toArray();
|
||||
expect(result).toHaveLength(3);
|
||||
expect(result[0].text).toBe(null);
|
||||
expect(result[0].number).toBe(42);
|
||||
expect(result[0].bool).toBe(true);
|
||||
expect(result[1].text).toBe("hello");
|
||||
expect(result[1].number).toBe(null);
|
||||
expect(result[1].bool).toBe(false);
|
||||
expect(result[2].text).toBe("world");
|
||||
expect(result[2].number).toBe(123);
|
||||
expect(result[2].bool).toBe(null);
|
||||
});
|
||||
});
|
||||
},
|
||||
);
|
||||
|
||||
@@ -203,3 +203,106 @@ describe("given a connection", () => {
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("clone table functionality", () => {
|
||||
let tmpDir: tmp.DirResult;
|
||||
let db: Connection;
|
||||
beforeEach(async () => {
|
||||
tmpDir = tmp.dirSync({ unsafeCleanup: true });
|
||||
db = await connect(tmpDir.name);
|
||||
});
|
||||
afterEach(() => tmpDir.removeCallback());
|
||||
|
||||
it("should clone a table with latest version (default behavior)", async () => {
|
||||
// Create source table with some data
|
||||
const data = [
|
||||
{ id: 1, text: "hello", vector: [1.0, 2.0] },
|
||||
{ id: 2, text: "world", vector: [3.0, 4.0] },
|
||||
];
|
||||
const sourceTable = await db.createTable("source", data);
|
||||
|
||||
// Add more data to create a new version
|
||||
const moreData = [{ id: 3, text: "test", vector: [5.0, 6.0] }];
|
||||
await sourceTable.add(moreData);
|
||||
|
||||
// Clone the table (should get latest version with 3 rows)
|
||||
const sourceUri = `${tmpDir.name}/source.lance`;
|
||||
const clonedTable = await db.cloneTable("cloned", sourceUri);
|
||||
|
||||
// Verify cloned table has all 3 rows
|
||||
expect(await clonedTable.countRows()).toBe(3);
|
||||
expect((await db.tableNames()).includes("cloned")).toBe(true);
|
||||
});
|
||||
|
||||
it("should clone a table from a specific version", async () => {
|
||||
// Create source table with initial data
|
||||
const data = [
|
||||
{ id: 1, text: "hello", vector: [1.0, 2.0] },
|
||||
{ id: 2, text: "world", vector: [3.0, 4.0] },
|
||||
];
|
||||
const sourceTable = await db.createTable("source", data);
|
||||
|
||||
// Get the initial version
|
||||
const initialVersion = await sourceTable.version();
|
||||
|
||||
// Add more data to create a new version
|
||||
const moreData = [{ id: 3, text: "test", vector: [5.0, 6.0] }];
|
||||
await sourceTable.add(moreData);
|
||||
|
||||
// Verify source now has 3 rows
|
||||
expect(await sourceTable.countRows()).toBe(3);
|
||||
|
||||
// Clone from the initial version (should have only 2 rows)
|
||||
const sourceUri = `${tmpDir.name}/source.lance`;
|
||||
const clonedTable = await db.cloneTable("cloned", sourceUri, {
|
||||
sourceVersion: initialVersion,
|
||||
});
|
||||
|
||||
// Verify cloned table has only the initial 2 rows
|
||||
expect(await clonedTable.countRows()).toBe(2);
|
||||
});
|
||||
|
||||
it("should clone a table from a tagged version", async () => {
|
||||
// Create source table with initial data
|
||||
const data = [
|
||||
{ id: 1, text: "hello", vector: [1.0, 2.0] },
|
||||
{ id: 2, text: "world", vector: [3.0, 4.0] },
|
||||
];
|
||||
const sourceTable = await db.createTable("source", data);
|
||||
|
||||
// Create a tag for the current version
|
||||
const tags = await sourceTable.tags();
|
||||
await tags.create("v1.0", await sourceTable.version());
|
||||
|
||||
// Add more data after the tag
|
||||
const moreData = [{ id: 3, text: "test", vector: [5.0, 6.0] }];
|
||||
await sourceTable.add(moreData);
|
||||
|
||||
// Verify source now has 3 rows
|
||||
expect(await sourceTable.countRows()).toBe(3);
|
||||
|
||||
// Clone from the tagged version (should have only 2 rows)
|
||||
const sourceUri = `${tmpDir.name}/source.lance`;
|
||||
const clonedTable = await db.cloneTable("cloned", sourceUri, {
|
||||
sourceTag: "v1.0",
|
||||
});
|
||||
|
||||
// Verify cloned table has only the tagged version's 2 rows
|
||||
expect(await clonedTable.countRows()).toBe(2);
|
||||
});
|
||||
|
||||
it("should fail when attempting deep clone", async () => {
|
||||
// Create source table with some data
|
||||
const data = [
|
||||
{ id: 1, text: "hello", vector: [1.0, 2.0] },
|
||||
{ id: 2, text: "world", vector: [3.0, 4.0] },
|
||||
];
|
||||
await db.createTable("source", data);
|
||||
|
||||
// Try to create a deep clone (should fail)
|
||||
const sourceUri = `${tmpDir.name}/source.lance`;
|
||||
await expect(
|
||||
db.cloneTable("cloned", sourceUri, { isShallow: false }),
|
||||
).rejects.toThrow("Deep clone is not yet implemented");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -256,6 +256,60 @@ describe("embedding functions", () => {
|
||||
expect(actual).toHaveProperty("text");
|
||||
});
|
||||
|
||||
it("should handle undefined vector field with embedding function correctly", async () => {
|
||||
@register("undefined_test")
|
||||
class MockEmbeddingFunction extends EmbeddingFunction<string> {
|
||||
ndims() {
|
||||
return 3;
|
||||
}
|
||||
embeddingDataType(): Float {
|
||||
return new Float32();
|
||||
}
|
||||
async computeQueryEmbeddings(_data: string) {
|
||||
return [1, 2, 3];
|
||||
}
|
||||
async computeSourceEmbeddings(data: string[]) {
|
||||
return Array.from({ length: data.length }).fill([
|
||||
1, 2, 3,
|
||||
]) as number[][];
|
||||
}
|
||||
}
|
||||
const func = getRegistry()
|
||||
.get<MockEmbeddingFunction>("undefined_test")!
|
||||
.create();
|
||||
const schema = new Schema([
|
||||
new Field("text", new Utf8(), true),
|
||||
new Field(
|
||||
"vector",
|
||||
new FixedSizeList(3, new Field("item", new Float32(), true)),
|
||||
true,
|
||||
),
|
||||
]);
|
||||
|
||||
const db = await connect(tmpDir.name);
|
||||
const table = await db.createEmptyTable("test_undefined", schema, {
|
||||
embeddingFunction: {
|
||||
function: func,
|
||||
sourceColumn: "text",
|
||||
vectorColumn: "vector",
|
||||
},
|
||||
});
|
||||
|
||||
// Test that undefined, null, and omitted vector fields all work
|
||||
await table.add([{ text: "test1", vector: undefined }]);
|
||||
await table.add([{ text: "test2", vector: null }]);
|
||||
await table.add([{ text: "test3" }]);
|
||||
|
||||
const rows = await table.query().toArray();
|
||||
expect(rows.length).toBe(3);
|
||||
|
||||
// All rows should have vectors computed by the embedding function
|
||||
for (const row of rows) {
|
||||
expect(row.vector).toBeDefined();
|
||||
expect(JSON.parse(JSON.stringify(row.vector))).toEqual([1, 2, 3]);
|
||||
}
|
||||
});
|
||||
|
||||
test.each([new Float16(), new Float32(), new Float64()])(
|
||||
"should be able to provide manual embeddings with multiple float datatype",
|
||||
async (floatType) => {
|
||||
|
||||
@@ -7,7 +7,6 @@ import {
|
||||
ClientConfig,
|
||||
Connection,
|
||||
ConnectionOptions,
|
||||
NativeJsHeaderProvider,
|
||||
TlsConfig,
|
||||
connect,
|
||||
} from "../lancedb";
|
||||
|
||||
@@ -39,7 +39,6 @@ import {
|
||||
Operator,
|
||||
instanceOfFullTextQuery,
|
||||
} from "../lancedb/query";
|
||||
import exp = require("constants");
|
||||
|
||||
describe.each([arrow15, arrow16, arrow17, arrow18])(
|
||||
"Given a table",
|
||||
@@ -488,6 +487,32 @@ describe("merge insert", () => {
|
||||
.execute(newData, { timeoutMs: 0 }),
|
||||
).rejects.toThrow("merge insert timed out");
|
||||
});
|
||||
|
||||
test("useIndex", async () => {
|
||||
const newData = [
|
||||
{ a: 2, b: "x" },
|
||||
{ a: 4, b: "z" },
|
||||
];
|
||||
|
||||
// Test with useIndex(true) - should work fine
|
||||
const result1 = await table
|
||||
.mergeInsert("a")
|
||||
.whenNotMatchedInsertAll()
|
||||
.useIndex(true)
|
||||
.execute(newData);
|
||||
|
||||
expect(result1.numInsertedRows).toBe(1); // Only a=4 should be inserted
|
||||
|
||||
// Test with useIndex(false) - should also work fine
|
||||
const newData2 = [{ a: 5, b: "w" }];
|
||||
const result2 = await table
|
||||
.mergeInsert("a")
|
||||
.whenNotMatchedInsertAll()
|
||||
.useIndex(false)
|
||||
.execute(newData2);
|
||||
|
||||
expect(result2.numInsertedRows).toBe(1); // a=5 should be inserted
|
||||
});
|
||||
});
|
||||
|
||||
describe("When creating an index", () => {
|
||||
|
||||
@@ -48,6 +48,7 @@
|
||||
"noUnreachableSuper": "error",
|
||||
"noUnsafeFinally": "error",
|
||||
"noUnsafeOptionalChaining": "error",
|
||||
"noUnusedImports": "error",
|
||||
"noUnusedLabels": "error",
|
||||
"noUnusedVariables": "warn",
|
||||
"useIsNan": "error",
|
||||
|
||||
@@ -41,7 +41,6 @@ import {
|
||||
vectorFromArray as badVectorFromArray,
|
||||
makeBuilder,
|
||||
makeData,
|
||||
makeTable,
|
||||
} from "apache-arrow";
|
||||
import { Buffers } from "apache-arrow/data";
|
||||
import { type EmbeddingFunction } from "./embedding/embedding_function";
|
||||
@@ -279,7 +278,7 @@ export class MakeArrowTableOptions {
|
||||
}
|
||||
|
||||
/**
|
||||
* An enhanced version of the {@link makeTable} function from Apache Arrow
|
||||
* An enhanced version of the apache-arrow makeTable function from Apache Arrow
|
||||
* that supports nested fields and embeddings columns.
|
||||
*
|
||||
* (typically you do not need to call this function. It will be called automatically
|
||||
@@ -512,7 +511,11 @@ function* rowPathsAndValues(
|
||||
if (isObject(value)) {
|
||||
yield* rowPathsAndValues(value, [...basePath, key]);
|
||||
} else {
|
||||
yield [[...basePath, key], value];
|
||||
// Skip undefined values - they should be treated the same as missing fields
|
||||
// for embedding function purposes
|
||||
if (value !== undefined) {
|
||||
yield [[...basePath, key], value];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -701,7 +704,7 @@ function transposeData(
|
||||
}
|
||||
return current;
|
||||
});
|
||||
return makeVector(values, field.type);
|
||||
return makeVector(values, field.type, undefined, field.nullable);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -748,9 +751,30 @@ function makeVector(
|
||||
values: unknown[],
|
||||
type?: DataType,
|
||||
stringAsDictionary?: boolean,
|
||||
nullable?: boolean,
|
||||
// biome-ignore lint/suspicious/noExplicitAny: skip
|
||||
): Vector<any> {
|
||||
if (type !== undefined) {
|
||||
// Convert undefined values to null for nullable fields
|
||||
if (nullable) {
|
||||
values = values.map((v) => (v === undefined ? null : v));
|
||||
}
|
||||
|
||||
// workaround for: https://github.com/apache/arrow-js/issues/68
|
||||
if (DataType.isBool(type)) {
|
||||
const hasNonNullValue = values.some((v) => v !== null && v !== undefined);
|
||||
if (!hasNonNullValue) {
|
||||
const nullBitmap = new Uint8Array(Math.ceil(values.length / 8));
|
||||
const data = makeData({
|
||||
type: type,
|
||||
length: values.length,
|
||||
nullCount: values.length,
|
||||
nullBitmap,
|
||||
});
|
||||
return arrowMakeVector(data);
|
||||
}
|
||||
}
|
||||
|
||||
// No need for inference, let Arrow create it
|
||||
if (type instanceof Int) {
|
||||
if (DataType.isInt(type) && type.bitWidth === 64) {
|
||||
@@ -875,7 +899,12 @@ async function applyEmbeddingsFromMetadata(
|
||||
for (const field of schema.fields) {
|
||||
if (!(field.name in columns)) {
|
||||
const nullValues = new Array(table.numRows).fill(null);
|
||||
columns[field.name] = makeVector(nullValues, field.type);
|
||||
columns[field.name] = makeVector(
|
||||
nullValues,
|
||||
field.type,
|
||||
undefined,
|
||||
field.nullable,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -939,7 +968,12 @@ async function applyEmbeddings<T>(
|
||||
} else if (schema != null) {
|
||||
const destField = schema.fields.find((f) => f.name === destColumn);
|
||||
if (destField != null) {
|
||||
newColumns[destColumn] = makeVector([], destField.type);
|
||||
newColumns[destColumn] = makeVector(
|
||||
[],
|
||||
destField.type,
|
||||
undefined,
|
||||
destField.nullable,
|
||||
);
|
||||
} else {
|
||||
throw new Error(
|
||||
`Attempt to apply embeddings to an empty table failed because schema was missing embedding column '${destColumn}'`,
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
|
||||
import {
|
||||
Data,
|
||||
Schema,
|
||||
SchemaLike,
|
||||
TableLike,
|
||||
fromTableToStreamBuffer,
|
||||
@@ -268,6 +267,33 @@ export abstract class Connection {
|
||||
* @param {string[]} namespace The namespace to drop tables from (defaults to root namespace).
|
||||
*/
|
||||
abstract dropAllTables(namespace?: string[]): Promise<void>;
|
||||
|
||||
/**
|
||||
* Clone a table from a source table.
|
||||
*
|
||||
* A shallow clone creates a new table that shares the underlying data files
|
||||
* with the source table but has its own independent manifest. This allows
|
||||
* both the source and cloned tables to evolve independently while initially
|
||||
* sharing the same data, deletion, and index files.
|
||||
*
|
||||
* @param {string} targetTableName - The name of the target table to create.
|
||||
* @param {string} sourceUri - The URI of the source table to clone from.
|
||||
* @param {object} options - Clone options.
|
||||
* @param {string[]} options.targetNamespace - The namespace for the target table (defaults to root namespace).
|
||||
* @param {number} options.sourceVersion - The version of the source table to clone.
|
||||
* @param {string} options.sourceTag - The tag of the source table to clone.
|
||||
* @param {boolean} options.isShallow - Whether to perform a shallow clone (defaults to true).
|
||||
*/
|
||||
abstract cloneTable(
|
||||
targetTableName: string,
|
||||
sourceUri: string,
|
||||
options?: {
|
||||
targetNamespace?: string[];
|
||||
sourceVersion?: number;
|
||||
sourceTag?: string;
|
||||
isShallow?: boolean;
|
||||
},
|
||||
): Promise<Table>;
|
||||
}
|
||||
|
||||
/** @hideconstructor */
|
||||
@@ -332,6 +358,28 @@ export class LocalConnection extends Connection {
|
||||
return new LocalTable(innerTable);
|
||||
}
|
||||
|
||||
async cloneTable(
|
||||
targetTableName: string,
|
||||
sourceUri: string,
|
||||
options?: {
|
||||
targetNamespace?: string[];
|
||||
sourceVersion?: number;
|
||||
sourceTag?: string;
|
||||
isShallow?: boolean;
|
||||
},
|
||||
): Promise<Table> {
|
||||
const innerTable = await this.inner.cloneTable(
|
||||
targetTableName,
|
||||
sourceUri,
|
||||
options?.targetNamespace ?? [],
|
||||
options?.sourceVersion ?? null,
|
||||
options?.sourceTag ?? null,
|
||||
options?.isShallow ?? true,
|
||||
);
|
||||
|
||||
return new LocalTable(innerTable);
|
||||
}
|
||||
|
||||
private getStorageOptions(
|
||||
options?: Partial<CreateTableOptions>,
|
||||
): Record<string, string> | undefined {
|
||||
|
||||
@@ -70,6 +70,23 @@ export class MergeInsertBuilder {
|
||||
this.#schema,
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Controls whether to use indexes for the merge operation.
|
||||
*
|
||||
* When set to `true` (the default), the operation will use an index if available
|
||||
* on the join key for improved performance. When set to `false`, it forces a full
|
||||
* table scan even if an index exists. This can be useful for benchmarking or when
|
||||
* the query optimizer chooses a suboptimal path.
|
||||
*
|
||||
* @param useIndex - Whether to use indices for the merge operation. Defaults to `true`.
|
||||
*/
|
||||
useIndex(useIndex: boolean): MergeInsertBuilder {
|
||||
return new MergeInsertBuilder(
|
||||
this.#native.useIndex(useIndex),
|
||||
this.#schema,
|
||||
);
|
||||
}
|
||||
/**
|
||||
* Executes the merge insert operation
|
||||
*
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-arm64",
|
||||
"version": "0.22.1-beta.1",
|
||||
"version": "0.22.2-beta.0",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.darwin-arm64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-darwin-x64",
|
||||
"version": "0.22.1-beta.1",
|
||||
"version": "0.22.2-beta.0",
|
||||
"os": ["darwin"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.darwin-x64.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-gnu",
|
||||
"version": "0.22.1-beta.1",
|
||||
"version": "0.22.2-beta.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-arm64-musl",
|
||||
"version": "0.22.1-beta.1",
|
||||
"version": "0.22.2-beta.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["arm64"],
|
||||
"main": "lancedb.linux-arm64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-gnu",
|
||||
"version": "0.22.1-beta.1",
|
||||
"version": "0.22.2-beta.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-gnu.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-linux-x64-musl",
|
||||
"version": "0.22.1-beta.1",
|
||||
"version": "0.22.2-beta.0",
|
||||
"os": ["linux"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.linux-x64-musl.node",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-arm64-msvc",
|
||||
"version": "0.22.1-beta.1",
|
||||
"version": "0.22.2-beta.0",
|
||||
"os": [
|
||||
"win32"
|
||||
],
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb-win32-x64-msvc",
|
||||
"version": "0.22.1-beta.1",
|
||||
"version": "0.22.2-beta.0",
|
||||
"os": ["win32"],
|
||||
"cpu": ["x64"],
|
||||
"main": "lancedb.win32-x64-msvc.node",
|
||||
|
||||
4
nodejs/package-lock.json
generated
4
nodejs/package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.22.1-beta.1",
|
||||
"version": "0.22.2-beta.0",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "@lancedb/lancedb",
|
||||
"version": "0.22.1-beta.1",
|
||||
"version": "0.22.2-beta.0",
|
||||
"cpu": [
|
||||
"x64",
|
||||
"arm64"
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
"ann"
|
||||
],
|
||||
"private": false,
|
||||
"version": "0.22.1-beta.1",
|
||||
"version": "0.22.2-beta.0",
|
||||
"main": "dist/index.js",
|
||||
"exports": {
|
||||
".": "./dist/index.js",
|
||||
|
||||
@@ -213,6 +213,36 @@ impl Connection {
|
||||
Ok(Table::new(tbl))
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn clone_table(
|
||||
&self,
|
||||
target_table_name: String,
|
||||
source_uri: String,
|
||||
target_namespace: Vec<String>,
|
||||
source_version: Option<i64>,
|
||||
source_tag: Option<String>,
|
||||
is_shallow: bool,
|
||||
) -> napi::Result<Table> {
|
||||
let mut builder = self
|
||||
.get_inner()?
|
||||
.clone_table(&target_table_name, &source_uri);
|
||||
|
||||
builder = builder.target_namespace(target_namespace);
|
||||
|
||||
if let Some(version) = source_version {
|
||||
builder = builder.source_version(version as u64);
|
||||
}
|
||||
|
||||
if let Some(tag) = source_tag {
|
||||
builder = builder.source_tag(tag);
|
||||
}
|
||||
|
||||
builder = builder.is_shallow(is_shallow);
|
||||
|
||||
let tbl = builder.execute().await.default_error()?;
|
||||
Ok(Table::new(tbl))
|
||||
}
|
||||
|
||||
/// Drop table with the name. Or raise an error if the table does not exist.
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn drop_table(&self, name: String, namespace: Vec<String>) -> napi::Result<()> {
|
||||
|
||||
@@ -43,6 +43,13 @@ impl NativeMergeInsertBuilder {
|
||||
self.inner.timeout(Duration::from_millis(timeout as u64));
|
||||
}
|
||||
|
||||
#[napi]
|
||||
pub fn use_index(&self, use_index: bool) -> Self {
|
||||
let mut this = self.clone();
|
||||
this.inner.use_index(use_index);
|
||||
this
|
||||
}
|
||||
|
||||
#[napi(catch_unwind)]
|
||||
pub async fn execute(&self, buf: Buffer) -> napi::Result<MergeResult> {
|
||||
let data = ipc_file_to_batches(buf.to_vec())
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[tool.bumpversion]
|
||||
current_version = "0.25.1-beta.2"
|
||||
current_version = "0.25.2-beta.1"
|
||||
parse = """(?x)
|
||||
(?P<major>0|[1-9]\\d*)\\.
|
||||
(?P<minor>0|[1-9]\\d*)\\.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb-python"
|
||||
version = "0.25.1-beta.2"
|
||||
version = "0.25.2-beta.1"
|
||||
edition.workspace = true
|
||||
description = "Python bindings for LanceDB"
|
||||
license.workspace = true
|
||||
|
||||
@@ -10,7 +10,7 @@ dependencies = [
|
||||
"pyarrow>=16",
|
||||
"pydantic>=1.10",
|
||||
"tqdm>=4.27.0",
|
||||
"lance-namespace==0.0.6"
|
||||
"lance-namespace>=0.0.16"
|
||||
]
|
||||
description = "lancedb"
|
||||
authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]
|
||||
|
||||
@@ -60,6 +60,15 @@ class Connection(object):
|
||||
storage_options: Optional[Dict[str, str]] = None,
|
||||
index_cache_size: Optional[int] = None,
|
||||
) -> Table: ...
|
||||
async def clone_table(
|
||||
self,
|
||||
target_table_name: str,
|
||||
source_uri: str,
|
||||
target_namespace: List[str] = [],
|
||||
source_version: Optional[int] = None,
|
||||
source_tag: Optional[str] = None,
|
||||
is_shallow: bool = True,
|
||||
) -> Table: ...
|
||||
async def rename_table(
|
||||
self,
|
||||
cur_name: str,
|
||||
|
||||
@@ -665,6 +665,60 @@ class LanceDBConnection(DBConnection):
|
||||
index_cache_size=index_cache_size,
|
||||
)
|
||||
|
||||
def clone_table(
|
||||
self,
|
||||
target_table_name: str,
|
||||
source_uri: str,
|
||||
*,
|
||||
target_namespace: List[str] = [],
|
||||
source_version: Optional[int] = None,
|
||||
source_tag: Optional[str] = None,
|
||||
is_shallow: bool = True,
|
||||
) -> LanceTable:
|
||||
"""Clone a table from a source table.
|
||||
|
||||
A shallow clone creates a new table that shares the underlying data files
|
||||
with the source table but has its own independent manifest. This allows
|
||||
both the source and cloned tables to evolve independently while initially
|
||||
sharing the same data, deletion, and index files.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
target_table_name: str
|
||||
The name of the target table to create.
|
||||
source_uri: str
|
||||
The URI of the source table to clone from.
|
||||
target_namespace: List[str], optional
|
||||
The namespace for the target table.
|
||||
None or empty list represents root namespace.
|
||||
source_version: int, optional
|
||||
The version of the source table to clone.
|
||||
source_tag: str, optional
|
||||
The tag of the source table to clone.
|
||||
is_shallow: bool, default True
|
||||
Whether to perform a shallow clone (True) or deep clone (False).
|
||||
Currently only shallow clone is supported.
|
||||
|
||||
Returns
|
||||
-------
|
||||
A LanceTable object representing the cloned table.
|
||||
"""
|
||||
LOOP.run(
|
||||
self._conn.clone_table(
|
||||
target_table_name,
|
||||
source_uri,
|
||||
target_namespace=target_namespace,
|
||||
source_version=source_version,
|
||||
source_tag=source_tag,
|
||||
is_shallow=is_shallow,
|
||||
)
|
||||
)
|
||||
return LanceTable.open(
|
||||
self,
|
||||
target_table_name,
|
||||
namespace=target_namespace,
|
||||
)
|
||||
|
||||
@override
|
||||
def drop_table(
|
||||
self,
|
||||
@@ -1136,6 +1190,54 @@ class AsyncConnection(object):
|
||||
)
|
||||
return AsyncTable(table)
|
||||
|
||||
async def clone_table(
|
||||
self,
|
||||
target_table_name: str,
|
||||
source_uri: str,
|
||||
*,
|
||||
target_namespace: List[str] = [],
|
||||
source_version: Optional[int] = None,
|
||||
source_tag: Optional[str] = None,
|
||||
is_shallow: bool = True,
|
||||
) -> AsyncTable:
|
||||
"""Clone a table from a source table.
|
||||
|
||||
A shallow clone creates a new table that shares the underlying data files
|
||||
with the source table but has its own independent manifest. This allows
|
||||
both the source and cloned tables to evolve independently while initially
|
||||
sharing the same data, deletion, and index files.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
target_table_name: str
|
||||
The name of the target table to create.
|
||||
source_uri: str
|
||||
The URI of the source table to clone from.
|
||||
target_namespace: List[str], optional
|
||||
The namespace for the target table.
|
||||
None or empty list represents root namespace.
|
||||
source_version: int, optional
|
||||
The version of the source table to clone.
|
||||
source_tag: str, optional
|
||||
The tag of the source table to clone.
|
||||
is_shallow: bool, default True
|
||||
Whether to perform a shallow clone (True) or deep clone (False).
|
||||
Currently only shallow clone is supported.
|
||||
|
||||
Returns
|
||||
-------
|
||||
An AsyncTable object representing the cloned table.
|
||||
"""
|
||||
table = await self._inner.clone_table(
|
||||
target_table_name,
|
||||
source_uri,
|
||||
target_namespace=target_namespace,
|
||||
source_version=source_version,
|
||||
source_tag=source_tag,
|
||||
is_shallow=is_shallow,
|
||||
)
|
||||
return AsyncTable(table)
|
||||
|
||||
async def rename_table(
|
||||
self,
|
||||
cur_name: str,
|
||||
|
||||
@@ -33,6 +33,7 @@ class LanceMergeInsertBuilder(object):
|
||||
self._when_not_matched_by_source_delete = False
|
||||
self._when_not_matched_by_source_condition = None
|
||||
self._timeout = None
|
||||
self._use_index = True
|
||||
|
||||
def when_matched_update_all(
|
||||
self, *, where: Optional[str] = None
|
||||
@@ -78,6 +79,23 @@ class LanceMergeInsertBuilder(object):
|
||||
self._when_not_matched_by_source_condition = condition
|
||||
return self
|
||||
|
||||
def use_index(self, use_index: bool) -> LanceMergeInsertBuilder:
|
||||
"""
|
||||
Controls whether to use indexes for the merge operation.
|
||||
|
||||
When set to `True` (the default), the operation will use an index if available
|
||||
on the join key for improved performance. When set to `False`, it forces a full
|
||||
table scan even if an index exists. This can be useful for benchmarking or when
|
||||
the query optimizer chooses a suboptimal path.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
use_index: bool
|
||||
Whether to use indices for the merge operation. Defaults to `True`.
|
||||
"""
|
||||
self._use_index = use_index
|
||||
return self
|
||||
|
||||
def execute(
|
||||
self,
|
||||
new_data: DATA,
|
||||
|
||||
@@ -212,6 +212,53 @@ class RemoteDBConnection(DBConnection):
|
||||
table = LOOP.run(self._conn.open_table(name, namespace=namespace))
|
||||
return RemoteTable(table, self.db_name)
|
||||
|
||||
def clone_table(
|
||||
self,
|
||||
target_table_name: str,
|
||||
source_uri: str,
|
||||
*,
|
||||
target_namespace: List[str] = [],
|
||||
source_version: Optional[int] = None,
|
||||
source_tag: Optional[str] = None,
|
||||
is_shallow: bool = True,
|
||||
) -> Table:
|
||||
"""Clone a table from a source table.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
target_table_name: str
|
||||
The name of the target table to create.
|
||||
source_uri: str
|
||||
The URI of the source table to clone from.
|
||||
target_namespace: List[str], optional
|
||||
The namespace for the target table.
|
||||
None or empty list represents root namespace.
|
||||
source_version: int, optional
|
||||
The version of the source table to clone.
|
||||
source_tag: str, optional
|
||||
The tag of the source table to clone.
|
||||
is_shallow: bool, default True
|
||||
Whether to perform a shallow clone (True) or deep clone (False).
|
||||
Currently only shallow clone is supported.
|
||||
|
||||
Returns
|
||||
-------
|
||||
A RemoteTable object representing the cloned table.
|
||||
"""
|
||||
from .table import RemoteTable
|
||||
|
||||
table = LOOP.run(
|
||||
self._conn.clone_table(
|
||||
target_table_name,
|
||||
source_uri,
|
||||
target_namespace=target_namespace,
|
||||
source_version=source_version,
|
||||
source_tag=source_tag,
|
||||
is_shallow=is_shallow,
|
||||
)
|
||||
)
|
||||
return RemoteTable(table, self.db_name)
|
||||
|
||||
@override
|
||||
def create_table(
|
||||
self,
|
||||
|
||||
@@ -9,6 +9,7 @@ from .linear_combination import LinearCombinationReranker
|
||||
from .openai import OpenaiReranker
|
||||
from .jinaai import JinaReranker
|
||||
from .rrf import RRFReranker
|
||||
from .mrr import MRRReranker
|
||||
from .answerdotai import AnswerdotaiRerankers
|
||||
from .voyageai import VoyageAIReranker
|
||||
|
||||
@@ -23,4 +24,5 @@ __all__ = [
|
||||
"RRFReranker",
|
||||
"AnswerdotaiRerankers",
|
||||
"VoyageAIReranker",
|
||||
"MRRReranker",
|
||||
]
|
||||
|
||||
169
python/python/lancedb/rerankers/mrr.py
Normal file
169
python/python/lancedb/rerankers/mrr.py
Normal file
@@ -0,0 +1,169 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
|
||||
from typing import Union, List, TYPE_CHECKING
|
||||
import pyarrow as pa
|
||||
import numpy as np
|
||||
|
||||
from collections import defaultdict
|
||||
from .base import Reranker
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..table import LanceVectorQueryBuilder
|
||||
|
||||
|
||||
class MRRReranker(Reranker):
|
||||
"""
|
||||
Reranks the results using Mean Reciprocal Rank (MRR) algorithm based
|
||||
on the scores of vector and FTS search.
|
||||
Algorithm reference - https://en.wikipedia.org/wiki/Mean_reciprocal_rank
|
||||
|
||||
MRR calculates the average of reciprocal ranks across different search results.
|
||||
For each document, it computes the reciprocal of its rank in each system,
|
||||
then takes the mean of these reciprocal ranks as the final score.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
weight_vector : float, default 0.5
|
||||
Weight for vector search results (0.0 to 1.0)
|
||||
weight_fts : float, default 0.5
|
||||
Weight for FTS search results (0.0 to 1.0)
|
||||
Note: weight_vector + weight_fts should equal 1.0
|
||||
return_score : str, default "relevance"
|
||||
Options are "relevance" or "all"
|
||||
The type of score to return. If "relevance", will return only the relevance
|
||||
score. If "all", will return all scores from the vector and FTS search along
|
||||
with the relevance score.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
weight_vector: float = 0.5,
|
||||
weight_fts: float = 0.5,
|
||||
return_score="relevance",
|
||||
):
|
||||
if not (0.0 <= weight_vector <= 1.0):
|
||||
raise ValueError("weight_vector must be between 0.0 and 1.0")
|
||||
if not (0.0 <= weight_fts <= 1.0):
|
||||
raise ValueError("weight_fts must be between 0.0 and 1.0")
|
||||
if abs(weight_vector + weight_fts - 1.0) > 1e-6:
|
||||
raise ValueError("weight_vector + weight_fts must equal 1.0")
|
||||
|
||||
super().__init__(return_score)
|
||||
self.weight_vector = weight_vector
|
||||
self.weight_fts = weight_fts
|
||||
|
||||
def rerank_hybrid(
|
||||
self,
|
||||
query: str, # noqa: F821
|
||||
vector_results: pa.Table,
|
||||
fts_results: pa.Table,
|
||||
):
|
||||
vector_ids = vector_results["_rowid"].to_pylist() if vector_results else []
|
||||
fts_ids = fts_results["_rowid"].to_pylist() if fts_results else []
|
||||
|
||||
# Maps result_id to list of (type, reciprocal_rank)
|
||||
mrr_score_map = defaultdict(list)
|
||||
|
||||
if vector_ids:
|
||||
for rank, result_id in enumerate(vector_ids, 1):
|
||||
reciprocal_rank = 1.0 / rank
|
||||
mrr_score_map[result_id].append(("vector", reciprocal_rank))
|
||||
|
||||
if fts_ids:
|
||||
for rank, result_id in enumerate(fts_ids, 1):
|
||||
reciprocal_rank = 1.0 / rank
|
||||
mrr_score_map[result_id].append(("fts", reciprocal_rank))
|
||||
|
||||
final_mrr_scores = {}
|
||||
for result_id, scores in mrr_score_map.items():
|
||||
vector_rr = 0.0
|
||||
fts_rr = 0.0
|
||||
|
||||
for score_type, reciprocal_rank in scores:
|
||||
if score_type == "vector":
|
||||
vector_rr = reciprocal_rank
|
||||
elif score_type == "fts":
|
||||
fts_rr = reciprocal_rank
|
||||
|
||||
# If a document doesn't appear, its reciprocal rank is 0
|
||||
weighted_mrr = self.weight_vector * vector_rr + self.weight_fts * fts_rr
|
||||
final_mrr_scores[result_id] = weighted_mrr
|
||||
|
||||
combined_results = self.merge_results(vector_results, fts_results)
|
||||
combined_row_ids = combined_results["_rowid"].to_pylist()
|
||||
relevance_scores = [final_mrr_scores[row_id] for row_id in combined_row_ids]
|
||||
combined_results = combined_results.append_column(
|
||||
"_relevance_score", pa.array(relevance_scores, type=pa.float32())
|
||||
)
|
||||
combined_results = combined_results.sort_by(
|
||||
[("_relevance_score", "descending")]
|
||||
)
|
||||
|
||||
if self.score == "relevance":
|
||||
combined_results = self._keep_relevance_score(combined_results)
|
||||
|
||||
return combined_results
|
||||
|
||||
def rerank_multivector(
|
||||
self,
|
||||
vector_results: Union[List[pa.Table], List["LanceVectorQueryBuilder"]],
|
||||
query: str = None,
|
||||
deduplicate: bool = True, # noqa: F821
|
||||
):
|
||||
"""
|
||||
Reranks the results from multiple vector searches using MRR algorithm.
|
||||
Each vector search result is treated as a separate ranking system,
|
||||
and MRR calculates the mean of reciprocal ranks across all systems.
|
||||
This cannot reuse rerank_hybrid because MRR semantics require treating
|
||||
each vector result as a separate ranking system.
|
||||
"""
|
||||
if not all(isinstance(v, type(vector_results[0])) for v in vector_results):
|
||||
raise ValueError(
|
||||
"All elements in vector_results should be of the same type"
|
||||
)
|
||||
|
||||
# avoid circular import
|
||||
if type(vector_results[0]).__name__ == "LanceVectorQueryBuilder":
|
||||
vector_results = [result.to_arrow() for result in vector_results]
|
||||
elif not isinstance(vector_results[0], pa.Table):
|
||||
raise ValueError(
|
||||
"vector_results should be a list of pa.Table or LanceVectorQueryBuilder"
|
||||
)
|
||||
|
||||
if not all("_rowid" in result.column_names for result in vector_results):
|
||||
raise ValueError(
|
||||
"'_rowid' is required for deduplication. \
|
||||
add _rowid to search results like this: \
|
||||
`search().with_row_id(True)`"
|
||||
)
|
||||
|
||||
mrr_score_map = defaultdict(list)
|
||||
|
||||
for result_table in vector_results:
|
||||
result_ids = result_table["_rowid"].to_pylist()
|
||||
for rank, result_id in enumerate(result_ids, 1):
|
||||
reciprocal_rank = 1.0 / rank
|
||||
mrr_score_map[result_id].append(reciprocal_rank)
|
||||
|
||||
final_mrr_scores = {}
|
||||
for result_id, reciprocal_ranks in mrr_score_map.items():
|
||||
mean_rr = np.mean(reciprocal_ranks)
|
||||
final_mrr_scores[result_id] = mean_rr
|
||||
|
||||
combined = pa.concat_tables(vector_results, **self._concat_tables_args)
|
||||
combined = self._deduplicate(combined)
|
||||
|
||||
combined_row_ids = combined["_rowid"].to_pylist()
|
||||
|
||||
relevance_scores = [final_mrr_scores[row_id] for row_id in combined_row_ids]
|
||||
combined = combined.append_column(
|
||||
"_relevance_score", pa.array(relevance_scores, type=pa.float32())
|
||||
)
|
||||
combined = combined.sort_by([("_relevance_score", "descending")])
|
||||
|
||||
if self.score == "relevance":
|
||||
combined = self._keep_relevance_score(combined)
|
||||
|
||||
return combined
|
||||
@@ -1470,10 +1470,7 @@ class Table(ABC):
|
||||
be deleted unless they are at least 7 days old. If delete_unverified is True
|
||||
then these files will be deleted regardless of their age.
|
||||
retrain: bool, default False
|
||||
If True, retrain the vector indices, this would refine the IVF clustering
|
||||
and quantization, which may improve the search accuracy. It's faster than
|
||||
re-creating the index from scratch, so it's recommended to try this first,
|
||||
when the data distribution has changed significantly.
|
||||
This parameter is no longer used and is deprecated.
|
||||
|
||||
Experimental API
|
||||
----------------
|
||||
@@ -2835,10 +2832,7 @@ class LanceTable(Table):
|
||||
be deleted unless they are at least 7 days old. If delete_unverified is True
|
||||
then these files will be deleted regardless of their age.
|
||||
retrain: bool, default False
|
||||
If True, retrain the vector indices, this would refine the IVF clustering
|
||||
and quantization, which may improve the search accuracy. It's faster than
|
||||
re-creating the index from scratch, so it's recommended to try this first,
|
||||
when the data distribution has changed significantly.
|
||||
This parameter is no longer used and is deprecated.
|
||||
|
||||
Experimental API
|
||||
----------------
|
||||
@@ -3926,6 +3920,7 @@ class AsyncTable:
|
||||
when_not_matched_by_source_delete=merge._when_not_matched_by_source_delete,
|
||||
when_not_matched_by_source_condition=merge._when_not_matched_by_source_condition,
|
||||
timeout=merge._timeout,
|
||||
use_index=merge._use_index,
|
||||
),
|
||||
)
|
||||
|
||||
@@ -4298,10 +4293,7 @@ class AsyncTable:
|
||||
be deleted unless they are at least 7 days old. If delete_unverified is True
|
||||
then these files will be deleted regardless of their age.
|
||||
retrain: bool, default False
|
||||
If True, retrain the vector indices, this would refine the IVF clustering
|
||||
and quantization, which may improve the search accuracy. It's faster than
|
||||
re-creating the index from scratch, so it's recommended to try this first,
|
||||
when the data distribution has changed significantly.
|
||||
This parameter is no longer used and is deprecated.
|
||||
|
||||
Experimental API
|
||||
----------------
|
||||
@@ -4324,10 +4316,19 @@ class AsyncTable:
|
||||
cleanup_since_ms: Optional[int] = None
|
||||
if cleanup_older_than is not None:
|
||||
cleanup_since_ms = round(cleanup_older_than.total_seconds() * 1000)
|
||||
|
||||
if retrain:
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
"The 'retrain' parameter is deprecated and will be removed in a "
|
||||
"future version.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
|
||||
return await self._inner.optimize(
|
||||
cleanup_since_ms=cleanup_since_ms,
|
||||
delete_unverified=delete_unverified,
|
||||
retrain=retrain,
|
||||
)
|
||||
|
||||
async def list_indices(self) -> Iterable[IndexConfig]:
|
||||
|
||||
@@ -747,15 +747,16 @@ def test_local_namespace_operations(tmp_path):
|
||||
# Create a local database connection
|
||||
db = lancedb.connect(tmp_path)
|
||||
|
||||
# Test list_namespaces returns empty list
|
||||
# Test list_namespaces returns empty list for root namespace
|
||||
namespaces = list(db.list_namespaces())
|
||||
assert namespaces == []
|
||||
|
||||
# Test list_namespaces with parameters still returns empty list
|
||||
namespaces_with_params = list(
|
||||
db.list_namespaces(namespace=["test"], page_token="token", limit=5)
|
||||
)
|
||||
assert namespaces_with_params == []
|
||||
# Test list_namespaces with non-empty namespace raises NotImplementedError
|
||||
with pytest.raises(
|
||||
NotImplementedError,
|
||||
match="Namespace operations are not supported for listing database",
|
||||
):
|
||||
list(db.list_namespaces(namespace=["test"]))
|
||||
|
||||
|
||||
def test_local_create_namespace_not_supported(tmp_path):
|
||||
@@ -830,3 +831,119 @@ def test_local_table_operations_with_namespace_raise_error(tmp_path):
|
||||
# Test table_names without namespace - should work normally
|
||||
tables_root = list(db.table_names())
|
||||
assert "test_table" in tables_root
|
||||
|
||||
|
||||
def test_clone_table_latest_version(tmp_path):
|
||||
"""Test cloning a table with the latest version (default behavior)"""
|
||||
import os
|
||||
|
||||
db = lancedb.connect(tmp_path)
|
||||
|
||||
# Create source table with some data
|
||||
data = [
|
||||
{"id": 1, "text": "hello", "vector": [1.0, 2.0]},
|
||||
{"id": 2, "text": "world", "vector": [3.0, 4.0]},
|
||||
]
|
||||
source_table = db.create_table("source", data=data)
|
||||
|
||||
# Add more data to create a new version
|
||||
more_data = [{"id": 3, "text": "test", "vector": [5.0, 6.0]}]
|
||||
source_table.add(more_data)
|
||||
|
||||
# Clone the table (should get latest version with 3 rows)
|
||||
source_uri = os.path.join(tmp_path, "source.lance")
|
||||
cloned_table = db.clone_table("cloned", source_uri)
|
||||
|
||||
# Verify cloned table has all 3 rows
|
||||
assert cloned_table.count_rows() == 3
|
||||
assert "cloned" in db.table_names()
|
||||
|
||||
# Verify data matches
|
||||
cloned_data = cloned_table.to_pandas()
|
||||
assert len(cloned_data) == 3
|
||||
assert set(cloned_data["id"].tolist()) == {1, 2, 3}
|
||||
|
||||
|
||||
def test_clone_table_specific_version(tmp_path):
|
||||
"""Test cloning a table from a specific version"""
|
||||
import os
|
||||
|
||||
db = lancedb.connect(tmp_path)
|
||||
|
||||
# Create source table with initial data
|
||||
data = [
|
||||
{"id": 1, "text": "hello", "vector": [1.0, 2.0]},
|
||||
{"id": 2, "text": "world", "vector": [3.0, 4.0]},
|
||||
]
|
||||
source_table = db.create_table("source", data=data)
|
||||
|
||||
# Get the initial version
|
||||
initial_version = source_table.version
|
||||
|
||||
# Add more data to create a new version
|
||||
more_data = [{"id": 3, "text": "test", "vector": [5.0, 6.0]}]
|
||||
source_table.add(more_data)
|
||||
|
||||
# Verify source now has 3 rows
|
||||
assert source_table.count_rows() == 3
|
||||
|
||||
# Clone from the initial version (should have only 2 rows)
|
||||
source_uri = os.path.join(tmp_path, "source.lance")
|
||||
cloned_table = db.clone_table("cloned", source_uri, source_version=initial_version)
|
||||
|
||||
# Verify cloned table has only the initial 2 rows
|
||||
assert cloned_table.count_rows() == 2
|
||||
cloned_data = cloned_table.to_pandas()
|
||||
assert set(cloned_data["id"].tolist()) == {1, 2}
|
||||
|
||||
|
||||
def test_clone_table_with_tag(tmp_path):
|
||||
"""Test cloning a table from a tagged version"""
|
||||
import os
|
||||
|
||||
db = lancedb.connect(tmp_path)
|
||||
|
||||
# Create source table with initial data
|
||||
data = [
|
||||
{"id": 1, "text": "hello", "vector": [1.0, 2.0]},
|
||||
{"id": 2, "text": "world", "vector": [3.0, 4.0]},
|
||||
]
|
||||
source_table = db.create_table("source", data=data)
|
||||
|
||||
# Create a tag for the current version
|
||||
source_table.tags.create("v1.0", source_table.version)
|
||||
|
||||
# Add more data after the tag
|
||||
more_data = [{"id": 3, "text": "test", "vector": [5.0, 6.0]}]
|
||||
source_table.add(more_data)
|
||||
|
||||
# Verify source now has 3 rows
|
||||
assert source_table.count_rows() == 3
|
||||
|
||||
# Clone from the tagged version (should have only 2 rows)
|
||||
source_uri = os.path.join(tmp_path, "source.lance")
|
||||
cloned_table = db.clone_table("cloned", source_uri, source_tag="v1.0")
|
||||
|
||||
# Verify cloned table has only the tagged version's 2 rows
|
||||
assert cloned_table.count_rows() == 2
|
||||
cloned_data = cloned_table.to_pandas()
|
||||
assert set(cloned_data["id"].tolist()) == {1, 2}
|
||||
|
||||
|
||||
def test_clone_table_deep_clone_fails(tmp_path):
|
||||
"""Test that deep clone raises an unsupported error"""
|
||||
import os
|
||||
|
||||
db = lancedb.connect(tmp_path)
|
||||
|
||||
# Create source table with some data
|
||||
data = [
|
||||
{"id": 1, "text": "hello", "vector": [1.0, 2.0]},
|
||||
{"id": 2, "text": "world", "vector": [3.0, 4.0]},
|
||||
]
|
||||
db.create_table("source", data=data)
|
||||
|
||||
# Try to create a deep clone (should fail)
|
||||
source_uri = os.path.join(tmp_path, "source.lance")
|
||||
with pytest.raises(Exception, match="Deep clone is not yet implemented"):
|
||||
db.clone_table("cloned", source_uri, is_shallow=False)
|
||||
|
||||
@@ -35,6 +35,8 @@ async def some_table(db_async):
|
||||
"tags": [
|
||||
[f"tag{random.randint(0, 8)}" for _ in range(2)] for _ in range(NROWS)
|
||||
],
|
||||
"is_active": [random.choice([True, False]) for _ in range(NROWS)],
|
||||
"data": [random.randbytes(random.randint(0, 128)) for _ in range(NROWS)],
|
||||
}
|
||||
)
|
||||
return await db_async.create_table(
|
||||
@@ -99,10 +101,17 @@ async def test_create_fixed_size_binary_index(some_table: AsyncTable):
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_bitmap_index(some_table: AsyncTable):
|
||||
await some_table.create_index("id", config=Bitmap())
|
||||
await some_table.create_index("is_active", config=Bitmap())
|
||||
await some_table.create_index("data", config=Bitmap())
|
||||
indices = await some_table.list_indices()
|
||||
assert str(indices) == '[Index(Bitmap, columns=["id"], name="id_idx")]'
|
||||
indices = await some_table.list_indices()
|
||||
assert len(indices) == 1
|
||||
assert len(indices) == 3
|
||||
assert indices[0].index_type == "Bitmap"
|
||||
assert indices[0].columns == ["id"]
|
||||
assert indices[1].index_type == "Bitmap"
|
||||
assert indices[1].columns == ["is_active"]
|
||||
assert indices[2].index_type == "Bitmap"
|
||||
assert indices[2].columns == ["data"]
|
||||
|
||||
index_name = indices[0].name
|
||||
stats = await some_table.index_stats(index_name)
|
||||
assert stats.index_type == "BITMAP"
|
||||
@@ -111,6 +120,11 @@ async def test_create_bitmap_index(some_table: AsyncTable):
|
||||
assert stats.num_unindexed_rows == 0
|
||||
assert stats.num_indices == 1
|
||||
|
||||
assert (
|
||||
"ScalarIndexQuery"
|
||||
in await some_table.query().where("is_active = TRUE").explain_plan()
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_create_label_list_index(some_table: AsyncTable):
|
||||
|
||||
@@ -22,6 +22,7 @@ from lancedb.rerankers import (
|
||||
JinaReranker,
|
||||
AnswerdotaiRerankers,
|
||||
VoyageAIReranker,
|
||||
MRRReranker,
|
||||
)
|
||||
from lancedb.table import LanceTable
|
||||
|
||||
@@ -46,6 +47,7 @@ def get_test_table(tmp_path, use_tantivy):
|
||||
db,
|
||||
"my_table",
|
||||
schema=MyTable,
|
||||
mode="overwrite",
|
||||
)
|
||||
|
||||
# Need to test with a bunch of phrases to make sure sorting is consistent
|
||||
@@ -96,7 +98,7 @@ def get_test_table(tmp_path, use_tantivy):
|
||||
)
|
||||
|
||||
# Create a fts index
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy, replace=True)
|
||||
|
||||
return table, MyTable
|
||||
|
||||
@@ -320,6 +322,34 @@ def test_rrf_reranker(tmp_path, use_tantivy):
|
||||
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_mrr_reranker(tmp_path, use_tantivy):
|
||||
reranker = MRRReranker()
|
||||
_run_test_hybrid_reranker(reranker, tmp_path, use_tantivy)
|
||||
|
||||
# Test multi-vector part
|
||||
table, schema = get_test_table(tmp_path, use_tantivy)
|
||||
query = "single player experience"
|
||||
rs1 = table.search(query, vector_column_name="vector").limit(10).with_row_id(True)
|
||||
rs2 = (
|
||||
table.search(query, vector_column_name="meta_vector")
|
||||
.limit(10)
|
||||
.with_row_id(True)
|
||||
)
|
||||
result = reranker.rerank_multivector([rs1, rs2])
|
||||
assert "_relevance_score" in result.column_names
|
||||
assert len(result) <= 20
|
||||
|
||||
if len(result) > 1:
|
||||
assert np.all(np.diff(result.column("_relevance_score").to_numpy()) <= 0), (
|
||||
"The _relevance_score should be descending."
|
||||
)
|
||||
|
||||
# Test with duplicate results
|
||||
result_deduped = reranker.rerank_multivector([rs1, rs2, rs1])
|
||||
assert len(result_deduped) == len(result)
|
||||
|
||||
|
||||
def test_rrf_reranker_distance():
|
||||
data = pa.table(
|
||||
{
|
||||
|
||||
@@ -163,6 +163,34 @@ impl Connection {
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (target_table_name, source_uri, target_namespace=vec![], source_version=None, source_tag=None, is_shallow=true))]
|
||||
pub fn clone_table(
|
||||
self_: PyRef<'_, Self>,
|
||||
target_table_name: String,
|
||||
source_uri: String,
|
||||
target_namespace: Vec<String>,
|
||||
source_version: Option<u64>,
|
||||
source_tag: Option<String>,
|
||||
is_shallow: bool,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.get_inner()?.clone();
|
||||
|
||||
let mut builder = inner.clone_table(target_table_name, source_uri);
|
||||
builder = builder.target_namespace(target_namespace);
|
||||
if let Some(version) = source_version {
|
||||
builder = builder.source_version(version);
|
||||
}
|
||||
if let Some(tag) = source_tag {
|
||||
builder = builder.source_tag(tag);
|
||||
}
|
||||
builder = builder.is_shallow(is_shallow);
|
||||
|
||||
future_into_py(self_.py(), async move {
|
||||
let table = builder.execute().await.infer_error()?;
|
||||
Ok(Table::new(table))
|
||||
})
|
||||
}
|
||||
|
||||
#[pyo3(signature = (cur_name, new_name, cur_namespace=vec![], new_namespace=vec![]))]
|
||||
pub fn rename_table(
|
||||
self_: PyRef<'_, Self>,
|
||||
|
||||
@@ -591,12 +591,11 @@ impl Table {
|
||||
}
|
||||
|
||||
/// Optimize the on-disk data by compacting and pruning old data, for better performance.
|
||||
#[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None, retrain=None))]
|
||||
#[pyo3(signature = (cleanup_since_ms=None, delete_unverified=None))]
|
||||
pub fn optimize(
|
||||
self_: PyRef<'_, Self>,
|
||||
cleanup_since_ms: Option<u64>,
|
||||
delete_unverified: Option<bool>,
|
||||
retrain: Option<bool>,
|
||||
) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner_ref()?.clone();
|
||||
let older_than = if let Some(ms) = cleanup_since_ms {
|
||||
@@ -632,10 +631,9 @@ impl Table {
|
||||
.prune
|
||||
.unwrap();
|
||||
inner
|
||||
.optimize(lancedb::table::OptimizeAction::Index(match retrain {
|
||||
Some(true) => OptimizeOptions::retrain(),
|
||||
_ => OptimizeOptions::default(),
|
||||
}))
|
||||
.optimize(lancedb::table::OptimizeAction::Index(
|
||||
OptimizeOptions::default(),
|
||||
))
|
||||
.await
|
||||
.infer_error()?;
|
||||
Ok(OptimizeStats {
|
||||
@@ -674,6 +672,9 @@ impl Table {
|
||||
if let Some(timeout) = parameters.timeout {
|
||||
builder.timeout(timeout);
|
||||
}
|
||||
if let Some(use_index) = parameters.use_index {
|
||||
builder.use_index(use_index);
|
||||
}
|
||||
|
||||
future_into_py(self_.py(), async move {
|
||||
let res = builder.execute(Box::new(batches)).await.infer_error()?;
|
||||
@@ -833,6 +834,7 @@ pub struct MergeInsertParams {
|
||||
when_not_matched_by_source_delete: bool,
|
||||
when_not_matched_by_source_condition: Option<String>,
|
||||
timeout: Option<std::time::Duration>,
|
||||
use_index: Option<bool>,
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "lancedb"
|
||||
version = "0.22.1-beta.1"
|
||||
version = "0.22.2-beta.0"
|
||||
edition.workspace = true
|
||||
description = "LanceDB: A serverless, low-latency vector database for AI applications"
|
||||
license.workspace = true
|
||||
@@ -36,6 +36,7 @@ lance-table = { workspace = true }
|
||||
lance-linalg = { workspace = true }
|
||||
lance-testing = { workspace = true }
|
||||
lance-encoding = { workspace = true }
|
||||
lance-namespace = { workspace = true }
|
||||
moka = { workspace = true }
|
||||
pin-project = { workspace = true }
|
||||
tokio = { version = "1.23", features = ["rt-multi-thread"] }
|
||||
@@ -81,6 +82,7 @@ crunchy.workspace = true
|
||||
bytemuck_derive.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
anyhow = "1"
|
||||
tempfile = "3.5.0"
|
||||
rand = { version = "0.9", features = ["small_rng"] }
|
||||
random_word = { version = "0.4.3", features = ["en"] }
|
||||
|
||||
19
rust/lancedb/Makefile
Normal file
19
rust/lancedb/Makefile
Normal file
@@ -0,0 +1,19 @@
|
||||
#
|
||||
# Makefile for running tests.
|
||||
#
|
||||
|
||||
# Run all tests.
|
||||
all-tests: feature-tests remote-tests
|
||||
|
||||
# Run tests for every feature. This requires using docker compose to set up
|
||||
# the environment.
|
||||
feature-tests:
|
||||
../../ci/run_with_docker_compose.sh \
|
||||
cargo test --all-features --tests --locked --examples
|
||||
.PHONY: feature-tests
|
||||
|
||||
# Run tests against remote endpoints.
|
||||
remote-tests:
|
||||
../../ci/run_with_test_connection.sh \
|
||||
cargo test --features remote --locked
|
||||
.PHONY: remote-tests
|
||||
@@ -17,9 +17,9 @@ use crate::database::listing::{
|
||||
ListingDatabase, OPT_NEW_TABLE_STORAGE_VERSION, OPT_NEW_TABLE_V2_MANIFEST_PATHS,
|
||||
};
|
||||
use crate::database::{
|
||||
CreateNamespaceRequest, CreateTableData, CreateTableMode, CreateTableRequest, Database,
|
||||
DatabaseOptions, DropNamespaceRequest, ListNamespacesRequest, OpenTableRequest,
|
||||
TableNamesRequest,
|
||||
CloneTableRequest, CreateNamespaceRequest, CreateTableData, CreateTableMode,
|
||||
CreateTableRequest, Database, DatabaseOptions, DropNamespaceRequest, ListNamespacesRequest,
|
||||
OpenTableRequest, TableNamesRequest,
|
||||
};
|
||||
use crate::embeddings::{
|
||||
EmbeddingDefinition, EmbeddingFunction, EmbeddingRegistry, MemoryRegistry, WithEmbeddings,
|
||||
@@ -469,6 +469,62 @@ impl OpenTableBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
/// Builder for cloning a table.
|
||||
///
|
||||
/// A shallow clone creates a new table that shares the underlying data files
|
||||
/// with the source table but has its own independent manifest. Both the source
|
||||
/// and cloned tables can evolve independently while initially sharing the same
|
||||
/// data, deletion, and index files.
|
||||
///
|
||||
/// Use this builder to configure the clone operation before executing it.
|
||||
pub struct CloneTableBuilder {
|
||||
parent: Arc<dyn Database>,
|
||||
request: CloneTableRequest,
|
||||
}
|
||||
|
||||
impl CloneTableBuilder {
|
||||
fn new(parent: Arc<dyn Database>, target_table_name: String, source_uri: String) -> Self {
|
||||
Self {
|
||||
parent,
|
||||
request: CloneTableRequest::new(target_table_name, source_uri),
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the source version to clone from
|
||||
pub fn source_version(mut self, version: u64) -> Self {
|
||||
self.request.source_version = Some(version);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the source tag to clone from
|
||||
pub fn source_tag(mut self, tag: impl Into<String>) -> Self {
|
||||
self.request.source_tag = Some(tag.into());
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the target namespace for the cloned table
|
||||
pub fn target_namespace(mut self, namespace: Vec<String>) -> Self {
|
||||
self.request.target_namespace = namespace;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set whether to perform a shallow clone (default: true)
|
||||
///
|
||||
/// When true, the cloned table shares data files with the source table.
|
||||
/// When false, performs a deep clone (not yet implemented).
|
||||
pub fn is_shallow(mut self, is_shallow: bool) -> Self {
|
||||
self.request.is_shallow = is_shallow;
|
||||
self
|
||||
}
|
||||
|
||||
/// Execute the clone operation
|
||||
pub async fn execute(self) -> Result<Table> {
|
||||
Ok(Table::new(
|
||||
self.parent.clone().clone_table(self.request).await?,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
/// A connection to LanceDB
|
||||
#[derive(Clone)]
|
||||
pub struct Connection {
|
||||
@@ -575,6 +631,30 @@ impl Connection {
|
||||
)
|
||||
}
|
||||
|
||||
/// Clone a table in the database
|
||||
///
|
||||
/// Creates a new table by cloning from an existing source table.
|
||||
/// By default, this performs a shallow clone where the new table shares
|
||||
/// the underlying data files with the source table.
|
||||
///
|
||||
/// # Parameters
|
||||
/// - `target_table_name`: The name of the new table to create
|
||||
/// - `source_uri`: The URI of the source table to clone from
|
||||
///
|
||||
/// # Returns
|
||||
/// A [`CloneTableBuilder`] that can be used to configure the clone operation
|
||||
pub fn clone_table(
|
||||
&self,
|
||||
target_table_name: impl Into<String>,
|
||||
source_uri: impl Into<String>,
|
||||
) -> CloneTableBuilder {
|
||||
CloneTableBuilder::new(
|
||||
self.internal.clone(),
|
||||
target_table_name.into(),
|
||||
source_uri.into(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Rename a table in the database.
|
||||
///
|
||||
/// This is only supported in LanceDB Cloud.
|
||||
@@ -935,6 +1015,117 @@ pub fn connect(uri: &str) -> ConnectBuilder {
|
||||
ConnectBuilder::new(uri)
|
||||
}
|
||||
|
||||
pub struct ConnectNamespaceBuilder {
|
||||
ns_impl: String,
|
||||
properties: HashMap<String, String>,
|
||||
storage_options: HashMap<String, String>,
|
||||
read_consistency_interval: Option<std::time::Duration>,
|
||||
embedding_registry: Option<Arc<dyn EmbeddingRegistry>>,
|
||||
session: Option<Arc<lance::session::Session>>,
|
||||
}
|
||||
|
||||
impl ConnectNamespaceBuilder {
|
||||
fn new(ns_impl: &str, properties: HashMap<String, String>) -> Self {
|
||||
Self {
|
||||
ns_impl: ns_impl.to_string(),
|
||||
properties,
|
||||
storage_options: HashMap::new(),
|
||||
read_consistency_interval: None,
|
||||
embedding_registry: None,
|
||||
session: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Set an option for the storage layer.
|
||||
///
|
||||
/// See available options at <https://lancedb.github.io/lancedb/guides/storage/>
|
||||
pub fn storage_option(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
|
||||
self.storage_options.insert(key.into(), value.into());
|
||||
self
|
||||
}
|
||||
|
||||
/// Set multiple options for the storage layer.
|
||||
///
|
||||
/// See available options at <https://lancedb.github.io/lancedb/guides/storage/>
|
||||
pub fn storage_options(
|
||||
mut self,
|
||||
pairs: impl IntoIterator<Item = (impl Into<String>, impl Into<String>)>,
|
||||
) -> Self {
|
||||
for (key, value) in pairs {
|
||||
self.storage_options.insert(key.into(), value.into());
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
/// The interval at which to check for updates from other processes.
|
||||
///
|
||||
/// If left unset, consistency is not checked. For maximum read
|
||||
/// performance, this is the default. For strong consistency, set this to
|
||||
/// zero seconds. Then every read will check for updates from other processes.
|
||||
/// As a compromise, set this to a non-zero duration for eventual consistency.
|
||||
pub fn read_consistency_interval(
|
||||
mut self,
|
||||
read_consistency_interval: std::time::Duration,
|
||||
) -> Self {
|
||||
self.read_consistency_interval = Some(read_consistency_interval);
|
||||
self
|
||||
}
|
||||
|
||||
/// Provide a custom [`EmbeddingRegistry`] to use for this connection.
|
||||
pub fn embedding_registry(mut self, registry: Arc<dyn EmbeddingRegistry>) -> Self {
|
||||
self.embedding_registry = Some(registry);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set a custom session for object stores and caching.
|
||||
///
|
||||
/// By default, a new session with default configuration will be created.
|
||||
/// This method allows you to provide a custom session with your own
|
||||
/// configuration for object store registries, caching, etc.
|
||||
pub fn session(mut self, session: Arc<lance::session::Session>) -> Self {
|
||||
self.session = Some(session);
|
||||
self
|
||||
}
|
||||
|
||||
/// Execute the connection
|
||||
pub async fn execute(self) -> Result<Connection> {
|
||||
use crate::database::namespace::LanceNamespaceDatabase;
|
||||
|
||||
let internal = Arc::new(
|
||||
LanceNamespaceDatabase::connect(
|
||||
&self.ns_impl,
|
||||
self.properties,
|
||||
self.storage_options,
|
||||
self.read_consistency_interval,
|
||||
self.session,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
|
||||
Ok(Connection {
|
||||
internal,
|
||||
uri: format!("namespace://{}", self.ns_impl),
|
||||
embedding_registry: self
|
||||
.embedding_registry
|
||||
.unwrap_or_else(|| Arc::new(MemoryRegistry::new())),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Connect to a LanceDB database through a namespace.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `ns_impl` - The namespace implementation to use (e.g., "dir" for directory-based, "rest" for REST API)
|
||||
/// * `properties` - Configuration properties for the namespace implementation
|
||||
/// ```
|
||||
pub fn connect_namespace(
|
||||
ns_impl: &str,
|
||||
properties: HashMap<String, String>,
|
||||
) -> ConnectNamespaceBuilder {
|
||||
ConnectNamespaceBuilder::new(ns_impl, properties)
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "remote"))]
|
||||
mod test_utils {
|
||||
use super::*;
|
||||
@@ -979,6 +1170,7 @@ mod tests {
|
||||
use crate::database::listing::{ListingDatabaseOptions, NewTableConfig};
|
||||
use crate::query::QueryBase;
|
||||
use crate::query::{ExecutableQuery, QueryExecutionOptions};
|
||||
use crate::test_connection::test_utils::new_test_connection;
|
||||
use arrow::compute::concat_batches;
|
||||
use arrow_array::RecordBatchReader;
|
||||
use arrow_schema::{DataType, Field, Schema};
|
||||
@@ -994,11 +1186,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_connect() {
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let uri = tmp_dir.path().to_str().unwrap();
|
||||
let db = connect(uri).execute().await.unwrap();
|
||||
|
||||
assert_eq!(db.uri, uri);
|
||||
let tc = new_test_connection().await.unwrap();
|
||||
assert_eq!(tc.connection.uri, tc.uri);
|
||||
}
|
||||
|
||||
#[cfg(not(windows))]
|
||||
@@ -1064,16 +1253,10 @@ mod tests {
|
||||
assert_eq!(tables, names[..7]);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_connect_s3() {
|
||||
// let db = Database::connect("s3://bucket/path/to/database").await.unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_open_table() {
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let uri = tmp_dir.path().to_str().unwrap();
|
||||
let db = connect(uri).execute().await.unwrap();
|
||||
let tc = new_test_connection().await.unwrap();
|
||||
let db = tc.connection;
|
||||
|
||||
assert_eq!(db.table_names().execute().await.unwrap().len(), 0);
|
||||
// open non-exist table
|
||||
@@ -1281,4 +1464,50 @@ mod tests {
|
||||
.unwrap();
|
||||
assert_eq!(other_schema, overwritten.schema().await.unwrap());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_clone_table() {
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let uri = tmp_dir.path().to_str().unwrap();
|
||||
let db = connect(uri).execute().await.unwrap();
|
||||
|
||||
// Create a source table with some data
|
||||
let mut batch_gen = BatchGenerator::new()
|
||||
.col(Box::new(IncrementingInt32::new().named("id")))
|
||||
.col(Box::new(IncrementingInt32::new().named("value")));
|
||||
let reader = batch_gen.batches(5, 100);
|
||||
|
||||
let source_table = db
|
||||
.create_table("source_table", reader)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Get the source table URI
|
||||
let source_table_path = tmp_dir.path().join("source_table.lance");
|
||||
let source_uri = source_table_path.to_str().unwrap();
|
||||
|
||||
// Clone the table
|
||||
let cloned_table = db
|
||||
.clone_table("cloned_table", source_uri)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Verify the cloned table exists
|
||||
let table_names = db.table_names().execute().await.unwrap();
|
||||
assert!(table_names.contains(&"source_table".to_string()));
|
||||
assert!(table_names.contains(&"cloned_table".to_string()));
|
||||
|
||||
// Verify the cloned table has the same schema
|
||||
assert_eq!(
|
||||
source_table.schema().await.unwrap(),
|
||||
cloned_table.schema().await.unwrap()
|
||||
);
|
||||
|
||||
// Verify the cloned table has the same data
|
||||
let source_count = source_table.count_rows(None).await.unwrap();
|
||||
let cloned_count = cloned_table.count_rows(None).await.unwrap();
|
||||
assert_eq!(source_count, cloned_count);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -29,6 +29,7 @@ use crate::error::Result;
|
||||
use crate::table::{BaseTable, TableDefinition, WriteOptions};
|
||||
|
||||
pub mod listing;
|
||||
pub mod namespace;
|
||||
|
||||
pub trait DatabaseOptions {
|
||||
fn serialize_into_map(&self, map: &mut HashMap<String, String>);
|
||||
@@ -176,6 +177,42 @@ impl CreateTableRequest {
|
||||
}
|
||||
}
|
||||
|
||||
/// Request to clone a table from a source table.
|
||||
///
|
||||
/// A shallow clone creates a new table that shares the underlying data files
|
||||
/// with the source table but has its own independent manifest. This allows
|
||||
/// both the source and cloned tables to evolve independently while initially
|
||||
/// sharing the same data, deletion, and index files.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct CloneTableRequest {
|
||||
/// The name of the target table to create
|
||||
pub target_table_name: String,
|
||||
/// The namespace for the target table. Empty list represents root namespace.
|
||||
pub target_namespace: Vec<String>,
|
||||
/// The URI of the source table to clone from.
|
||||
pub source_uri: String,
|
||||
/// Optional version of the source table to clone.
|
||||
pub source_version: Option<u64>,
|
||||
/// Optional tag of the source table to clone.
|
||||
pub source_tag: Option<String>,
|
||||
/// Whether to perform a shallow clone (true) or deep clone (false). Defaults to true.
|
||||
/// Currently only shallow clone is supported.
|
||||
pub is_shallow: bool,
|
||||
}
|
||||
|
||||
impl CloneTableRequest {
|
||||
pub fn new(target_table_name: String, source_uri: String) -> Self {
|
||||
Self {
|
||||
target_table_name,
|
||||
target_namespace: vec![],
|
||||
source_uri,
|
||||
source_version: None,
|
||||
source_tag: None,
|
||||
is_shallow: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The `Database` trait defines the interface for database implementations.
|
||||
///
|
||||
/// A database is responsible for managing tables and their metadata.
|
||||
@@ -193,6 +230,13 @@ pub trait Database:
|
||||
async fn table_names(&self, request: TableNamesRequest) -> Result<Vec<String>>;
|
||||
/// Create a table in the database
|
||||
async fn create_table(&self, request: CreateTableRequest) -> Result<Arc<dyn BaseTable>>;
|
||||
/// Clone a table in the database.
|
||||
///
|
||||
/// Creates a shallow clone of the source table, sharing underlying data files
|
||||
/// but with an independent manifest. Both tables can evolve separately after cloning.
|
||||
///
|
||||
/// See [`CloneTableRequest`] for detailed documentation and examples.
|
||||
async fn clone_table(&self, request: CloneTableRequest) -> Result<Arc<dyn BaseTable>>;
|
||||
/// Open a table in the database
|
||||
async fn open_table(&self, request: OpenTableRequest) -> Result<Arc<dyn BaseTable>>;
|
||||
/// Rename a table in the database
|
||||
|
||||
@@ -7,7 +7,8 @@ use std::fs::create_dir_all;
|
||||
use std::path::Path;
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use lance::dataset::{ReadParams, WriteMode};
|
||||
use lance::dataset::refs::Ref;
|
||||
use lance::dataset::{builder::DatasetBuilder, ReadParams, WriteMode};
|
||||
use lance::io::{ObjectStore, ObjectStoreParams, WrappingObjectStore};
|
||||
use lance_datafusion::utils::StreamingWriteSource;
|
||||
use lance_encoding::version::LanceFileVersion;
|
||||
@@ -22,8 +23,8 @@ use crate::table::NativeTable;
|
||||
use crate::utils::validate_table_name;
|
||||
|
||||
use super::{
|
||||
BaseTable, CreateNamespaceRequest, CreateTableMode, CreateTableRequest, Database,
|
||||
DatabaseOptions, DropNamespaceRequest, ListNamespacesRequest, OpenTableRequest,
|
||||
BaseTable, CloneTableRequest, CreateNamespaceRequest, CreateTableMode, CreateTableRequest,
|
||||
Database, DatabaseOptions, DropNamespaceRequest, ListNamespacesRequest, OpenTableRequest,
|
||||
TableNamesRequest,
|
||||
};
|
||||
|
||||
@@ -684,6 +685,65 @@ impl Database for ListingDatabase {
|
||||
}
|
||||
}
|
||||
|
||||
async fn clone_table(&self, request: CloneTableRequest) -> Result<Arc<dyn BaseTable>> {
|
||||
if !request.target_namespace.is_empty() {
|
||||
return Err(Error::NotSupported {
|
||||
message: "Namespace parameter is not supported for listing database. Only root namespace is supported.".into(),
|
||||
});
|
||||
}
|
||||
|
||||
// TODO: support deep clone
|
||||
if !request.is_shallow {
|
||||
return Err(Error::NotSupported {
|
||||
message: "Deep clone is not yet implemented".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
validate_table_name(&request.target_table_name)?;
|
||||
|
||||
let storage_params = ObjectStoreParams {
|
||||
storage_options: Some(self.storage_options.clone()),
|
||||
..Default::default()
|
||||
};
|
||||
let read_params = ReadParams {
|
||||
store_options: Some(storage_params.clone()),
|
||||
session: Some(self.session.clone()),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let mut source_dataset = DatasetBuilder::from_uri(&request.source_uri)
|
||||
.with_read_params(read_params.clone())
|
||||
.load()
|
||||
.await
|
||||
.map_err(|e| Error::Lance { source: e })?;
|
||||
|
||||
let version_ref = match (request.source_version, request.source_tag) {
|
||||
(Some(v), None) => Ok(Ref::Version(v)),
|
||||
(None, Some(tag)) => Ok(Ref::Tag(tag)),
|
||||
(None, None) => Ok(Ref::Version(source_dataset.version().version)),
|
||||
_ => Err(Error::InvalidInput {
|
||||
message: "Cannot specify both source_version and source_tag".to_string(),
|
||||
}),
|
||||
}?;
|
||||
|
||||
let target_uri = self.table_uri(&request.target_table_name)?;
|
||||
source_dataset
|
||||
.shallow_clone(&target_uri, version_ref, Some(storage_params))
|
||||
.await
|
||||
.map_err(|e| Error::Lance { source: e })?;
|
||||
|
||||
let cloned_table = NativeTable::open_with_params(
|
||||
&target_uri,
|
||||
&request.target_table_name,
|
||||
self.store_wrapper.clone(),
|
||||
None,
|
||||
self.read_consistency_interval,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(Arc::new(cloned_table))
|
||||
}
|
||||
|
||||
async fn open_table(&self, mut request: OpenTableRequest) -> Result<Arc<dyn BaseTable>> {
|
||||
if !request.namespace.is_empty() {
|
||||
return Err(Error::NotSupported {
|
||||
@@ -785,3 +845,694 @@ impl Database for ListingDatabase {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::connection::ConnectRequest;
|
||||
use crate::database::{CreateTableData, CreateTableMode, CreateTableRequest};
|
||||
use crate::table::{Table, TableDefinition};
|
||||
use arrow_array::{Int32Array, RecordBatch, StringArray};
|
||||
use arrow_schema::{DataType, Field, Schema};
|
||||
use tempfile::tempdir;
|
||||
|
||||
async fn setup_database() -> (tempfile::TempDir, ListingDatabase) {
|
||||
let tempdir = tempdir().unwrap();
|
||||
let uri = tempdir.path().to_str().unwrap();
|
||||
|
||||
let request = ConnectRequest {
|
||||
uri: uri.to_string(),
|
||||
#[cfg(feature = "remote")]
|
||||
client_config: Default::default(),
|
||||
options: Default::default(),
|
||||
read_consistency_interval: None,
|
||||
session: None,
|
||||
};
|
||||
|
||||
let db = ListingDatabase::connect_with_options(&request)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
(tempdir, db)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_clone_table_basic() {
|
||||
let (_tempdir, db) = setup_database().await;
|
||||
|
||||
// Create a source table with schema
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int32, false),
|
||||
Field::new("name", DataType::Utf8, false),
|
||||
]));
|
||||
|
||||
let source_table = db
|
||||
.create_table(CreateTableRequest {
|
||||
name: "source_table".to_string(),
|
||||
namespace: vec![],
|
||||
data: CreateTableData::Empty(TableDefinition::new_from_schema(schema.clone())),
|
||||
mode: CreateTableMode::Create,
|
||||
write_options: Default::default(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Get the source table URI
|
||||
let source_uri = db.table_uri("source_table").unwrap();
|
||||
|
||||
// Clone the table
|
||||
let cloned_table = db
|
||||
.clone_table(CloneTableRequest {
|
||||
target_table_name: "cloned_table".to_string(),
|
||||
target_namespace: vec![],
|
||||
source_uri: source_uri.clone(),
|
||||
source_version: None,
|
||||
source_tag: None,
|
||||
is_shallow: true,
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Verify both tables exist
|
||||
let table_names = db.table_names(TableNamesRequest::default()).await.unwrap();
|
||||
assert!(table_names.contains(&"source_table".to_string()));
|
||||
assert!(table_names.contains(&"cloned_table".to_string()));
|
||||
|
||||
// Verify schemas match
|
||||
assert_eq!(
|
||||
source_table.schema().await.unwrap(),
|
||||
cloned_table.schema().await.unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_clone_table_with_data() {
|
||||
let (_tempdir, db) = setup_database().await;
|
||||
|
||||
// Create a source table with actual data
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int32, false),
|
||||
Field::new("name", DataType::Utf8, false),
|
||||
]));
|
||||
|
||||
let batch = RecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![
|
||||
Arc::new(Int32Array::from(vec![1, 2, 3])),
|
||||
Arc::new(StringArray::from(vec!["a", "b", "c"])),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let reader = Box::new(arrow_array::RecordBatchIterator::new(
|
||||
vec![Ok(batch)],
|
||||
schema.clone(),
|
||||
));
|
||||
|
||||
let source_table = db
|
||||
.create_table(CreateTableRequest {
|
||||
name: "source_with_data".to_string(),
|
||||
namespace: vec![],
|
||||
data: CreateTableData::Data(reader),
|
||||
mode: CreateTableMode::Create,
|
||||
write_options: Default::default(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let source_uri = db.table_uri("source_with_data").unwrap();
|
||||
|
||||
// Clone the table
|
||||
let cloned_table = db
|
||||
.clone_table(CloneTableRequest {
|
||||
target_table_name: "cloned_with_data".to_string(),
|
||||
target_namespace: vec![],
|
||||
source_uri,
|
||||
source_version: None,
|
||||
source_tag: None,
|
||||
is_shallow: true,
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Verify data counts match
|
||||
let source_count = source_table.count_rows(None).await.unwrap();
|
||||
let cloned_count = cloned_table.count_rows(None).await.unwrap();
|
||||
assert_eq!(source_count, cloned_count);
|
||||
assert_eq!(source_count, 3);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_clone_table_with_storage_options() {
|
||||
let tempdir = tempdir().unwrap();
|
||||
let uri = tempdir.path().to_str().unwrap();
|
||||
|
||||
// Create database with storage options
|
||||
let mut options = HashMap::new();
|
||||
options.insert("test_option".to_string(), "test_value".to_string());
|
||||
|
||||
let request = ConnectRequest {
|
||||
uri: uri.to_string(),
|
||||
#[cfg(feature = "remote")]
|
||||
client_config: Default::default(),
|
||||
options: options.clone(),
|
||||
read_consistency_interval: None,
|
||||
session: None,
|
||||
};
|
||||
|
||||
let db = ListingDatabase::connect_with_options(&request)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Create source table
|
||||
let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
|
||||
|
||||
db.create_table(CreateTableRequest {
|
||||
name: "source".to_string(),
|
||||
namespace: vec![],
|
||||
data: CreateTableData::Empty(TableDefinition::new_from_schema(schema)),
|
||||
mode: CreateTableMode::Create,
|
||||
write_options: Default::default(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let source_uri = db.table_uri("source").unwrap();
|
||||
|
||||
// Clone should work with storage options
|
||||
let cloned = db
|
||||
.clone_table(CloneTableRequest {
|
||||
target_table_name: "cloned".to_string(),
|
||||
target_namespace: vec![],
|
||||
source_uri,
|
||||
source_version: None,
|
||||
source_tag: None,
|
||||
is_shallow: true,
|
||||
})
|
||||
.await;
|
||||
|
||||
assert!(cloned.is_ok());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_clone_table_deep_not_supported() {
|
||||
let (_tempdir, db) = setup_database().await;
|
||||
|
||||
// Create a source table
|
||||
let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
|
||||
|
||||
db.create_table(CreateTableRequest {
|
||||
name: "source".to_string(),
|
||||
namespace: vec![],
|
||||
data: CreateTableData::Empty(TableDefinition::new_from_schema(schema)),
|
||||
mode: CreateTableMode::Create,
|
||||
write_options: Default::default(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let source_uri = db.table_uri("source").unwrap();
|
||||
|
||||
// Try deep clone (should fail)
|
||||
let result = db
|
||||
.clone_table(CloneTableRequest {
|
||||
target_table_name: "cloned".to_string(),
|
||||
target_namespace: vec![],
|
||||
source_uri,
|
||||
source_version: None,
|
||||
source_tag: None,
|
||||
is_shallow: false, // Request deep clone
|
||||
})
|
||||
.await;
|
||||
|
||||
assert!(result.is_err());
|
||||
assert!(matches!(
|
||||
result.unwrap_err(),
|
||||
Error::NotSupported { message } if message.contains("Deep clone")
|
||||
));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_clone_table_with_namespace_not_supported() {
|
||||
let (_tempdir, db) = setup_database().await;
|
||||
|
||||
// Create a source table
|
||||
let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
|
||||
|
||||
db.create_table(CreateTableRequest {
|
||||
name: "source".to_string(),
|
||||
namespace: vec![],
|
||||
data: CreateTableData::Empty(TableDefinition::new_from_schema(schema)),
|
||||
mode: CreateTableMode::Create,
|
||||
write_options: Default::default(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let source_uri = db.table_uri("source").unwrap();
|
||||
|
||||
// Try clone with namespace (should fail for listing database)
|
||||
let result = db
|
||||
.clone_table(CloneTableRequest {
|
||||
target_table_name: "cloned".to_string(),
|
||||
target_namespace: vec!["namespace".to_string()], // Non-empty namespace
|
||||
source_uri,
|
||||
source_version: None,
|
||||
source_tag: None,
|
||||
is_shallow: true,
|
||||
})
|
||||
.await;
|
||||
|
||||
assert!(result.is_err());
|
||||
assert!(matches!(
|
||||
result.unwrap_err(),
|
||||
Error::NotSupported { message } if message.contains("Namespace parameter is not supported")
|
||||
));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_clone_table_invalid_target_name() {
|
||||
let (_tempdir, db) = setup_database().await;
|
||||
|
||||
// Create a source table
|
||||
let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
|
||||
|
||||
db.create_table(CreateTableRequest {
|
||||
name: "source".to_string(),
|
||||
namespace: vec![],
|
||||
data: CreateTableData::Empty(TableDefinition::new_from_schema(schema)),
|
||||
mode: CreateTableMode::Create,
|
||||
write_options: Default::default(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let source_uri = db.table_uri("source").unwrap();
|
||||
|
||||
// Try clone with invalid target name
|
||||
let result = db
|
||||
.clone_table(CloneTableRequest {
|
||||
target_table_name: "invalid/name".to_string(), // Invalid name with slash
|
||||
target_namespace: vec![],
|
||||
source_uri,
|
||||
source_version: None,
|
||||
source_tag: None,
|
||||
is_shallow: true,
|
||||
})
|
||||
.await;
|
||||
|
||||
assert!(result.is_err());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_clone_table_source_not_found() {
|
||||
let (_tempdir, db) = setup_database().await;
|
||||
|
||||
// Try to clone from non-existent source
|
||||
let result = db
|
||||
.clone_table(CloneTableRequest {
|
||||
target_table_name: "cloned".to_string(),
|
||||
target_namespace: vec![],
|
||||
source_uri: "/nonexistent/table.lance".to_string(),
|
||||
source_version: None,
|
||||
source_tag: None,
|
||||
is_shallow: true,
|
||||
})
|
||||
.await;
|
||||
|
||||
assert!(result.is_err());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_clone_table_with_version_and_tag_error() {
|
||||
let (_tempdir, db) = setup_database().await;
|
||||
|
||||
// Create a source table
|
||||
let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
|
||||
|
||||
db.create_table(CreateTableRequest {
|
||||
name: "source".to_string(),
|
||||
namespace: vec![],
|
||||
data: CreateTableData::Empty(TableDefinition::new_from_schema(schema)),
|
||||
mode: CreateTableMode::Create,
|
||||
write_options: Default::default(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let source_uri = db.table_uri("source").unwrap();
|
||||
|
||||
// Try clone with both version and tag (should fail)
|
||||
let result = db
|
||||
.clone_table(CloneTableRequest {
|
||||
target_table_name: "cloned".to_string(),
|
||||
target_namespace: vec![],
|
||||
source_uri,
|
||||
source_version: Some(1),
|
||||
source_tag: Some("v1.0".to_string()),
|
||||
is_shallow: true,
|
||||
})
|
||||
.await;
|
||||
|
||||
assert!(result.is_err());
|
||||
assert!(matches!(
|
||||
result.unwrap_err(),
|
||||
Error::InvalidInput { message } if message.contains("Cannot specify both source_version and source_tag")
|
||||
));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_clone_table_with_specific_version() {
|
||||
let (_tempdir, db) = setup_database().await;
|
||||
|
||||
// Create a source table with initial data
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int32, false),
|
||||
Field::new("value", DataType::Utf8, false),
|
||||
]));
|
||||
|
||||
let batch1 = RecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![
|
||||
Arc::new(Int32Array::from(vec![1, 2])),
|
||||
Arc::new(StringArray::from(vec!["a", "b"])),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let reader = Box::new(arrow_array::RecordBatchIterator::new(
|
||||
vec![Ok(batch1)],
|
||||
schema.clone(),
|
||||
));
|
||||
|
||||
let source_table = db
|
||||
.create_table(CreateTableRequest {
|
||||
name: "versioned_source".to_string(),
|
||||
namespace: vec![],
|
||||
data: CreateTableData::Data(reader),
|
||||
mode: CreateTableMode::Create,
|
||||
write_options: Default::default(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Get the initial version
|
||||
let initial_version = source_table.version().await.unwrap();
|
||||
|
||||
// Add more data to create a new version
|
||||
let batch2 = RecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![
|
||||
Arc::new(Int32Array::from(vec![3, 4])),
|
||||
Arc::new(StringArray::from(vec!["c", "d"])),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let source_table_obj = Table::new(source_table.clone());
|
||||
source_table_obj
|
||||
.add(Box::new(arrow_array::RecordBatchIterator::new(
|
||||
vec![Ok(batch2)],
|
||||
schema.clone(),
|
||||
)))
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Verify source table now has 4 rows
|
||||
assert_eq!(source_table.count_rows(None).await.unwrap(), 4);
|
||||
|
||||
let source_uri = db.table_uri("versioned_source").unwrap();
|
||||
|
||||
// Clone from the initial version (should have only 2 rows)
|
||||
let cloned_table = db
|
||||
.clone_table(CloneTableRequest {
|
||||
target_table_name: "cloned_from_version".to_string(),
|
||||
target_namespace: vec![],
|
||||
source_uri,
|
||||
source_version: Some(initial_version),
|
||||
source_tag: None,
|
||||
is_shallow: true,
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Verify cloned table has only the initial 2 rows
|
||||
assert_eq!(cloned_table.count_rows(None).await.unwrap(), 2);
|
||||
|
||||
// Source table should still have 4 rows
|
||||
assert_eq!(source_table.count_rows(None).await.unwrap(), 4);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_clone_table_with_tag() {
|
||||
let (_tempdir, db) = setup_database().await;
|
||||
|
||||
// Create a source table with initial data
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int32, false),
|
||||
Field::new("value", DataType::Utf8, false),
|
||||
]));
|
||||
|
||||
let batch1 = RecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![
|
||||
Arc::new(Int32Array::from(vec![1, 2])),
|
||||
Arc::new(StringArray::from(vec!["a", "b"])),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let reader = Box::new(arrow_array::RecordBatchIterator::new(
|
||||
vec![Ok(batch1)],
|
||||
schema.clone(),
|
||||
));
|
||||
|
||||
let source_table = db
|
||||
.create_table(CreateTableRequest {
|
||||
name: "tagged_source".to_string(),
|
||||
namespace: vec![],
|
||||
data: CreateTableData::Data(reader),
|
||||
mode: CreateTableMode::Create,
|
||||
write_options: Default::default(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Create a tag for the current version
|
||||
let source_table_obj = Table::new(source_table.clone());
|
||||
let mut tags = source_table_obj.tags().await.unwrap();
|
||||
tags.create("v1.0", source_table.version().await.unwrap())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Add more data after the tag
|
||||
let batch2 = RecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![
|
||||
Arc::new(Int32Array::from(vec![3, 4])),
|
||||
Arc::new(StringArray::from(vec!["c", "d"])),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let source_table_obj = Table::new(source_table.clone());
|
||||
source_table_obj
|
||||
.add(Box::new(arrow_array::RecordBatchIterator::new(
|
||||
vec![Ok(batch2)],
|
||||
schema.clone(),
|
||||
)))
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Source table should have 4 rows
|
||||
assert_eq!(source_table.count_rows(None).await.unwrap(), 4);
|
||||
|
||||
let source_uri = db.table_uri("tagged_source").unwrap();
|
||||
|
||||
// Clone from the tag (should have only 2 rows)
|
||||
let cloned_table = db
|
||||
.clone_table(CloneTableRequest {
|
||||
target_table_name: "cloned_from_tag".to_string(),
|
||||
target_namespace: vec![],
|
||||
source_uri,
|
||||
source_version: None,
|
||||
source_tag: Some("v1.0".to_string()),
|
||||
is_shallow: true,
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Verify cloned table has only the tagged version's 2 rows
|
||||
assert_eq!(cloned_table.count_rows(None).await.unwrap(), 2);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_cloned_tables_evolve_independently() {
|
||||
let (_tempdir, db) = setup_database().await;
|
||||
|
||||
// Create a source table with initial data
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int32, false),
|
||||
Field::new("value", DataType::Utf8, false),
|
||||
]));
|
||||
|
||||
let batch1 = RecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![
|
||||
Arc::new(Int32Array::from(vec![1, 2])),
|
||||
Arc::new(StringArray::from(vec!["a", "b"])),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let reader = Box::new(arrow_array::RecordBatchIterator::new(
|
||||
vec![Ok(batch1)],
|
||||
schema.clone(),
|
||||
));
|
||||
|
||||
let source_table = db
|
||||
.create_table(CreateTableRequest {
|
||||
name: "independent_source".to_string(),
|
||||
namespace: vec![],
|
||||
data: CreateTableData::Data(reader),
|
||||
mode: CreateTableMode::Create,
|
||||
write_options: Default::default(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let source_uri = db.table_uri("independent_source").unwrap();
|
||||
|
||||
// Clone the table
|
||||
let cloned_table = db
|
||||
.clone_table(CloneTableRequest {
|
||||
target_table_name: "independent_clone".to_string(),
|
||||
target_namespace: vec![],
|
||||
source_uri,
|
||||
source_version: None,
|
||||
source_tag: None,
|
||||
is_shallow: true,
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Both should start with 2 rows
|
||||
assert_eq!(source_table.count_rows(None).await.unwrap(), 2);
|
||||
assert_eq!(cloned_table.count_rows(None).await.unwrap(), 2);
|
||||
|
||||
// Add data to the cloned table
|
||||
let batch_clone = RecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![
|
||||
Arc::new(Int32Array::from(vec![3, 4, 5])),
|
||||
Arc::new(StringArray::from(vec!["c", "d", "e"])),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let cloned_table_obj = Table::new(cloned_table.clone());
|
||||
cloned_table_obj
|
||||
.add(Box::new(arrow_array::RecordBatchIterator::new(
|
||||
vec![Ok(batch_clone)],
|
||||
schema.clone(),
|
||||
)))
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Add different data to the source table
|
||||
let batch_source = RecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![
|
||||
Arc::new(Int32Array::from(vec![10, 11])),
|
||||
Arc::new(StringArray::from(vec!["x", "y"])),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let source_table_obj = Table::new(source_table.clone());
|
||||
source_table_obj
|
||||
.add(Box::new(arrow_array::RecordBatchIterator::new(
|
||||
vec![Ok(batch_source)],
|
||||
schema.clone(),
|
||||
)))
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Verify they have evolved independently
|
||||
assert_eq!(source_table.count_rows(None).await.unwrap(), 4); // 2 + 2
|
||||
assert_eq!(cloned_table.count_rows(None).await.unwrap(), 5); // 2 + 3
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_clone_latest_version() {
|
||||
let (_tempdir, db) = setup_database().await;
|
||||
|
||||
// Create a source table with initial data
|
||||
let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
|
||||
|
||||
let batch1 =
|
||||
RecordBatch::try_new(schema.clone(), vec![Arc::new(Int32Array::from(vec![1, 2]))])
|
||||
.unwrap();
|
||||
|
||||
let reader = Box::new(arrow_array::RecordBatchIterator::new(
|
||||
vec![Ok(batch1)],
|
||||
schema.clone(),
|
||||
));
|
||||
|
||||
let source_table = db
|
||||
.create_table(CreateTableRequest {
|
||||
name: "latest_version_source".to_string(),
|
||||
namespace: vec![],
|
||||
data: CreateTableData::Data(reader),
|
||||
mode: CreateTableMode::Create,
|
||||
write_options: Default::default(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Add more data to create new versions
|
||||
for i in 0..3 {
|
||||
let batch = RecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![Arc::new(Int32Array::from(vec![i * 10, i * 10 + 1]))],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let source_table_obj = Table::new(source_table.clone());
|
||||
source_table_obj
|
||||
.add(Box::new(arrow_array::RecordBatchIterator::new(
|
||||
vec![Ok(batch)],
|
||||
schema.clone(),
|
||||
)))
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
// Source should have 8 rows total (2 + 2 + 2 + 2)
|
||||
let source_count = source_table.count_rows(None).await.unwrap();
|
||||
assert_eq!(source_count, 8);
|
||||
|
||||
let source_uri = db.table_uri("latest_version_source").unwrap();
|
||||
|
||||
// Clone without specifying version or tag (should get latest)
|
||||
let cloned_table = db
|
||||
.clone_table(CloneTableRequest {
|
||||
target_table_name: "cloned_latest".to_string(),
|
||||
target_namespace: vec![],
|
||||
source_uri,
|
||||
source_version: None,
|
||||
source_tag: None,
|
||||
is_shallow: true,
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Cloned table should have all 8 rows from the latest version
|
||||
assert_eq!(cloned_table.count_rows(None).await.unwrap(), 8);
|
||||
}
|
||||
}
|
||||
|
||||
840
rust/lancedb/src/database/namespace.rs
Normal file
840
rust/lancedb/src/database/namespace.rs
Normal file
@@ -0,0 +1,840 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
//! Namespace-based database implementation that delegates table management to lance-namespace
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use lance_namespace::{
|
||||
connect as connect_namespace,
|
||||
models::{
|
||||
CreateEmptyTableRequest, CreateNamespaceRequest, DescribeTableRequest,
|
||||
DropNamespaceRequest, DropTableRequest, ListNamespacesRequest, ListTablesRequest,
|
||||
},
|
||||
LanceNamespace,
|
||||
};
|
||||
|
||||
use crate::connection::ConnectRequest;
|
||||
use crate::database::listing::ListingDatabase;
|
||||
use crate::error::{Error, Result};
|
||||
|
||||
use super::{
|
||||
BaseTable, CloneTableRequest, CreateNamespaceRequest as DbCreateNamespaceRequest,
|
||||
CreateTableMode, CreateTableRequest as DbCreateTableRequest, Database,
|
||||
DropNamespaceRequest as DbDropNamespaceRequest,
|
||||
ListNamespacesRequest as DbListNamespacesRequest, OpenTableRequest, TableNamesRequest,
|
||||
};
|
||||
|
||||
/// A database implementation that uses lance-namespace for table management
|
||||
pub struct LanceNamespaceDatabase {
|
||||
namespace: Arc<dyn LanceNamespace>,
|
||||
// Storage options to be inherited by tables
|
||||
storage_options: HashMap<String, String>,
|
||||
// Read consistency interval for tables
|
||||
read_consistency_interval: Option<std::time::Duration>,
|
||||
// Optional session for object stores and caching
|
||||
session: Option<Arc<lance::session::Session>>,
|
||||
}
|
||||
|
||||
impl LanceNamespaceDatabase {
|
||||
pub async fn connect(
|
||||
ns_impl: &str,
|
||||
ns_properties: HashMap<String, String>,
|
||||
storage_options: HashMap<String, String>,
|
||||
read_consistency_interval: Option<std::time::Duration>,
|
||||
session: Option<Arc<lance::session::Session>>,
|
||||
) -> Result<Self> {
|
||||
let namespace = connect_namespace(ns_impl, ns_properties.clone())
|
||||
.await
|
||||
.map_err(|e| Error::InvalidInput {
|
||||
message: format!("Failed to connect to namespace: {:?}", e),
|
||||
})?;
|
||||
|
||||
Ok(Self {
|
||||
namespace,
|
||||
storage_options,
|
||||
read_consistency_interval,
|
||||
session,
|
||||
})
|
||||
}
|
||||
|
||||
/// Helper method to create a ListingDatabase from a table location
|
||||
///
|
||||
/// This method:
|
||||
/// 1. Validates that the location ends with <table_name>.lance
|
||||
/// 2. Extracts the parent directory from the location
|
||||
/// 3. Creates a ListingDatabase at that parent directory
|
||||
async fn create_listing_database(
|
||||
&self,
|
||||
table_name: &str,
|
||||
location: &str,
|
||||
additional_storage_options: Option<HashMap<String, String>>,
|
||||
) -> Result<Arc<ListingDatabase>> {
|
||||
let expected_suffix = format!("{}.lance", table_name);
|
||||
if !location.ends_with(&expected_suffix) {
|
||||
return Err(Error::Runtime {
|
||||
message: format!(
|
||||
"Invalid table location '{}': expected to end with '{}'",
|
||||
location, expected_suffix
|
||||
),
|
||||
});
|
||||
}
|
||||
|
||||
let parent_dir = location
|
||||
.rsplit_once('/')
|
||||
.map(|(parent, _)| parent.to_string())
|
||||
.ok_or_else(|| Error::Runtime {
|
||||
message: format!("Invalid table location '{}': no parent directory", location),
|
||||
})?;
|
||||
|
||||
let mut merged_storage_options = self.storage_options.clone();
|
||||
if let Some(opts) = additional_storage_options {
|
||||
merged_storage_options.extend(opts);
|
||||
}
|
||||
|
||||
let connect_request = ConnectRequest {
|
||||
uri: parent_dir,
|
||||
options: merged_storage_options,
|
||||
read_consistency_interval: self.read_consistency_interval,
|
||||
session: self.session.clone(),
|
||||
#[cfg(feature = "remote")]
|
||||
client_config: Default::default(),
|
||||
};
|
||||
|
||||
let listing_db = ListingDatabase::connect_with_options(&connect_request)
|
||||
.await
|
||||
.map_err(|e| Error::Runtime {
|
||||
message: format!("Failed to create listing database: {}", e),
|
||||
})?;
|
||||
|
||||
Ok(Arc::new(listing_db))
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for LanceNamespaceDatabase {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("LanceNamespaceDatabase")
|
||||
.field("storage_options", &self.storage_options)
|
||||
.field("read_consistency_interval", &self.read_consistency_interval)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for LanceNamespaceDatabase {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "LanceNamespaceDatabase")
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Database for LanceNamespaceDatabase {
|
||||
async fn list_namespaces(&self, request: DbListNamespacesRequest) -> Result<Vec<String>> {
|
||||
let ns_request = ListNamespacesRequest {
|
||||
id: if request.namespace.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(request.namespace)
|
||||
},
|
||||
page_token: request.page_token,
|
||||
limit: request.limit.map(|l| l as i32),
|
||||
};
|
||||
|
||||
let response = self
|
||||
.namespace
|
||||
.list_namespaces(ns_request)
|
||||
.await
|
||||
.map_err(|e| Error::Runtime {
|
||||
message: format!("Failed to list namespaces: {}", e),
|
||||
})?;
|
||||
|
||||
Ok(response.namespaces)
|
||||
}
|
||||
|
||||
async fn create_namespace(&self, request: DbCreateNamespaceRequest) -> Result<()> {
|
||||
let ns_request = CreateNamespaceRequest {
|
||||
id: if request.namespace.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(request.namespace)
|
||||
},
|
||||
mode: None,
|
||||
properties: None,
|
||||
};
|
||||
|
||||
self.namespace
|
||||
.create_namespace(ns_request)
|
||||
.await
|
||||
.map_err(|e| Error::Runtime {
|
||||
message: format!("Failed to create namespace: {}", e),
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn drop_namespace(&self, request: DbDropNamespaceRequest) -> Result<()> {
|
||||
let ns_request = DropNamespaceRequest {
|
||||
id: if request.namespace.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(request.namespace)
|
||||
},
|
||||
mode: None,
|
||||
behavior: None,
|
||||
};
|
||||
|
||||
self.namespace
|
||||
.drop_namespace(ns_request)
|
||||
.await
|
||||
.map_err(|e| Error::Runtime {
|
||||
message: format!("Failed to drop namespace: {}", e),
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn table_names(&self, request: TableNamesRequest) -> Result<Vec<String>> {
|
||||
let ns_request = ListTablesRequest {
|
||||
id: if request.namespace.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(request.namespace)
|
||||
},
|
||||
page_token: request.start_after,
|
||||
limit: request.limit.map(|l| l as i32),
|
||||
};
|
||||
|
||||
let response =
|
||||
self.namespace
|
||||
.list_tables(ns_request)
|
||||
.await
|
||||
.map_err(|e| Error::Runtime {
|
||||
message: format!("Failed to list tables: {}", e),
|
||||
})?;
|
||||
|
||||
Ok(response.tables)
|
||||
}
|
||||
|
||||
async fn create_table(&self, request: DbCreateTableRequest) -> Result<Arc<dyn BaseTable>> {
|
||||
let mut table_id = request.namespace.clone();
|
||||
table_id.push(request.name.clone());
|
||||
let describe_request = DescribeTableRequest {
|
||||
id: Some(table_id.clone()),
|
||||
version: None,
|
||||
};
|
||||
|
||||
let describe_result = self.namespace.describe_table(describe_request).await;
|
||||
|
||||
match request.mode {
|
||||
CreateTableMode::Create => {
|
||||
if describe_result.is_ok() {
|
||||
return Err(Error::TableAlreadyExists {
|
||||
name: request.name.clone(),
|
||||
});
|
||||
}
|
||||
}
|
||||
CreateTableMode::Overwrite => {
|
||||
if describe_result.is_ok() {
|
||||
// Drop the existing table - must succeed
|
||||
let drop_request = DropTableRequest {
|
||||
id: Some(table_id.clone()),
|
||||
};
|
||||
self.namespace
|
||||
.drop_table(drop_request)
|
||||
.await
|
||||
.map_err(|e| Error::Runtime {
|
||||
message: format!("Failed to drop existing table for overwrite: {}", e),
|
||||
})?;
|
||||
}
|
||||
}
|
||||
CreateTableMode::ExistOk(_) => {
|
||||
if let Ok(response) = describe_result {
|
||||
let location = response.location.ok_or_else(|| Error::Runtime {
|
||||
message: "Table location is missing from namespace response".to_string(),
|
||||
})?;
|
||||
|
||||
let listing_db = self
|
||||
.create_listing_database(&request.name, &location, response.storage_options)
|
||||
.await?;
|
||||
|
||||
return listing_db
|
||||
.open_table(OpenTableRequest {
|
||||
name: request.name.clone(),
|
||||
namespace: request.namespace.clone(),
|
||||
index_cache_size: None,
|
||||
lance_read_params: None,
|
||||
})
|
||||
.await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut table_id = request.namespace.clone();
|
||||
table_id.push(request.name.clone());
|
||||
|
||||
let create_empty_request = CreateEmptyTableRequest {
|
||||
id: Some(table_id),
|
||||
location: None,
|
||||
properties: if self.storage_options.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(self.storage_options.clone())
|
||||
},
|
||||
};
|
||||
|
||||
let create_empty_response = self
|
||||
.namespace
|
||||
.create_empty_table(create_empty_request)
|
||||
.await
|
||||
.map_err(|e| Error::Runtime {
|
||||
message: format!("Failed to create empty table: {}", e),
|
||||
})?;
|
||||
|
||||
let location = create_empty_response
|
||||
.location
|
||||
.ok_or_else(|| Error::Runtime {
|
||||
message: "Table location is missing from create_empty_table response".to_string(),
|
||||
})?;
|
||||
|
||||
let listing_db = self
|
||||
.create_listing_database(
|
||||
&request.name,
|
||||
&location,
|
||||
create_empty_response.storage_options,
|
||||
)
|
||||
.await?;
|
||||
|
||||
listing_db.create_table(request).await
|
||||
}
|
||||
|
||||
async fn open_table(&self, request: OpenTableRequest) -> Result<Arc<dyn BaseTable>> {
|
||||
let mut table_id = request.namespace.clone();
|
||||
table_id.push(request.name.clone());
|
||||
|
||||
let describe_request = DescribeTableRequest {
|
||||
id: Some(table_id),
|
||||
version: None,
|
||||
};
|
||||
let response = self
|
||||
.namespace
|
||||
.describe_table(describe_request)
|
||||
.await
|
||||
.map_err(|e| Error::Runtime {
|
||||
message: format!("Failed to describe table: {}", e),
|
||||
})?;
|
||||
|
||||
let location = response.location.ok_or_else(|| Error::Runtime {
|
||||
message: "Table location is missing from namespace response".to_string(),
|
||||
})?;
|
||||
|
||||
let listing_db = self
|
||||
.create_listing_database(&request.name, &location, response.storage_options)
|
||||
.await?;
|
||||
|
||||
listing_db.open_table(request).await
|
||||
}
|
||||
|
||||
async fn clone_table(&self, _request: CloneTableRequest) -> Result<Arc<dyn BaseTable>> {
|
||||
Err(Error::NotSupported {
|
||||
message: "clone_table is not supported for namespace connections".to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
async fn rename_table(
|
||||
&self,
|
||||
_cur_name: &str,
|
||||
_new_name: &str,
|
||||
_cur_namespace: &[String],
|
||||
_new_namespace: &[String],
|
||||
) -> Result<()> {
|
||||
Err(Error::NotSupported {
|
||||
message: "rename_table is not supported for namespace connections".to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
async fn drop_table(&self, name: &str, namespace: &[String]) -> Result<()> {
|
||||
let mut table_id = namespace.to_vec();
|
||||
table_id.push(name.to_string());
|
||||
|
||||
let drop_request = DropTableRequest { id: Some(table_id) };
|
||||
self.namespace
|
||||
.drop_table(drop_request)
|
||||
.await
|
||||
.map_err(|e| Error::Runtime {
|
||||
message: format!("Failed to drop table: {}", e),
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn drop_all_tables(&self, namespace: &[String]) -> Result<()> {
|
||||
let tables = self
|
||||
.table_names(TableNamesRequest {
|
||||
namespace: namespace.to_vec(),
|
||||
start_after: None,
|
||||
limit: None,
|
||||
})
|
||||
.await?;
|
||||
|
||||
for table in tables {
|
||||
self.drop_table(&table, namespace).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn as_any(&self) -> &dyn std::any::Any {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[cfg(not(windows))] // TODO: support windows for lance-namespace
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::connect_namespace;
|
||||
use crate::query::ExecutableQuery;
|
||||
use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator, StringArray};
|
||||
use arrow_schema::{DataType, Field, Schema};
|
||||
use futures::TryStreamExt;
|
||||
use tempfile::tempdir;
|
||||
|
||||
/// Helper function to create test data
|
||||
fn create_test_data() -> RecordBatchIterator<
|
||||
std::vec::IntoIter<std::result::Result<RecordBatch, arrow_schema::ArrowError>>,
|
||||
> {
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int32, false),
|
||||
Field::new("name", DataType::Utf8, false),
|
||||
]));
|
||||
|
||||
let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]);
|
||||
let name_array = StringArray::from(vec!["Alice", "Bob", "Charlie", "David", "Eve"]);
|
||||
|
||||
let batch = RecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![Arc::new(id_array), Arc::new(name_array)],
|
||||
)
|
||||
.unwrap();
|
||||
RecordBatchIterator::new(vec![std::result::Result::Ok(batch)].into_iter(), schema)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_namespace_connection_simple() {
|
||||
// Test that namespace connections work with simple connect_namespace(impl_type, properties)
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||
|
||||
let mut properties = HashMap::new();
|
||||
properties.insert("root".to_string(), root_path);
|
||||
|
||||
// This should succeed with directory-based namespace
|
||||
let result = connect_namespace("dir", properties).execute().await;
|
||||
|
||||
assert!(result.is_ok());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_namespace_connection_with_storage_options() {
|
||||
// Test namespace connections with storage options
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||
|
||||
let mut properties = HashMap::new();
|
||||
properties.insert("root".to_string(), root_path);
|
||||
|
||||
// This should succeed with directory-based namespace and storage options
|
||||
let result = connect_namespace("dir", properties)
|
||||
.storage_option("timeout", "30s")
|
||||
.execute()
|
||||
.await;
|
||||
|
||||
assert!(result.is_ok());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_namespace_connection_with_all_options() {
|
||||
use crate::embeddings::MemoryRegistry;
|
||||
use std::time::Duration;
|
||||
|
||||
// Test namespace connections with all configuration options
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||
|
||||
let mut properties = HashMap::new();
|
||||
properties.insert("root".to_string(), root_path);
|
||||
|
||||
let embedding_registry = Arc::new(MemoryRegistry::new());
|
||||
let session = Arc::new(lance::session::Session::default());
|
||||
|
||||
// Test with all options set
|
||||
let result = connect_namespace("dir", properties)
|
||||
.storage_option("timeout", "30s")
|
||||
.storage_options([("cache_size", "1gb"), ("region", "us-east-1")])
|
||||
.read_consistency_interval(Duration::from_secs(5))
|
||||
.embedding_registry(embedding_registry.clone())
|
||||
.session(session.clone())
|
||||
.execute()
|
||||
.await;
|
||||
|
||||
assert!(result.is_ok());
|
||||
|
||||
let conn = result.unwrap();
|
||||
|
||||
// Verify embedding registry is set correctly
|
||||
assert!(std::ptr::eq(
|
||||
conn.embedding_registry() as *const _,
|
||||
embedding_registry.as_ref() as *const _
|
||||
));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_namespace_create_table_basic() {
|
||||
// Setup: Create a temporary directory for the namespace
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||
|
||||
// Connect to namespace using DirectoryNamespace
|
||||
let mut properties = HashMap::new();
|
||||
properties.insert("root".to_string(), root_path);
|
||||
|
||||
let conn = connect_namespace("dir", properties)
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to connect to namespace");
|
||||
|
||||
// Test: Create a table
|
||||
let test_data = create_test_data();
|
||||
let table = conn
|
||||
.create_table("test_table", test_data)
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to create table");
|
||||
|
||||
// Verify: Table was created and can be queried
|
||||
let results = table
|
||||
.query()
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to query table")
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.expect("Failed to collect results");
|
||||
|
||||
assert_eq!(results.len(), 1);
|
||||
assert_eq!(results[0].num_rows(), 5);
|
||||
|
||||
// Verify: Table appears in table_names
|
||||
let table_names = conn
|
||||
.table_names()
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to list tables");
|
||||
assert!(table_names.contains(&"test_table".to_string()));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_namespace_describe_table() {
|
||||
// Setup: Create a temporary directory for the namespace
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||
|
||||
// Connect to namespace
|
||||
let mut properties = HashMap::new();
|
||||
properties.insert("root".to_string(), root_path);
|
||||
|
||||
let conn = connect_namespace("dir", properties)
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to connect to namespace");
|
||||
|
||||
// Create a table first
|
||||
let test_data = create_test_data();
|
||||
let _table = conn
|
||||
.create_table("describe_test", test_data)
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to create table");
|
||||
|
||||
// Test: Open the table (which internally uses describe_table)
|
||||
let opened_table = conn
|
||||
.open_table("describe_test")
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to open table");
|
||||
|
||||
// Verify: Can query the opened table
|
||||
let results = opened_table
|
||||
.query()
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to query table")
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.expect("Failed to collect results");
|
||||
|
||||
assert_eq!(results.len(), 1);
|
||||
assert_eq!(results[0].num_rows(), 5);
|
||||
|
||||
// Verify schema matches
|
||||
let schema = opened_table.schema().await.expect("Failed to get schema");
|
||||
assert_eq!(schema.fields.len(), 2);
|
||||
assert_eq!(schema.field(0).name(), "id");
|
||||
assert_eq!(schema.field(1).name(), "name");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_namespace_create_table_overwrite_mode() {
|
||||
// Setup: Create a temporary directory for the namespace
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||
|
||||
let mut properties = HashMap::new();
|
||||
properties.insert("root".to_string(), root_path);
|
||||
|
||||
let conn = connect_namespace("dir", properties)
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to connect to namespace");
|
||||
|
||||
// Create initial table with 5 rows
|
||||
let test_data1 = create_test_data();
|
||||
let _table1 = conn
|
||||
.create_table("overwrite_test", test_data1)
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to create table");
|
||||
|
||||
// Create new data with 3 rows
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int32, false),
|
||||
Field::new("name", DataType::Utf8, false),
|
||||
]));
|
||||
let id_array = Int32Array::from(vec![10, 20, 30]);
|
||||
let name_array = StringArray::from(vec!["New1", "New2", "New3"]);
|
||||
let test_data2 = RecordBatch::try_new(
|
||||
schema.clone(),
|
||||
vec![Arc::new(id_array), Arc::new(name_array)],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// Test: Overwrite the table
|
||||
let table2 = conn
|
||||
.create_table(
|
||||
"overwrite_test",
|
||||
RecordBatchIterator::new(
|
||||
vec![std::result::Result::Ok(test_data2)].into_iter(),
|
||||
schema,
|
||||
),
|
||||
)
|
||||
.mode(CreateTableMode::Overwrite)
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to overwrite table");
|
||||
|
||||
// Verify: Table has new data (3 rows instead of 5)
|
||||
let results = table2
|
||||
.query()
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to query table")
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.expect("Failed to collect results");
|
||||
|
||||
assert_eq!(results.len(), 1);
|
||||
assert_eq!(results[0].num_rows(), 3);
|
||||
|
||||
// Verify the data is actually the new data
|
||||
let id_col = results[0]
|
||||
.column(0)
|
||||
.as_any()
|
||||
.downcast_ref::<Int32Array>()
|
||||
.unwrap();
|
||||
assert_eq!(id_col.value(0), 10);
|
||||
assert_eq!(id_col.value(1), 20);
|
||||
assert_eq!(id_col.value(2), 30);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_namespace_create_table_exist_ok_mode() {
|
||||
// Setup: Create a temporary directory for the namespace
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||
|
||||
let mut properties = HashMap::new();
|
||||
properties.insert("root".to_string(), root_path);
|
||||
|
||||
let conn = connect_namespace("dir", properties)
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to connect to namespace");
|
||||
|
||||
// Create initial table with test data
|
||||
let test_data1 = create_test_data();
|
||||
let _table1 = conn
|
||||
.create_table("exist_ok_test", test_data1)
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to create table");
|
||||
|
||||
// Try to create again with exist_ok mode
|
||||
let test_data2 = create_test_data();
|
||||
let table2 = conn
|
||||
.create_table("exist_ok_test", test_data2)
|
||||
.mode(CreateTableMode::exist_ok(|req| req))
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed with exist_ok mode");
|
||||
|
||||
// Verify: Table still has original data (5 rows)
|
||||
let results = table2
|
||||
.query()
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to query table")
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.expect("Failed to collect results");
|
||||
|
||||
assert_eq!(results.len(), 1);
|
||||
assert_eq!(results[0].num_rows(), 5);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_namespace_create_multiple_tables() {
|
||||
// Setup: Create a temporary directory for the namespace
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||
|
||||
let mut properties = HashMap::new();
|
||||
properties.insert("root".to_string(), root_path);
|
||||
|
||||
let conn = connect_namespace("dir", properties)
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to connect to namespace");
|
||||
|
||||
// Create first table
|
||||
let test_data1 = create_test_data();
|
||||
let _table1 = conn
|
||||
.create_table("table1", test_data1)
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to create first table");
|
||||
|
||||
// Create second table
|
||||
let test_data2 = create_test_data();
|
||||
let _table2 = conn
|
||||
.create_table("table2", test_data2)
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to create second table");
|
||||
|
||||
// Verify: Both tables appear in table list
|
||||
let table_names = conn
|
||||
.table_names()
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to list tables");
|
||||
|
||||
assert!(table_names.contains(&"table1".to_string()));
|
||||
assert!(table_names.contains(&"table2".to_string()));
|
||||
|
||||
// Verify: Can open both tables
|
||||
let opened_table1 = conn
|
||||
.open_table("table1")
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to open table1");
|
||||
|
||||
let opened_table2 = conn
|
||||
.open_table("table2")
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to open table2");
|
||||
|
||||
// Verify both tables work
|
||||
let count1 = opened_table1
|
||||
.count_rows(None)
|
||||
.await
|
||||
.expect("Failed to count rows in table1");
|
||||
assert_eq!(count1, 5);
|
||||
|
||||
let count2 = opened_table2
|
||||
.count_rows(None)
|
||||
.await
|
||||
.expect("Failed to count rows in table2");
|
||||
assert_eq!(count2, 5);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_namespace_table_not_found() {
|
||||
// Setup: Create a temporary directory for the namespace
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||
|
||||
let mut properties = HashMap::new();
|
||||
properties.insert("root".to_string(), root_path);
|
||||
|
||||
let conn = connect_namespace("dir", properties)
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to connect to namespace");
|
||||
|
||||
// Test: Try to open a non-existent table
|
||||
let result = conn.open_table("non_existent_table").execute().await;
|
||||
|
||||
// Verify: Should return an error
|
||||
assert!(result.is_err());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_namespace_drop_table() {
|
||||
// Setup: Create a temporary directory for the namespace
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let root_path = tmp_dir.path().to_str().unwrap().to_string();
|
||||
|
||||
let mut properties = HashMap::new();
|
||||
properties.insert("root".to_string(), root_path);
|
||||
|
||||
let conn = connect_namespace("dir", properties)
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to connect to namespace");
|
||||
|
||||
// Create a table first
|
||||
let test_data = create_test_data();
|
||||
let _table = conn
|
||||
.create_table("drop_test", test_data)
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to create table");
|
||||
|
||||
// Verify table exists
|
||||
let table_names_before = conn
|
||||
.table_names()
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to list tables");
|
||||
assert!(table_names_before.contains(&"drop_test".to_string()));
|
||||
|
||||
// Test: Drop the table
|
||||
conn.drop_table("drop_test", &[])
|
||||
.await
|
||||
.expect("Failed to drop table");
|
||||
|
||||
// Verify: Table no longer exists
|
||||
let table_names_after = conn
|
||||
.table_names()
|
||||
.execute()
|
||||
.await
|
||||
.expect("Failed to list tables");
|
||||
assert!(!table_names_after.contains(&"drop_test".to_string()));
|
||||
|
||||
// Verify: Cannot open dropped table
|
||||
let open_result = conn.open_table("drop_test").execute().await;
|
||||
assert!(open_result.is_err());
|
||||
}
|
||||
}
|
||||
@@ -8,7 +8,7 @@
|
||||
//! values
|
||||
use std::cmp::max;
|
||||
|
||||
use lance::table::format::{Index, Manifest};
|
||||
use lance::table::format::{IndexMetadata, Manifest};
|
||||
|
||||
use crate::DistanceType;
|
||||
|
||||
@@ -19,7 +19,7 @@ pub struct VectorIndex {
|
||||
}
|
||||
|
||||
impl VectorIndex {
|
||||
pub fn new_from_format(manifest: &Manifest, index: &Index) -> Self {
|
||||
pub fn new_from_format(manifest: &Manifest, index: &IndexMetadata) -> Self {
|
||||
let fields = index
|
||||
.fields
|
||||
.iter()
|
||||
|
||||
@@ -206,13 +206,14 @@ pub mod query;
|
||||
pub mod remote;
|
||||
pub mod rerankers;
|
||||
pub mod table;
|
||||
pub mod test_connection;
|
||||
pub mod utils;
|
||||
|
||||
use std::fmt::Display;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
pub use connection::Connection;
|
||||
pub use connection::{ConnectNamespaceBuilder, Connection};
|
||||
pub use error::{Error, Result};
|
||||
use lance_linalg::distance::DistanceType as LanceDistanceType;
|
||||
pub use table::Table;
|
||||
@@ -289,6 +290,8 @@ impl Display for DistanceType {
|
||||
|
||||
/// Connect to a database
|
||||
pub use connection::connect;
|
||||
/// Connect to a namespace-backed database
|
||||
pub use connection::connect_namespace;
|
||||
|
||||
/// Re-export Lance Session and ObjectStoreRegistry for custom session creation
|
||||
pub use lance::session::Session;
|
||||
|
||||
@@ -14,9 +14,9 @@ use serde::Deserialize;
|
||||
use tokio::task::spawn_blocking;
|
||||
|
||||
use crate::database::{
|
||||
CreateNamespaceRequest, CreateTableData, CreateTableMode, CreateTableRequest, Database,
|
||||
DatabaseOptions, DropNamespaceRequest, ListNamespacesRequest, OpenTableRequest,
|
||||
TableNamesRequest,
|
||||
CloneTableRequest, CreateNamespaceRequest, CreateTableData, CreateTableMode,
|
||||
CreateTableRequest, Database, DatabaseOptions, DropNamespaceRequest, ListNamespacesRequest,
|
||||
OpenTableRequest, TableNamesRequest,
|
||||
};
|
||||
use crate::error::Result;
|
||||
use crate::table::BaseTable;
|
||||
@@ -27,6 +27,18 @@ use super::table::RemoteTable;
|
||||
use super::util::{batches_to_ipc_bytes, parse_server_version};
|
||||
use super::ARROW_STREAM_CONTENT_TYPE;
|
||||
|
||||
// Request structure for the remote clone table API
|
||||
#[derive(serde::Serialize)]
|
||||
struct RemoteCloneTableRequest {
|
||||
source_location: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
source_version: Option<u64>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
source_tag: Option<String>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
is_shallow: Option<bool>,
|
||||
}
|
||||
|
||||
// the versions of the server that we support
|
||||
// for any new feature that we need to change the SDK behavior, we should bump the server version,
|
||||
// and add a feature flag as method of `ServerVersion` here.
|
||||
@@ -430,6 +442,51 @@ impl<S: HttpSend> Database for RemoteDatabase<S> {
|
||||
Ok(table)
|
||||
}
|
||||
|
||||
async fn clone_table(&self, request: CloneTableRequest) -> Result<Arc<dyn BaseTable>> {
|
||||
let table_identifier = build_table_identifier(
|
||||
&request.target_table_name,
|
||||
&request.target_namespace,
|
||||
&self.client.id_delimiter,
|
||||
);
|
||||
|
||||
let remote_request = RemoteCloneTableRequest {
|
||||
source_location: request.source_uri,
|
||||
source_version: request.source_version,
|
||||
source_tag: request.source_tag,
|
||||
is_shallow: Some(request.is_shallow),
|
||||
};
|
||||
|
||||
let req = self
|
||||
.client
|
||||
.post(&format!("/v1/table/{}/clone", table_identifier.clone()))
|
||||
.json(&remote_request);
|
||||
|
||||
let (request_id, rsp) = self.client.send(req).await?;
|
||||
|
||||
let status = rsp.status();
|
||||
if status != StatusCode::OK {
|
||||
let body = rsp.text().await.err_to_http(request_id.clone())?;
|
||||
return Err(crate::Error::Http {
|
||||
source: format!("Failed to clone table: {}", body).into(),
|
||||
request_id,
|
||||
status_code: Some(status),
|
||||
});
|
||||
}
|
||||
|
||||
let version = parse_server_version(&request_id, &rsp)?;
|
||||
let cache_key = build_cache_key(&request.target_table_name, &request.target_namespace);
|
||||
let table = Arc::new(RemoteTable::new(
|
||||
self.client.clone(),
|
||||
request.target_table_name.clone(),
|
||||
request.target_namespace.clone(),
|
||||
table_identifier,
|
||||
version,
|
||||
));
|
||||
self.table_cache.insert(cache_key, table.clone()).await;
|
||||
|
||||
Ok(table)
|
||||
}
|
||||
|
||||
async fn open_table(&self, request: OpenTableRequest) -> Result<Arc<dyn BaseTable>> {
|
||||
let identifier =
|
||||
build_table_identifier(&request.name, &request.namespace, &self.client.id_delimiter);
|
||||
@@ -1221,4 +1278,146 @@ mod tests {
|
||||
_ => panic!("Expected Runtime error from header provider"),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_clone_table() {
|
||||
let conn = Connection::new_with_handler(|request| {
|
||||
assert_eq!(request.method(), &reqwest::Method::POST);
|
||||
assert_eq!(request.url().path(), "/v1/table/cloned_table/clone");
|
||||
assert_eq!(
|
||||
request.headers().get("Content-Type").unwrap(),
|
||||
JSON_CONTENT_TYPE
|
||||
);
|
||||
|
||||
let body = request.body().unwrap().as_bytes().unwrap();
|
||||
let body: serde_json::Value = serde_json::from_slice(body).unwrap();
|
||||
assert_eq!(body["source_location"], "s3://bucket/source_table");
|
||||
assert_eq!(body["is_shallow"], true);
|
||||
|
||||
http::Response::builder().status(200).body("").unwrap()
|
||||
});
|
||||
|
||||
let table = conn
|
||||
.clone_table("cloned_table", "s3://bucket/source_table")
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(table.name(), "cloned_table");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_clone_table_with_version() {
|
||||
let conn = Connection::new_with_handler(|request| {
|
||||
assert_eq!(request.method(), &reqwest::Method::POST);
|
||||
assert_eq!(request.url().path(), "/v1/table/cloned_table/clone");
|
||||
|
||||
let body = request.body().unwrap().as_bytes().unwrap();
|
||||
let body: serde_json::Value = serde_json::from_slice(body).unwrap();
|
||||
assert_eq!(body["source_location"], "s3://bucket/source_table");
|
||||
assert_eq!(body["source_version"], 42);
|
||||
assert_eq!(body["is_shallow"], true);
|
||||
|
||||
http::Response::builder().status(200).body("").unwrap()
|
||||
});
|
||||
|
||||
let table = conn
|
||||
.clone_table("cloned_table", "s3://bucket/source_table")
|
||||
.source_version(42)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(table.name(), "cloned_table");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_clone_table_with_tag() {
|
||||
let conn = Connection::new_with_handler(|request| {
|
||||
assert_eq!(request.method(), &reqwest::Method::POST);
|
||||
assert_eq!(request.url().path(), "/v1/table/cloned_table/clone");
|
||||
|
||||
let body = request.body().unwrap().as_bytes().unwrap();
|
||||
let body: serde_json::Value = serde_json::from_slice(body).unwrap();
|
||||
assert_eq!(body["source_location"], "s3://bucket/source_table");
|
||||
assert_eq!(body["source_tag"], "v1.0");
|
||||
assert_eq!(body["is_shallow"], true);
|
||||
|
||||
http::Response::builder().status(200).body("").unwrap()
|
||||
});
|
||||
|
||||
let table = conn
|
||||
.clone_table("cloned_table", "s3://bucket/source_table")
|
||||
.source_tag("v1.0")
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(table.name(), "cloned_table");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_clone_table_deep_clone() {
|
||||
let conn = Connection::new_with_handler(|request| {
|
||||
assert_eq!(request.method(), &reqwest::Method::POST);
|
||||
assert_eq!(request.url().path(), "/v1/table/cloned_table/clone");
|
||||
|
||||
let body = request.body().unwrap().as_bytes().unwrap();
|
||||
let body: serde_json::Value = serde_json::from_slice(body).unwrap();
|
||||
assert_eq!(body["source_location"], "s3://bucket/source_table");
|
||||
assert_eq!(body["is_shallow"], false);
|
||||
|
||||
http::Response::builder().status(200).body("").unwrap()
|
||||
});
|
||||
|
||||
let table = conn
|
||||
.clone_table("cloned_table", "s3://bucket/source_table")
|
||||
.is_shallow(false)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(table.name(), "cloned_table");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_clone_table_with_namespace() {
|
||||
let conn = Connection::new_with_handler(|request| {
|
||||
assert_eq!(request.method(), &reqwest::Method::POST);
|
||||
assert_eq!(request.url().path(), "/v1/table/ns1$ns2$cloned_table/clone");
|
||||
|
||||
let body = request.body().unwrap().as_bytes().unwrap();
|
||||
let body: serde_json::Value = serde_json::from_slice(body).unwrap();
|
||||
assert_eq!(body["source_location"], "s3://bucket/source_table");
|
||||
assert_eq!(body["is_shallow"], true);
|
||||
|
||||
http::Response::builder().status(200).body("").unwrap()
|
||||
});
|
||||
|
||||
let table = conn
|
||||
.clone_table("cloned_table", "s3://bucket/source_table")
|
||||
.target_namespace(vec!["ns1".to_string(), "ns2".to_string()])
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(table.name(), "cloned_table");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_clone_table_error() {
|
||||
let conn = Connection::new_with_handler(|_| {
|
||||
http::Response::builder()
|
||||
.status(500)
|
||||
.body("Internal server error")
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
let result = conn
|
||||
.clone_table("cloned_table", "s3://bucket/source_table")
|
||||
.execute()
|
||||
.await;
|
||||
|
||||
assert!(result.is_err());
|
||||
if let Err(crate::Error::Http { source, .. }) = result {
|
||||
assert!(source.to_string().contains("Failed to clone table"));
|
||||
} else {
|
||||
panic!("Expected HTTP error");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1452,6 +1452,14 @@ struct MergeInsertRequest {
|
||||
when_not_matched_insert_all: bool,
|
||||
when_not_matched_by_source_delete: bool,
|
||||
when_not_matched_by_source_delete_filt: Option<String>,
|
||||
// For backwards compatibility, only serialize use_index when it's false
|
||||
// (the default is true)
|
||||
#[serde(skip_serializing_if = "is_true")]
|
||||
use_index: bool,
|
||||
}
|
||||
|
||||
fn is_true(b: &bool) -> bool {
|
||||
*b
|
||||
}
|
||||
|
||||
impl TryFrom<MergeInsertBuilder> for MergeInsertRequest {
|
||||
@@ -1476,6 +1484,8 @@ impl TryFrom<MergeInsertBuilder> for MergeInsertRequest {
|
||||
when_not_matched_insert_all: value.when_not_matched_insert_all,
|
||||
when_not_matched_by_source_delete: value.when_not_matched_by_source_delete,
|
||||
when_not_matched_by_source_delete_filt: value.when_not_matched_by_source_delete_filt,
|
||||
// Only serialize use_index when it's false for backwards compatibility
|
||||
use_index: value.use_index,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -1942,6 +1952,7 @@ mod tests {
|
||||
assert_eq!(params["when_not_matched_by_source_delete"], "false");
|
||||
assert!(!params.contains_key("when_matched_update_all_filt"));
|
||||
assert!(!params.contains_key("when_not_matched_by_source_delete_filt"));
|
||||
assert!(!params.contains_key("use_index"));
|
||||
|
||||
if old_server {
|
||||
http::Response::builder().status(200).body("{}").unwrap()
|
||||
|
||||
@@ -1976,6 +1976,8 @@ impl NativeTable {
|
||||
/// Delete keys from the config
|
||||
pub async fn delete_config_keys(&self, delete_keys: &[&str]) -> Result<()> {
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
// TODO: update this when we implement metadata APIs
|
||||
#[allow(deprecated)]
|
||||
dataset.delete_config_keys(delete_keys).await?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -1986,6 +1988,8 @@ impl NativeTable {
|
||||
upsert_values: impl IntoIterator<Item = (String, String)>,
|
||||
) -> Result<()> {
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
// TODO: update this when we implement metadata APIs
|
||||
#[allow(deprecated)]
|
||||
dataset.replace_schema_metadata(upsert_values).await?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -2395,6 +2399,7 @@ impl BaseTable for NativeTable {
|
||||
} else {
|
||||
builder.when_not_matched_by_source(WhenNotMatchedBySource::Keep);
|
||||
}
|
||||
builder.use_index(params.use_index);
|
||||
|
||||
let future = if let Some(timeout) = params.timeout {
|
||||
// The default retry timeout is 30s, so we pass the full timeout down
|
||||
@@ -2755,6 +2760,7 @@ mod tests {
|
||||
RecordBatchReader, StringArray, TimestampMillisecondArray, TimestampNanosecondArray,
|
||||
UInt32Array,
|
||||
};
|
||||
use arrow_array::{BinaryArray, LargeBinaryArray};
|
||||
use arrow_data::ArrayDataBuilder;
|
||||
use arrow_schema::{DataType, Field, Schema, TimeUnit};
|
||||
use futures::TryStreamExt;
|
||||
@@ -2902,6 +2908,38 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_merge_insert_use_index() {
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
let uri = tmp_dir.path().to_str().unwrap();
|
||||
let conn = connect(uri).execute().await.unwrap();
|
||||
|
||||
// Create a dataset with i=0..10
|
||||
let batches = merge_insert_test_batches(0, 0);
|
||||
let table = conn
|
||||
.create_table("my_table", batches)
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(table.count_rows(None).await.unwrap(), 10);
|
||||
|
||||
// Test use_index=true (default behavior)
|
||||
let new_batches = Box::new(merge_insert_test_batches(5, 1));
|
||||
let mut merge_insert_builder = table.merge_insert(&["i"]);
|
||||
merge_insert_builder.when_not_matched_insert_all();
|
||||
merge_insert_builder.use_index(true);
|
||||
merge_insert_builder.execute(new_batches).await.unwrap();
|
||||
assert_eq!(table.count_rows(None).await.unwrap(), 15);
|
||||
|
||||
// Test use_index=false (force table scan)
|
||||
let new_batches = Box::new(merge_insert_test_batches(15, 2));
|
||||
let mut merge_insert_builder = table.merge_insert(&["i"]);
|
||||
merge_insert_builder.when_not_matched_insert_all();
|
||||
merge_insert_builder.use_index(false);
|
||||
merge_insert_builder.execute(new_batches).await.unwrap();
|
||||
assert_eq!(table.count_rows(None).await.unwrap(), 25);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_add_overwrite() {
|
||||
let tmp_dir = tempdir().unwrap();
|
||||
@@ -3688,6 +3726,10 @@ mod tests {
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Int32, false),
|
||||
Field::new("category", DataType::Utf8, true),
|
||||
Field::new("large_category", DataType::LargeUtf8, true),
|
||||
Field::new("is_active", DataType::Boolean, true),
|
||||
Field::new("data", DataType::Binary, true),
|
||||
Field::new("large_data", DataType::LargeBinary, true),
|
||||
]));
|
||||
|
||||
let batch = RecordBatch::try_new(
|
||||
@@ -3697,6 +3739,16 @@ mod tests {
|
||||
Arc::new(StringArray::from_iter_values(
|
||||
(0..100).map(|i| format!("category_{}", i % 5)),
|
||||
)),
|
||||
Arc::new(LargeStringArray::from_iter_values(
|
||||
(0..100).map(|i| format!("large_category_{}", i % 5)),
|
||||
)),
|
||||
Arc::new(BooleanArray::from_iter((0..100).map(|i| Some(i % 2 == 0)))),
|
||||
Arc::new(BinaryArray::from_iter_values(
|
||||
(0_u32..100).map(|i| i.to_le_bytes()),
|
||||
)),
|
||||
Arc::new(LargeBinaryArray::from_iter_values(
|
||||
(0_u32..100).map(|i| i.to_le_bytes()),
|
||||
)),
|
||||
],
|
||||
)
|
||||
.unwrap();
|
||||
@@ -3717,12 +3769,58 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Create bitmap index on the "is_active" column
|
||||
table
|
||||
.create_index(&["is_active"], Index::Bitmap(Default::default()))
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Create bitmap index on the "data" column
|
||||
table
|
||||
.create_index(&["data"], Index::Bitmap(Default::default()))
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Create bitmap index on the "large_data" column
|
||||
table
|
||||
.create_index(&["large_data"], Index::Bitmap(Default::default()))
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Create bitmap index on the "large_category" column
|
||||
table
|
||||
.create_index(&["large_category"], Index::Bitmap(Default::default()))
|
||||
.execute()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Verify the index was created
|
||||
let index_configs = table.list_indices().await.unwrap();
|
||||
assert_eq!(index_configs.len(), 1);
|
||||
let index = index_configs.into_iter().next().unwrap();
|
||||
assert_eq!(index_configs.len(), 5);
|
||||
|
||||
let mut configs_iter = index_configs.into_iter();
|
||||
let index = configs_iter.next().unwrap();
|
||||
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
|
||||
assert_eq!(index.columns, vec!["category".to_string()]);
|
||||
|
||||
let index = configs_iter.next().unwrap();
|
||||
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
|
||||
assert_eq!(index.columns, vec!["is_active".to_string()]);
|
||||
|
||||
let index = configs_iter.next().unwrap();
|
||||
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
|
||||
assert_eq!(index.columns, vec!["data".to_string()]);
|
||||
|
||||
let index = configs_iter.next().unwrap();
|
||||
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
|
||||
assert_eq!(index.columns, vec!["large_data".to_string()]);
|
||||
|
||||
let index = configs_iter.next().unwrap();
|
||||
assert_eq!(index.index_type, crate::index::IndexType::Bitmap);
|
||||
assert_eq!(index.columns, vec!["large_category".to_string()]);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -22,6 +22,7 @@ pub struct MergeInsertBuilder {
|
||||
pub(crate) when_not_matched_by_source_delete: bool,
|
||||
pub(crate) when_not_matched_by_source_delete_filt: Option<String>,
|
||||
pub(crate) timeout: Option<Duration>,
|
||||
pub(crate) use_index: bool,
|
||||
}
|
||||
|
||||
impl MergeInsertBuilder {
|
||||
@@ -35,6 +36,7 @@ impl MergeInsertBuilder {
|
||||
when_not_matched_by_source_delete: false,
|
||||
when_not_matched_by_source_delete_filt: None,
|
||||
timeout: None,
|
||||
use_index: true,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -101,6 +103,19 @@ impl MergeInsertBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
/// Controls whether to use indexes for the merge operation.
|
||||
///
|
||||
/// When set to `true` (the default), the operation will use an index if available
|
||||
/// on the join key for improved performance. When set to `false`, it forces a full
|
||||
/// table scan even if an index exists. This can be useful for benchmarking or when
|
||||
/// the query optimizer chooses a suboptimal path.
|
||||
///
|
||||
/// If not set, defaults to `true` (use index if available).
|
||||
pub fn use_index(&mut self, use_index: bool) -> &mut Self {
|
||||
self.use_index = use_index;
|
||||
self
|
||||
}
|
||||
|
||||
/// Executes the merge insert operation
|
||||
///
|
||||
/// Returns version and statistics about the merge operation including the number of rows
|
||||
|
||||
126
rust/lancedb/src/test_connection.rs
Normal file
126
rust/lancedb/src/test_connection.rs
Normal file
@@ -0,0 +1,126 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The LanceDB Authors
|
||||
|
||||
//! Functions for testing connections.
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod test_utils {
|
||||
use regex::Regex;
|
||||
use std::env;
|
||||
use std::io::{BufRead, BufReader};
|
||||
use std::process::{Child, ChildStdout, Command, Stdio};
|
||||
|
||||
use crate::{connect, Connection};
|
||||
use anyhow::{bail, Result};
|
||||
use tempfile::{tempdir, TempDir};
|
||||
|
||||
pub struct TestConnection {
|
||||
pub uri: String,
|
||||
pub connection: Connection,
|
||||
_temp_dir: Option<TempDir>,
|
||||
_process: Option<TestProcess>,
|
||||
}
|
||||
|
||||
struct TestProcess {
|
||||
child: Child,
|
||||
}
|
||||
|
||||
impl Drop for TestProcess {
|
||||
#[allow(unused_must_use)]
|
||||
fn drop(&mut self) {
|
||||
self.child.kill();
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn new_test_connection() -> Result<TestConnection> {
|
||||
match env::var("CREATE_LANCEDB_TEST_CONNECTION_SCRIPT") {
|
||||
Ok(script_path) => new_remote_connection(&script_path).await,
|
||||
Err(_e) => new_local_connection().await,
|
||||
}
|
||||
}
|
||||
|
||||
async fn new_remote_connection(script_path: &str) -> Result<TestConnection> {
|
||||
let temp_dir = tempdir()?;
|
||||
let data_path = temp_dir.path().to_str().unwrap().to_string();
|
||||
let child_result = Command::new(script_path)
|
||||
.stdin(Stdio::null())
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.arg(data_path.clone())
|
||||
.spawn();
|
||||
if child_result.is_err() {
|
||||
bail!(format!(
|
||||
"Unable to run {}: {:?}",
|
||||
script_path,
|
||||
child_result.err()
|
||||
));
|
||||
}
|
||||
let mut process = TestProcess {
|
||||
child: child_result.unwrap(),
|
||||
};
|
||||
let stdout = BufReader::new(process.child.stdout.take().unwrap());
|
||||
let port = read_process_port(stdout)?;
|
||||
let uri = "db://test";
|
||||
let host_override = format!("http://localhost:{}", port);
|
||||
let connection = create_new_connection(uri, &host_override).await?;
|
||||
Ok(TestConnection {
|
||||
uri: uri.to_string(),
|
||||
connection,
|
||||
_temp_dir: Some(temp_dir),
|
||||
_process: Some(process),
|
||||
})
|
||||
}
|
||||
|
||||
fn read_process_port(mut stdout: BufReader<ChildStdout>) -> Result<String> {
|
||||
let mut line = String::new();
|
||||
let re = Regex::new(r"Query node now listening on 0.0.0.0:(.*)").unwrap();
|
||||
loop {
|
||||
let result = stdout.read_line(&mut line);
|
||||
if let Err(err) = result {
|
||||
bail!(format!(
|
||||
"read_process_port: error while reading from process output: {}",
|
||||
err
|
||||
));
|
||||
} else if result.unwrap() == 0 {
|
||||
bail!("read_process_port: hit EOF before reading port from process output.");
|
||||
}
|
||||
if re.is_match(&line) {
|
||||
let caps = re.captures(&line).unwrap();
|
||||
return Ok(caps[1].to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "remote")]
|
||||
async fn create_new_connection(
|
||||
uri: &str,
|
||||
host_override: &str,
|
||||
) -> crate::error::Result<Connection> {
|
||||
connect(uri)
|
||||
.region("us-east-1")
|
||||
.api_key("sk_localtest")
|
||||
.host_override(host_override)
|
||||
.execute()
|
||||
.await
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "remote"))]
|
||||
async fn create_new_connection(
|
||||
_uri: &str,
|
||||
_host_override: &str,
|
||||
) -> crate::error::Result<Connection> {
|
||||
panic!("remote feature not supported");
|
||||
}
|
||||
|
||||
async fn new_local_connection() -> Result<TestConnection> {
|
||||
let temp_dir = tempdir()?;
|
||||
let uri = temp_dir.path().to_str().unwrap();
|
||||
let connection = connect(uri).execute().await?;
|
||||
Ok(TestConnection {
|
||||
uri: uri.to_string(),
|
||||
connection,
|
||||
_temp_dir: Some(temp_dir),
|
||||
_process: None,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -195,7 +195,15 @@ pub fn supported_btree_data_type(dtype: &DataType) -> bool {
|
||||
}
|
||||
|
||||
pub fn supported_bitmap_data_type(dtype: &DataType) -> bool {
|
||||
dtype.is_integer() || matches!(dtype, DataType::Utf8)
|
||||
dtype.is_integer()
|
||||
|| matches!(
|
||||
dtype,
|
||||
DataType::Utf8
|
||||
| DataType::LargeUtf8
|
||||
| DataType::Binary
|
||||
| DataType::LargeBinary
|
||||
| DataType::Boolean
|
||||
)
|
||||
}
|
||||
|
||||
pub fn supported_label_list_data_type(dtype: &DataType) -> bool {
|
||||
|
||||
Reference in New Issue
Block a user