Merge branch 'main' into inst-mani-fast-path

This commit is contained in:
Ruihang Xia
2026-05-13 16:59:28 +08:00
841 changed files with 80835 additions and 26044 deletions

View File

@@ -37,17 +37,14 @@ inputs:
description: Whether to push the latest tag of the image
required: false
default: 'true'
aws-cn-s3-bucket:
description: S3 bucket to store released artifacts in CN region
proxy-url:
description: The url of the S3 proxy server
required: true
aws-cn-access-key-id:
description: AWS access key id in CN region
proxy-username:
description: The username of the S3 proxy
required: true
aws-cn-secret-access-key:
description: AWS secret access key in CN region
required: true
aws-cn-region:
description: AWS region in CN
proxy-password:
description: The password of the S3 proxy
required: true
upload-to-s3:
description: Upload to S3
@@ -77,21 +74,13 @@ runs:
with:
path: ${{ inputs.artifacts-dir }}
- name: Install s5cmd
shell: bash
run: |
wget https://github.com/peak/s5cmd/releases/download/v2.3.0/s5cmd_2.3.0_Linux-64bit.tar.gz
tar -xzf s5cmd_2.3.0_Linux-64bit.tar.gz
sudo mv s5cmd /usr/local/bin/
sudo chmod +x /usr/local/bin/s5cmd
- name: Release artifacts to cn region
uses: nick-invision/retry@v2
if: ${{ inputs.upload-to-s3 == 'true' }}
env:
AWS_ACCESS_KEY_ID: ${{ inputs.aws-cn-access-key-id }}
AWS_SECRET_ACCESS_KEY: ${{ inputs.aws-cn-secret-access-key }}
AWS_REGION: ${{ inputs.aws-cn-region }}
PROXY_URL: ${{ inputs.proxy-url }}
PROXY_USERNAME: ${{ inputs.proxy-username }}
PROXY_PASSWORD: ${{ inputs.proxy-password }}
UPDATE_VERSION_INFO: ${{ inputs.update-version-info }}
with:
max_attempts: ${{ inputs.upload-max-retry-times }}
@@ -99,8 +88,7 @@ runs:
command: |
./.github/scripts/upload-artifacts-to-s3.sh \
${{ inputs.artifacts-dir }} \
${{ inputs.version }} \
${{ inputs.aws-cn-s3-bucket }}
${{ inputs.version }}
- name: Push greptimedb image from Dockerhub to ACR
shell: bash

View File

@@ -1,15 +1,16 @@
name: Setup Kind
description: Deploy Kind
name: Setup Chaos Mesh
description: Install and wait for Chaos Mesh
runs:
using: composite
steps:
- uses: actions/checkout@v4
- name: Create kind cluster
- name: Install Chaos Mesh
shell: bash
run: |
helm repo add chaos-mesh https://charts.chaos-mesh.org
helm repo update chaos-mesh
kubectl create ns chaos-mesh
helm install chaos-mesh chaos-mesh/chaos-mesh -n=chaos-mesh --version 2.6.3
helm install chaos-mesh chaos-mesh/chaos-mesh -n=chaos-mesh --set chaosDaemon.runtime=containerd --set chaosDaemon.socketPath=/run/containerd/containerd.sock --version 2.8.0
- name: Print Chaos-mesh
if: always()
shell: bash

View File

@@ -1,3 +1,8 @@
logging:
level: "info"
format: "json"
filters:
- mito2::sst::file=debug
meta:
configData: |-
[runtime]

View File

@@ -1,3 +1,2 @@
native-tls
openssl
aws-lc-sys

View File

@@ -30,13 +30,72 @@ CLEAN_LATEST=$(echo "$LATEST_VERSION" | sed 's/^v//' | sed 's/-nightly-.*//')
echo "Current version: $CLEAN_CURRENT"
echo "Latest release version: $CLEAN_LATEST"
# Use sort -V to compare versions
HIGHER_VERSION=$(printf "%s\n%s" "$CLEAN_CURRENT" "$CLEAN_LATEST" | sort -V | tail -n1)
# Function to extract base version (without pre-release suffix)
get_base_version() {
    # Prints $1 with everything from the first "-alpha", "-beta", "-rc" or
    # "-pre" marker onwards removed. A version string that carries no such
    # marker is printed unchanged.
    local version=$1
    local strip_suffix='s/-(alpha|beta|rc|pre).*//'
    echo "$version" | sed -E "$strip_suffix"
}
if [ "$HIGHER_VERSION" = "$CLEAN_CURRENT" ]; then
# Function to check if a version is pre-release
is_prerelease() {
    # Succeeds (exit status 0) when $1 contains a "-alpha", "-beta", "-rc" or
    # "-pre" marker anywhere in the string; fails otherwise. Prints nothing.
    local version=$1
    # Keeping the pattern in a variable makes bash treat it as a regex, exactly
    # like the unquoted literal form.
    local prerelease_marker='-(alpha|beta|rc|pre)'
    [[ "$version" =~ $prerelease_marker ]]
}
# Compare versions properly considering pre-release
compare_versions() {
    # Compares two version strings, ranking a pre-release below the final
    # release that shares its base version.
    #
    # Arguments:
    #   $1 - current version
    #   $2 - latest released version
    # Output (stdout), exactly one of:
    #   current_older          - $1 ranks below $2
    #   current_newer          - $1 ranks above $2
    #   current_newer_or_equal - same base and same release kind, and $1 sorts
    #                            at or above $2 under `sort -V`
    #
    # Relies on the sibling helpers get_base_version and is_prerelease.
    local current=$1
    local latest=$2

    # Declared separately from the assignments so `local` does not mask the
    # exit status of the command substitutions, and so no helper variable
    # leaks into the caller's scope.
    local current_base latest_base higher_base higher_version

    current_base=$(get_base_version "$current")
    latest_base=$(get_base_version "$latest")

    if [ "$current_base" != "$latest_base" ]; then
        # Different base versions: the higher base wins outright.
        higher_base=$(printf "%s\n%s" "$current_base" "$latest_base" | sort -V | tail -n1)
        if [ "$higher_base" = "$latest_base" ]; then
            echo "current_older"
            return
        elif [ "$higher_base" = "$current_base" ]; then
            echo "current_newer"
            return
        fi
    else
        # Same base version: a final release outranks any pre-release of it.
        if is_prerelease "$current" && ! is_prerelease "$latest"; then
            echo "current_older"
            return
        fi
        if ! is_prerelease "$current" && is_prerelease "$latest"; then
            echo "current_newer"
            return
        fi
    fi

    # Both are the same kind (both final or both pre-release): let `sort -V`
    # order the full strings, e.g. rc.2 below rc.10.
    higher_version=$(printf "%s\n%s" "$current" "$latest" | sort -V | tail -n1)
    if [ "$higher_version" = "$current" ]; then
        echo "current_newer_or_equal"
    else
        echo "current_older"
    fi
}
RESULT=$(compare_versions "$CLEAN_CURRENT" "$CLEAN_LATEST")
if [ "$RESULT" = "current_newer" ] || [ "$RESULT" = "current_newer_or_equal" ]; then
echo "Current version ($CLEAN_CURRENT) is NEWER than or EQUAL to latest ($CLEAN_LATEST)"
echo "is-current-version-latest=true" >> $GITHUB_OUTPUT
if [ -n "$GITHUB_OUTPUT" ]; then
echo "is-current-version-latest=true" >> $GITHUB_OUTPUT
fi
else
echo "Current version ($CLEAN_CURRENT) is OLDER than latest ($CLEAN_LATEST)"
echo "is-current-version-latest=false" >> $GITHUB_OUTPUT
if [ -n "$GITHUB_OUTPUT" ]; then
echo "is-current-version-latest=false" >> $GITHUB_OUTPUT
fi
fi

View File

@@ -30,8 +30,11 @@ update_dev_builder_version() {
--body "This PR updates the dev-builder image tag" \
--base main \
--head $BRANCH_NAME \
--reviewer zyy17 \
--reviewer daviderli614
--reviewer sunng87 \
--reviewer daviderli614 \
--reviewer killme2008 \
--reviewer evenyag \
--reviewer fengjiachun
}
update_dev_builder_version

View File

@@ -5,16 +5,15 @@ set -o pipefail
ARTIFACTS_DIR=$1
VERSION=$2
AWS_S3_BUCKET=$3
RELEASE_DIRS="releases/greptimedb"
GREPTIMEDB_REPO="GreptimeTeam/greptimedb"
# Check if necessary variables are set.
function check_vars() {
for var in AWS_S3_BUCKET VERSION ARTIFACTS_DIR; do
for var in VERSION ARTIFACTS_DIR; do
if [ -z "${!var}" ]; then
echo "$var is not set or empty."
echo "Usage: $0 <artifacts-dir> <version> <aws-s3-bucket>"
echo "Usage: $0 <artifacts-dir> <version>"
exit 1
fi
done
@@ -33,8 +32,18 @@ function upload_artifacts() {
# ├── greptime-darwin-amd64-v0.2.0.sha256sum
# └── greptime-darwin-amd64-v0.2.0.tar.gz
find "$ARTIFACTS_DIR" -type f \( -name "*.tar.gz" -o -name "*.sha256sum" \) | while IFS= read -r file; do
s5cmd cp \
"$file" "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/$VERSION/$(basename "$file")"
filename=$(basename "$file")
TARGET_URL="$PROXY_URL/$RELEASE_DIRS/$VERSION"
curl -X PUT \
-u "$PROXY_USERNAME:$PROXY_PASSWORD" \
-F "file=@$file" \
--max-time 3600 \
--connect-timeout 20 \
--retry 5 \
--retry-delay 10 \
--retry-max-time 3000 \
"$TARGET_URL"
done
}
@@ -45,16 +54,34 @@ function update_version_info() {
if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
echo "Updating latest-version.txt"
echo "$VERSION" > latest-version.txt
s5cmd cp \
latest-version.txt "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/latest-version.txt"
TARGET_URL="$PROXY_URL/$RELEASE_DIRS"
curl -X PUT \
-u "$PROXY_USERNAME:$PROXY_PASSWORD" \
-F "file=@latest-version.txt" \
--max-time 3600 \
--connect-timeout 20 \
--retry 5 \
--retry-delay 10 \
--retry-max-time 3000 \
"$TARGET_URL"
fi
# If it's the nightly release, update latest-nightly-version.txt.
if [[ "$VERSION" == *"nightly"* ]]; then
echo "Updating latest-nightly-version.txt"
echo "$VERSION" > latest-nightly-version.txt
s5cmd cp \
latest-nightly-version.txt "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/latest-nightly-version.txt"
TARGET_URL="$PROXY_URL/$RELEASE_DIRS"
curl -X PUT \
-u "$PROXY_USERNAME:$PROXY_PASSWORD" \
-F "file=@latest-nightly-version.txt" \
--max-time 3600 \
--connect-timeout 20 \
--retry 5 \
--retry-delay 10 \
--retry-max-time 3000 \
"$TARGET_URL"
fi
fi
}
@@ -93,10 +120,10 @@ function main() {
}
# Usage example:
# AWS_ACCESS_KEY_ID=<your_access_key_id> \
# AWS_SECRET_ACCESS_KEY=<your_secret_access_key> \
# AWS_DEFAULT_REGION=<your_region> \
# PROXY_URL=<proxy_url> \
# PROXY_USERNAME=<proxy_username> \
# PROXY_PASSWORD=<proxy_password> \
# UPDATE_VERSION_INFO=true \
# DOWNLOAD_ARTIFACTS_FROM_GITHUB=false \
# ./upload-artifacts-to-s3.sh <artifacts-dir> <version> <aws-s3-bucket>
# ./upload-artifacts-to-s3.sh <artifacts-dir> <version>
main

View File

@@ -0,0 +1,29 @@
name: Bump helm charts version
on:
workflow_dispatch:
inputs:
version:
description: The version to bump (e.g. v1.0.0)
required: true
type: string
jobs:
bump-helm-charts-version:
name: Bump helm charts version
runs-on: ubuntu-latest
permissions:
contents: write
pull-requests: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Bump helm charts version
env:
GITHUB_TOKEN: ${{ secrets.HELM_CHARTS_REPO_TOKEN }}
VERSION: ${{ inputs.version }}
run: |
./.github/scripts/update-helm-charts-version.sh

View File

@@ -0,0 +1,29 @@
name: Bump homebrew greptime version
on:
workflow_dispatch:
inputs:
version:
description: The version to bump (e.g. v1.0.0)
required: true
type: string
jobs:
bump-homebrew-greptime-version:
name: Bump homebrew greptime version
runs-on: ubuntu-latest
permissions:
contents: write
pull-requests: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Bump homebrew greptime version
env:
GITHUB_TOKEN: ${{ secrets.HOMEBREW_GREPTIME_REPO_TOKEN }}
VERSION: ${{ inputs.version }}
run: |
./.github/scripts/update-homebrew-greptme-version.sh

View File

@@ -285,10 +285,9 @@ jobs:
dst-image-registry: ${{ vars.ACR_IMAGE_REGISTRY }}
dst-image-namespace: ${{ vars.IMAGE_NAMESPACE }}
version: ${{ needs.allocate-runners.outputs.version }}
aws-cn-s3-bucket: ${{ vars.AWS_RELEASE_BUCKET }}
aws-cn-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }}
aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }}
proxy-url: ${{ secrets.PROXY_URL }}
proxy-username: ${{ secrets.PROXY_USERNAME }}
proxy-password: ${{ secrets.PROXY_PASSWORD }}
upload-to-s3: ${{ inputs.upload_artifacts_to_s3 }}
dev-mode: true # Only build the standard images(exclude centos images).
push-latest-tag: false # Don't push the latest tag to registry.

View File

@@ -319,7 +319,13 @@ jobs:
include:
- target: "fuzz_repartition_table"
mode:
name: "Local WAL Repartition GC"
name: "Local WAL mito table repartition"
minio: true
kafka: false
values: "with-minio-repartition-gc.yaml"
- target: "fuzz_repartition_metric_table"
mode:
name: "Local WAL metric table repartition"
minio: true
kafka: false
values: "with-minio-repartition-gc.yaml"
@@ -455,6 +461,14 @@ jobs:
path: /tmp/fuzz-monitor-dumps
if-no-files-found: warn
retention-days: 3
- name: Upload CSV dumps
if: failure()
uses: actions/upload-artifact@v4
with:
name: fuzz-tests-csv-dumps-${{ matrix.mode.name }}-${{ matrix.target }}
path: /tmp/greptime-fuzz-dumps
if-no-files-found: warn
retention-days: 3
- name: Delete cluster
if: success()
shell: bash
@@ -492,6 +506,12 @@ jobs:
minio: true
kafka: false
values: "with-minio.yaml"
- target: "fuzz_repartition_table_chaos"
mode:
name: "Local WAL repartition chaos"
minio: true
kafka: false
values: "with-minio-repartition-gc.yaml"
steps:
- name: Remove unused software
run: |

View File

@@ -236,10 +236,9 @@ jobs:
dst-image-registry: ${{ vars.ACR_IMAGE_REGISTRY }}
dst-image-namespace: ${{ vars.IMAGE_NAMESPACE }}
version: ${{ needs.allocate-runners.outputs.version }}
aws-cn-s3-bucket: ${{ vars.AWS_RELEASE_BUCKET }}
aws-cn-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }}
aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }}
proxy-url: ${{ secrets.PROXY_URL }}
proxy-username: ${{ secrets.PROXY_USERNAME }}
proxy-password: ${{ secrets.PROXY_PASSWORD }}
upload-to-s3: false
dev-mode: false
update-version-info: false # Don't update version info in S3.

View File

@@ -358,10 +358,9 @@ jobs:
dst-image-registry: ${{ vars.ACR_IMAGE_REGISTRY }}
dst-image-namespace: ${{ vars.IMAGE_NAMESPACE }}
version: ${{ needs.allocate-runners.outputs.version }}
aws-cn-s3-bucket: ${{ vars.AWS_RELEASE_BUCKET }}
aws-cn-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }}
aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }}
proxy-url: ${{ secrets.PROXY_URL }}
proxy-username: ${{ secrets.PROXY_USERNAME }}
proxy-password: ${{ secrets.PROXY_PASSWORD }}
dev-mode: false
upload-to-s3: true
update-version-info: true

View File

@@ -127,7 +127,7 @@ jobs:
run: |
./bins/greptime standalone start \
--http-addr 0.0.0.0:${{ inputs.http-port }} \
--rpc-addr 0.0.0.0:4001 \
--grpc-bind-addr 0.0.0.0:4001 \
--mysql-addr 0.0.0.0:${{ inputs.mysql-port }} \
--postgres-addr 0.0.0.0:${{ inputs.postgres-port }} \
--user-provider=static_user_provider:cmd:${{ inputs.username }}=${{ inputs.password }} > /tmp/greptimedb.log 2>&1 &

12
.gitignore vendored
View File

@@ -65,8 +65,14 @@ greptimedb_data
# github
!/.github
# Claude code
# AI related
CLAUDE.md
# AGENTS.md
AGENTS.md
.codex
.gemini
.opencode
.worktrees/
# local design docs
docs/specs/
.vs/

1290
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -75,7 +75,7 @@ members = [
resolver = "2"
[workspace.package]
version = "1.0.0-rc.2"
version = "1.0.0"
edition = "2024"
license = "Apache-2.0"
@@ -110,6 +110,7 @@ arrow-schema = { version = "57.3", features = ["serde"] }
async-stream = "0.3"
async-trait = "0.1"
# Remember to update axum-extra, axum-macros when updating axum
arrow_object_store = { package = "object_store", version = "0.13.2" }
axum = "0.8"
axum-extra = "0.10"
axum-macros = "0.5"
@@ -131,6 +132,7 @@ datafusion = "=52.1"
datafusion-common = "=52.1"
datafusion-datasource = "=52.1"
datafusion-expr = "=52.1"
datafusion-expr-common = "=52.1"
datafusion-functions = "=52.1"
datafusion-functions-aggregate-common = "=52.1"
datafusion-functions-window-common = "=52.1"
@@ -141,6 +143,7 @@ datafusion-physical-expr = "=52.1"
datafusion-physical-plan = "=52.1"
datafusion-sql = "=52.1"
datafusion-substrait = "=52.1"
datafusion_object_store = { package = "object_store", version = "0.12.5" }
deadpool = "0.12"
deadpool-postgres = "0.14"
derive_builder = "0.20"
@@ -151,16 +154,18 @@ etcd-client = { version = "0.17", features = [
"tls",
"tls-roots",
] }
fs2 = "0.4"
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "092ba1d01e2da676dca66cca7eebb55009da8ef8" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "0de5437582920c8b30d6c34212f161db71d95c50" }
hex = "0.4"
http = "1"
humantime = "2.1"
humantime-serde = "1.1"
hyper = "1.1"
hyper-util = "0.1"
icu_properties = "2.0.1"
itertools = "0.14"
jsonb = { version = "0.4.4", default-features = false }
lazy_static = "1.4"
@@ -173,7 +178,7 @@ nalgebra = "0.33"
nix = { version = "0.30.1", default-features = false, features = ["event", "fs", "process"] }
notify = "8.0"
num_cpus = "1.16"
object_store_opendal = "0.54"
object_store_opendal = "0.56"
once_cell = "1.18"
opentelemetry-proto = { version = "0.31", features = [
"gen-tonic",
@@ -200,14 +205,17 @@ rand = "0.9"
ratelimit = "0.10"
regex = "1.12"
regex-automata = "0.4"
reqwest = { version = "0.12", default-features = false, features = [
reqwest = { version = "0.13", default-features = false, features = [
"form",
"json",
"rustls-tls-native-roots",
"query",
"rustls",
"stream",
"multipart",
] }
url = "2.3"
# Branch: feat/request-timeout
hostname = "0.4.0"
rskafka = { git = "https://github.com/GreptimeTeam/rskafka.git", rev = "f5688f83e7da591cda3f2674c2408b4c0ed4ed50", features = [
"transport-tls",
] }
@@ -215,8 +223,6 @@ rstest = "0.25"
rstest_reuse = "0.7"
rust_decimal = "1.33"
rustc-hash = "2.0"
# It is worth noting that we should try to avoid using aws-lc-rs until it can be compiled on various platforms.
hostname = "0.4.0"
rustls = { version = "0.23.25", default-features = false }
sea-query = "0.32"
serde = { version = "1.0", features = ["derive"] }
@@ -231,7 +237,8 @@ sqlx = { version = "0.8", default-features = false, features = [
"any",
"macros",
"json",
"runtime-tokio-rustls",
"runtime-tokio",
"tls-rustls-aws-lc-rs",
"rust_decimal",
] }
strum = { version = "0.27", features = ["derive"] }
@@ -243,7 +250,7 @@ tokio-rustls = { version = "0.26.2", default-features = false }
tokio-stream = "0.1"
tokio-util = { version = "0.7", features = ["io-util", "compat"] }
toml = "0.8.8"
tonic = { version = "0.14", features = ["tls-ring", "gzip", "zstd"] }
tonic = { version = "0.14", features = ["tls-aws-lc", "gzip", "zstd"] }
tower = "0.5"
tower-http = "0.6"
tracing = "0.1"
@@ -334,6 +341,7 @@ rev = "5618e779cf2bb4755b499c630fba4c35e91898cb"
datafusion = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "02b82535e0160c4545667f36a03e1ff9d1d2e51f" }
datafusion-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "02b82535e0160c4545667f36a03e1ff9d1d2e51f" }
datafusion-expr = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "02b82535e0160c4545667f36a03e1ff9d1d2e51f" }
datafusion-expr-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "02b82535e0160c4545667f36a03e1ff9d1d2e51f" }
datafusion-functions = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "02b82535e0160c4545667f36a03e1ff9d1d2e51f" }
datafusion-functions-aggregate-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "02b82535e0160c4545667f36a03e1ff9d1d2e51f" }
datafusion-functions-window-common = { git = "https://github.com/GreptimeTeam/datafusion.git", rev = "02b82535e0160c4545667f36a03e1ff9d1d2e51f" }

View File

@@ -8,7 +8,7 @@ CARGO_BUILD_OPTS := --locked
IMAGE_REGISTRY ?= docker.io
IMAGE_NAMESPACE ?= greptime
IMAGE_TAG ?= latest
DEV_BUILDER_IMAGE_TAG ?= 2025-10-01-8fe17d43-20251011080129
DEV_BUILDER_IMAGE_TAG ?= 2026-03-21-9c9d9e9e-20260331090344
BUILDX_MULTI_PLATFORM_BUILD ?= false
BUILDX_BUILDER_NAME ?= gtbuilder
BASE_IMAGE ?= ubuntu

View File

@@ -131,7 +131,7 @@ docker run -p 127.0.0.1:4000-4003:4000-4003 \
--name greptime --rm \
greptime/greptimedb:latest standalone start \
--http-addr 0.0.0.0:4000 \
--rpc-bind-addr 0.0.0.0:4001 \
--grpc-bind-addr 0.0.0.0:4001 \
--mysql-addr 0.0.0.0:4002 \
--postgres-addr 0.0.0.0:4003
```
@@ -175,17 +175,16 @@ cargo run -- standalone start
## Project Status
> **Status:** RC — marching toward v1.0 GA!
> **GA (v1.0):** March 2026
> **Status:** [v1.0 GA](https://github.com/GreptimeTeam/greptimedb/releases/tag/v1.0.0) — generally available and production-ready! 🎉
- Deployed in production handling billions of data points daily
- Stable APIs, actively maintained, with regular releases ([version info](https://docs.greptime.com/nightly/reference/about-greptimedb-version))
GreptimeDB v1.0 represents a major milestone toward maturity — marking stable APIs, production readiness, and proven performance.
GreptimeDB v1.0 marks a major milestone — stable APIs, production readiness, and proven performance at scale.
**Roadmap:** [v1.0 highlights and release plan](https://greptime.com/blogs/2025-11-05-greptimedb-v1-highlights) and [2026 roadmap](https://greptime.com/blogs/2026-02-11-greptimedb-roadmap-2026).
**Learn more:** [v1.0 highlights](https://greptime.com/blogs/2025-11-05-greptimedb-v1-highlights) and [2026 roadmap](https://greptime.com/blogs/2026-02-11-greptimedb-roadmap-2026).
For production use, we recommend using the latest stable release.
For production use, we recommend v1.0 or later.
If you find this project useful, a ⭐ would mean a lot to us!

View File

@@ -12,7 +12,9 @@ footer = ""
body = """
# {{ version }}
{% if timestamp -%}
Release date: {{ timestamp | date(format="%B %d, %Y") }}
{% endif -%}
{%- set breakings = commits | filter(attribute="breaking", value=true) -%}
{%- if breakings | length > 0 %}
@@ -118,7 +120,10 @@ filter_commits = false
# regex for skipping tags
# skip_tags = ""
# regex for ignoring tags
ignore_tags = ".*-nightly-.*"
# Ignore nightly tags and build-suffixed release tags such as
# v1.0.0-rc.2-13cdfa9b5-20260325-1774407105 so their commits are merged into
# the next visible release section instead of creating extra headings.
ignore_tags = ".*-nightly-.*|^v[0-9]+\\.[0-9]+\\.[0-9]+(-(alpha|beta|rc)\\.[0-9]+)?-[0-9a-f]{7,}-[0-9]{8}-[0-9]+$"
# sort the tags topologically
topo_order = false
# sort the commits inside sections by oldest/newest order

View File

@@ -14,11 +14,12 @@
| --- | -----| ------- | ----------- |
| `default_timezone` | String | Unset | The default timezone of the server. |
| `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. |
| `user_provider` | String | Unset | The user provider for authentication.<br/>Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd" |
| `max_in_flight_write_bytes` | String | Unset | Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
| `write_bytes_exhausted_policy` | String | Unset | Policy when write bytes quota is exhausted.<br/>Options: "wait" (default, 10s timeout), "wait(<duration>)" (e.g., "wait(30s)"), "fail" |
| `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.<br/>By default, it provides services after all regions have been initialized. |
| `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. |
| `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited.<br/>NOTE: This setting affects scan_memory_limit's privileged tier allocation.<br/>When set, 70% of queries get privileged memory access (full scan_memory_limit).<br/>The remaining 30% get standard tier access (70% of scan_memory_limit). |
| `max_concurrent_queries` | Integer | `0` | The maximum concurrent queries allowed to be executed. Zero means unlimited. |
| `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. Enabled by default. |
| `runtime` | -- | -- | The runtime options. |
| `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. |
@@ -69,6 +70,11 @@
| `prom_store` | -- | -- | Prometheus remote storage options |
| `prom_store.enable` | Bool | `true` | Whether to enable Prometheus remote write and read in HTTP API. |
| `prom_store.with_metric_engine` | Bool | `true` | Whether to store the data from Prometheus remote write in metric engine. |
| `prom_store.pending_rows_flush_interval` | String | `0s` | Interval to flush pending rows batcher.<br/>Set to "0s" to disable batching mode in Prometheus Remote Write endpoint |
| `prom_store.max_batch_rows` | Integer | `100000` | Max rows per pending batch before triggering a flush. |
| `prom_store.max_concurrent_flushes` | Integer | `256` | Max number of concurrent batch flushes. |
| `prom_store.worker_channel_capacity` | Integer | `65526` | Capacity of the pending batch worker channel. |
| `prom_store.max_inflight_requests` | Integer | `3000` | Max inflight write requests before backpressure. |
| `wal` | -- | -- | The WAL options. |
| `wal.provider` | String | `raft_engine` | The provider of the WAL.<br/>- `raft_engine`: the wal is stored in the local file system by raft-engine.<br/>- `kafka`: it's remote wal that data is stored in Kafka. |
| `wal.dir` | String | Unset | The directory to store the WAL files.<br/>**It's only used when the provider is `raft_engine`**. |
@@ -139,7 +145,7 @@
| `region_engine.mito.max_background_flushes` | Integer | Auto | Max number of running background flush jobs (default: 1/2 of cpu cores). |
| `region_engine.mito.max_background_compactions` | Integer | Auto | Max number of running background compaction jobs (default: 1/4 of cpu cores). |
| `region_engine.mito.max_background_purges` | Integer | Auto | Max number of running background purge jobs (default: number of cpu cores). |
| `region_engine.mito.experimental_compaction_memory_limit` | String | 0 | Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. |
| `region_engine.mito.experimental_compaction_memory_limit` | String | 0 | Memory budget for compaction tasks.<br/>Supports absolute size (e.g., "2GiB", "512MB") or percentage of system memory (e.g., "50%").<br/>Setting it to 0 or "unlimited" disables the limit. |
| `region_engine.mito.experimental_compaction_on_exhausted` | String | wait | Behavior when compaction cannot acquire memory from the budget.<br/>Options: "wait" (default, 10s), "wait(<duration>)", "fail" |
| `region_engine.mito.auto_flush_interval` | String | `1h` | Interval to auto flush a region if it has not flushed yet. |
| `region_engine.mito.global_write_buffer_size` | String | Auto | Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB. |
@@ -157,12 +163,12 @@
| `region_engine.mito.enable_refill_cache_on_read` | Bool | `true` | Enable refilling cache on read operations (default: true).<br/>When disabled, cache refilling on read won't happen. |
| `region_engine.mito.manifest_cache_size` | String | `256MB` | Capacity for manifest cache (default: 256MB). |
| `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. |
| `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. |
| `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. |
| `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. |
| `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.<br/>Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").<br/>Setting it to 0 disables the limit.<br/>NOTE: Works with max_concurrent_queries for tiered memory allocation.<br/>- If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.<br/>- If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access. |
| `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.<br/>Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").<br/>Setting it to 0 disables the limit. |
| `region_engine.mito.scan_memory_on_exhausted` | String | `fail` | Controls what happens when a scan cannot get memory immediately.<br/>"fail" (default) fails fast and is the recommended option for most users.<br/>"wait" / "wait(<duration>)" waits for memory to become available. This is mainly<br/>for advanced tuning in bursty workloads where temporary contention is common and<br/>higher latency is acceptable.<br/>"wait" means "wait(10s)", not unlimited waiting. |
| `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.<br/>To align with the old behavior, the default value is 0 (no restrictions). |
| `region_engine.mito.default_experimental_flat_format` | Bool | `false` | Whether to enable experimental flat format as the default format. |
| `region_engine.mito.default_flat_format` | Bool | `true` | Whether to enable flat format as the default SST format. |
| `region_engine.mito.index` | -- | -- | The options for index in Mito engine. |
| `region_engine.mito.index.aux_path` | String | `""` | Auxiliary directory path for the index in filesystem, used to store intermediate files for<br/>creating the index and staging files for searching the index, defaults to `{data_home}/index_intermediate`.<br/>The default name for this directory is `index_intermediate` for backward compatibility.<br/><br/>This path contains two subdirectories:<br/>- `__intm`: for storing intermediate files used during creating index.<br/>- `staging`: for storing staging files used during searching index. |
| `region_engine.mito.index.staging_size` | String | `2GB` | The max capacity of the staging directory. |
@@ -226,14 +232,12 @@
| --- | -----| ------- | ----------- |
| `default_timezone` | String | Unset | The default timezone of the server. |
| `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. |
| `user_provider` | String | Unset | The user provider for authentication.<br/>Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd" |
| `max_in_flight_write_bytes` | String | Unset | Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
| `write_bytes_exhausted_policy` | String | Unset | Policy when write bytes quota is exhausted.<br/>Options: "wait" (default, 10s timeout), "wait(<duration>)" (e.g., "wait(30s)"), "fail" |
| `runtime` | -- | -- | The runtime options. |
| `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. |
| `runtime.compact_rt_size` | Integer | `4` | The number of threads to execute the runtime for global write operations. |
| `heartbeat` | -- | -- | The heartbeat options. |
| `heartbeat.interval` | String | `18s` | Interval for sending heartbeat messages to the metasrv. |
| `heartbeat.retry_interval` | String | `3s` | Interval for retrying to send heartbeat messages to the metasrv. |
| `http` | -- | -- | The HTTP server options. |
| `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
| `http.timeout` | String | `0s` | HTTP request timeout. Set to 0 to disable timeout. |
@@ -254,7 +258,7 @@
| `grpc.tls.watch` | Bool | `false` | Watch for Certificate and key file change and auto reload.<br/>For now, gRPC tls config does not support auto reload. |
| `internal_grpc` | -- | -- | The internal gRPC server options. Internal gRPC port for nodes inside cluster to access frontend. |
| `internal_grpc.bind_addr` | String | `127.0.0.1:4010` | The address to bind the gRPC server. |
| `internal_grpc.server_addr` | String | `127.0.0.1:4010` | The address advertised to the metasrv, and used for connections from outside the host.<br/>If left empty or unset, the server will automatically use the IP address of the first network interface<br/>on the host, with the same port number as the one specified in `grpc.bind_addr`. |
| `internal_grpc.server_addr` | String | `127.0.0.1:4010` | The address advertised to the metasrv, and used for connections from outside the host.<br/>If left empty or unset, the server will automatically use the IP address of the first network interface<br/>on the host, with the same port number as the one specified in `internal_grpc.bind_addr`. |
| `internal_grpc.runtime_size` | Integer | `8` | The number of server worker threads. |
| `internal_grpc.flight_compression` | String | `arrow_ipc` | Compression mode for frontend side Arrow IPC service. Available options:<br/>- `none`: disable all compression<br/>- `transport`: only enable gRPC transport compression (zstd)<br/>- `arrow_ipc`: only enable Arrow IPC compression (lz4)<br/>- `all`: enable all compression.<br/>Default to `none` |
| `internal_grpc.tls` | -- | -- | internal gRPC server TLS options, see `mysql.tls` section. |
@@ -292,6 +296,11 @@
| `prom_store` | -- | -- | Prometheus remote storage options |
| `prom_store.enable` | Bool | `true` | Whether to enable Prometheus remote write and read in HTTP API. |
| `prom_store.with_metric_engine` | Bool | `true` | Whether to store the data from Prometheus remote write in metric engine. |
| `prom_store.pending_rows_flush_interval` | String | `0s` | Interval to flush pending rows batcher.<br/>Set to "0s" to disable batching mode in Prometheus Remote Write endpoint |
| `prom_store.max_batch_rows` | Integer | `100000` | Max rows per pending batch before triggering a flush. |
| `prom_store.max_concurrent_flushes` | Integer | `256` | Max number of concurrent batch flushes. |
| `prom_store.worker_channel_capacity` | Integer | `65526` | Capacity of the pending batch worker channel. |
| `prom_store.max_inflight_requests` | Integer | `3000` | Max inflight write requests before backpressure. |
| `meta_client` | -- | -- | The metasrv client options. |
| `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
| `meta_client.timeout` | String | `3s` | Operation timeout. |
@@ -352,7 +361,7 @@
| `region_failure_detector_initialization_delay` | String | `10m` | The delay before starting region failure detection.<br/>This delay helps prevent Metasrv from triggering unnecessary region failovers before all Datanodes are fully started.<br/>Especially useful when the cluster is not deployed with GreptimeDB Operator and maintenance mode is not enabled. |
| `allow_region_failover_on_local_wal` | Bool | `false` | Whether to allow region failover on local WAL.<br/>**This option is not recommended to be set to true, because it may lead to data loss during failover.** |
| `node_max_idle_time` | String | `24hours` | Max allowed idle time before removing node info from metasrv memory. |
| `heartbeat_interval` | String | `3s` | Base heartbeat interval for calculating distributed time constants.<br/>The frontend heartbeat interval is 6 times of the base heartbeat interval.<br/>The flownode/datanode heartbeat interval is 1 times of the base heartbeat interval.<br/>e.g., If the base heartbeat interval is 3s, the frontend heartbeat interval is 18s, the flownode/datanode heartbeat interval is 3s.<br/>If you change this value, you need to change the heartbeat interval of the flownode/frontend/datanode accordingly. |
| `heartbeat_interval` | String | `3s` | Base heartbeat interval for calculating distributed time constants.<br/>The frontend heartbeat interval is 6 times of the base heartbeat interval.<br/>The flownode/datanode heartbeat interval is 1 times of the base heartbeat interval.<br/>e.g., If the base heartbeat interval is 3s, the frontend heartbeat interval is 18s, the flownode/datanode heartbeat interval is 3s.<br/>Heartbeat intervals are negotiated from metasrv during handshake; local node configs do not override this. |
| `enable_telemetry` | Bool | `true` | Whether to enable greptimedb telemetry. Enabled by default. |
| `runtime` | -- | -- | The runtime options. |
| `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. |
@@ -368,7 +377,7 @@
| `backend_client.connect_timeout` | String | `3s` | The connect timeout for backend client. |
| `grpc` | -- | -- | The gRPC server options. |
| `grpc.bind_addr` | String | `127.0.0.1:3002` | The address to bind the gRPC server. |
| `grpc.server_addr` | String | `127.0.0.1:3002` | The communication server address for the frontend and datanode to connect to metasrv.<br/>If left empty or unset, the server will automatically use the IP address of the first network interface<br/>on the host, with the same port number as the one specified in `bind_addr`. |
| `grpc.server_addr` | String | `127.0.0.1:3002` | The communication server address for the frontend and datanode to connect to metasrv.<br/>If left empty or unset, the server will automatically use the IP address of the first network interface<br/>on the host, with the same port number as the one specified in `grpc.bind_addr`. |
| `grpc.runtime_size` | Integer | `8` | The number of server worker threads. |
| `grpc.max_recv_message_size` | String | `512MB` | The maximum receive message size for gRPC server. |
| `grpc.max_send_message_size` | String | `512MB` | The maximum send message size for gRPC server. |
@@ -440,7 +449,7 @@
| `require_lease_before_startup` | Bool | `false` | Start services after regions have obtained leases.<br/>It will block the datanode start if it can't receive leases in the heartbeat from metasrv. |
| `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.<br/>By default, it provides services after all regions have been initialized. |
| `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. |
| `max_concurrent_queries` | Integer | `0` | The maximum current queries allowed to be executed. Zero means unlimited.<br/>NOTE: This setting affects scan_memory_limit's privileged tier allocation.<br/>When set, 70% of queries get privileged memory access (full scan_memory_limit).<br/>The remaining 30% get standard tier access (70% of scan_memory_limit). |
| `max_concurrent_queries` | Integer | `0` | The maximum concurrent queries allowed to be executed. Zero means unlimited. |
| `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. Enabled by default. |
| `http` | -- | -- | The HTTP server options. |
| `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
@@ -461,9 +470,6 @@
| `runtime` | -- | -- | The runtime options. |
| `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. |
| `runtime.compact_rt_size` | Integer | `4` | The number of threads to execute the runtime for global write operations. |
| `heartbeat` | -- | -- | The heartbeat options. |
| `heartbeat.interval` | String | `3s` | Interval for sending heartbeat messages to the metasrv. |
| `heartbeat.retry_interval` | String | `3s` | Interval for retrying to send heartbeat messages to the metasrv. |
| `meta_client` | -- | -- | The metasrv client options. |
| `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
| `meta_client.timeout` | String | `3s` | Operation timeout. |
@@ -531,7 +537,7 @@
| `region_engine.mito.max_background_flushes` | Integer | Auto | Max number of running background flush jobs (default: 1/2 of cpu cores). |
| `region_engine.mito.max_background_compactions` | Integer | Auto | Max number of running background compaction jobs (default: 1/4 of cpu cores). |
| `region_engine.mito.max_background_purges` | Integer | Auto | Max number of running background purge jobs (default: number of cpu cores). |
| `region_engine.mito.experimental_compaction_memory_limit` | String | 0 | Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. |
| `region_engine.mito.experimental_compaction_memory_limit` | String | 0 | Memory budget for compaction tasks.<br/>Supports absolute size (e.g., "2GiB", "512MB") or percentage of system memory (e.g., "50%").<br/>Setting it to 0 or "unlimited" disables the limit. |
| `region_engine.mito.experimental_compaction_on_exhausted` | String | wait | Behavior when compaction cannot acquire memory from the budget.<br/>Options: "wait" (default, 10s), "wait(<duration>)", "fail" |
| `region_engine.mito.auto_flush_interval` | String | `1h` | Interval to auto flush a region if it has not flushed yet. |
| `region_engine.mito.global_write_buffer_size` | String | Auto | Global write buffer size for all regions. If not set, it defaults to 1/8 of OS memory, with a maximum of 1GB. |
@@ -549,12 +555,12 @@
| `region_engine.mito.enable_refill_cache_on_read` | Bool | `true` | Enable refilling cache on read operations (default: true).<br/>When disabled, cache refilling on read won't happen. |
| `region_engine.mito.manifest_cache_size` | String | `256MB` | Capacity for manifest cache (default: 256MB). |
| `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. |
| `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. |
| `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. |
| `region_engine.mito.allow_stale_entries` | Bool | `false` | Whether to allow stale WAL entries read during replay. |
| `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.<br/>Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").<br/>Setting it to 0 disables the limit.<br/>NOTE: Works with max_concurrent_queries for tiered memory allocation.<br/>- If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.<br/>- If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access. |
| `region_engine.mito.scan_memory_limit` | String | `50%` | Memory limit for table scans across all queries.<br/>Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").<br/>Setting it to 0 disables the limit. |
| `region_engine.mito.scan_memory_on_exhausted` | String | `fail` | Controls what happens when a scan cannot get memory immediately.<br/>"fail" (default) fails fast and is the recommended option for most users.<br/>"wait" / "wait(<duration>)" waits for memory to become available. This is mainly<br/>for advanced tuning in bursty workloads where temporary contention is common and<br/>higher latency is acceptable.<br/>"wait" means "wait(10s)", not unlimited waiting. |
| `region_engine.mito.min_compaction_interval` | String | `0m` | Minimum time interval between two compactions.<br/>To align with the old behavior, the default value is 0 (no restrictions). |
| `region_engine.mito.default_experimental_flat_format` | Bool | `false` | Whether to enable experimental flat format as the default format. |
| `region_engine.mito.default_flat_format` | Bool | `true` | Whether to enable flat format as the default SST format. |
| `region_engine.mito.index` | -- | -- | The options for index in Mito engine. |
| `region_engine.mito.index.aux_path` | String | `""` | Auxiliary directory path for the index in filesystem, used to store intermediate files for<br/>creating the index and staging files for searching the index, defaults to `{data_home}/index_intermediate`.<br/>The default name for this directory is `index_intermediate` for backward compatibility.<br/><br/>This path contains two subdirectories:<br/>- `__intm`: for storing intermediate files used during creating index.<br/>- `staging`: for storing staging files used during searching index. |
| `region_engine.mito.index.staging_size` | String | `2GB` | The max capacity of the staging directory. |
@@ -614,6 +620,7 @@
| Key | Type | Default | Descriptions |
| --- | -----| ------- | ----------- |
| `node_id` | Integer | Unset | The flownode identifier and should be unique in the cluster. |
| `user_provider` | String | Unset | The user provider for authentication.<br/>Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd" |
| `flow` | -- | -- | flow engine options. |
| `flow.num_workers` | Integer | `0` | The number of flow workers in the flownode.<br/>If unset (or set to 0), the number of CPU cores divided by 2 is used. |
| `flow.batching_mode` | -- | -- | -- |
@@ -623,7 +630,6 @@
| `flow.batching_mode.grpc_conn_timeout` | String | `5s` | The gRPC connection timeout |
| `flow.batching_mode.experimental_grpc_max_retries` | Integer | `3` | The gRPC max retry number |
| `flow.batching_mode.experimental_frontend_scan_timeout` | String | `30s` | Timeout for waiting for an available frontend.<br/>If no frontend becomes available before frontend_scan_timeout elapses, an error is returned,<br/>which prevents the flownode from starting. |
| `flow.batching_mode.experimental_frontend_activity_timeout` | String | `60s` | Frontend activity timeout<br/>if frontend is down(not sending heartbeat) for more than frontend_activity_timeout,<br/>it will be removed from the list that flownode use to connect |
| `flow.batching_mode.experimental_max_filter_num_per_query` | Integer | `20` | Maximum number of filters allowed in a single query |
| `flow.batching_mode.experimental_time_window_merge_threshold` | Integer | `3` | Time window merge distance |
| `flow.batching_mode.read_preference` | String | `Leader` | Read preference of the Frontend client. |
@@ -651,9 +657,6 @@
| `meta_client.metadata_cache_max_capacity` | Integer | `100000` | The configuration about the cache of the metadata. |
| `meta_client.metadata_cache_ttl` | String | `10m` | TTL of the metadata cache. |
| `meta_client.metadata_cache_tti` | String | `5m` | TTI of the metadata cache. |
| `heartbeat` | -- | -- | The heartbeat options. |
| `heartbeat.interval` | String | `3s` | Interval for sending heartbeat messages to the metasrv. |
| `heartbeat.retry_interval` | String | `3s` | Interval for retrying to send heartbeat messages to the metasrv. |
| `logging` | -- | -- | The logging options. |
| `logging.dir` | String | `./greptimedb_data/logs` | The directory to store the log files. If set to empty, logs will not be written to files. |
| `logging.level` | String | Unset | The log level. Can be `info`/`debug`/`warn`/`error`. |

View File

@@ -17,10 +17,7 @@ init_regions_in_background = false
## Parallelism of initializing regions.
init_regions_parallelism = 16
## The maximum current queries allowed to be executed. Zero means unlimited.
## NOTE: This setting affects scan_memory_limit's privileged tier allocation.
## When set, 70% of queries get privileged memory access (full scan_memory_limit).
## The remaining 30% get standard tier access (70% of scan_memory_limit).
## The maximum concurrent queries allowed to be executed. Zero means unlimited.
max_concurrent_queries = 0
## Enable telemetry to collect anonymous usage data. Enabled by default.
@@ -83,14 +80,6 @@ watch = false
## The number of threads to execute the runtime for global write operations.
#+ compact_rt_size = 4
## The heartbeat options.
[heartbeat]
## Interval for sending heartbeat messages to the metasrv.
interval = "3s"
## Interval for retrying to send heartbeat messages to the metasrv.
retry_interval = "3s"
## The metasrv client options.
[meta_client]
## The addresses of the metasrv.
@@ -449,7 +438,9 @@ compress_manifest = false
## @toml2docs:none-default="Auto"
#+ max_background_purges = 8
## Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit.
## Memory budget for compaction tasks.
## Supports absolute size (e.g., "2GiB", "512MB") or percentage of system memory (e.g., "50%").
## Setting it to 0 or "unlimited" disables the limit.
## @toml2docs:none-default="0"
#+ experimental_compaction_memory_limit = "0"
@@ -523,9 +514,6 @@ manifest_cache_size = "256MB"
## Buffer size for SST writing.
sst_write_buffer_size = "8MB"
## Capacity of the channel to send data from parallel scan tasks to the main task.
parallel_scan_channel_size = 32
## Maximum number of SST files to scan concurrently.
max_concurrent_scan_files = 384
@@ -535,17 +523,21 @@ allow_stale_entries = false
## Memory limit for table scans across all queries.
## Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").
## Setting it to 0 disables the limit.
## NOTE: Works with max_concurrent_queries for tiered memory allocation.
## - If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.
## - If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access.
scan_memory_limit = "50%"
## Controls what happens when a scan cannot get memory immediately.
## "fail" (default) fails fast and is the recommended option for most users.
## "wait" / "wait(<duration>)" waits for memory to become available. This is mainly
## for advanced tuning in bursty workloads where temporary contention is common and
## higher latency is acceptable.
## "wait" means "wait(10s)", not unlimited waiting.
scan_memory_on_exhausted = "fail"
## Minimum time interval between two compactions.
## To align with the old behavior, the default value is 0 (no restrictions).
min_compaction_interval = "0m"
## Whether to enable experimental flat format as the default format.
default_experimental_flat_format = false
## Whether to enable flat format as the default SST format.
default_flat_format = true
## The options for index in Mito engine.
[region_engine.mito.index]

View File

@@ -2,6 +2,11 @@
## @toml2docs:none-default
node_id = 14
## The user provider for authentication.
## Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd"
## @toml2docs:none-default
#+ user_provider = "static_user_provider:file:/path/to/users"
## flow engine options.
[flow]
## The number of flow worker in flownode.
@@ -22,10 +27,6 @@ node_id = 14
## if failed to find available frontend after frontend_scan_timeout elapsed, return error
## which prevent flownode from starting
#+experimental_frontend_scan_timeout="30s"
## Frontend activity timeout
## if frontend is down(not sending heartbeat) for more than frontend_activity_timeout,
## it will be removed from the list that flownode use to connect
#+experimental_frontend_activity_timeout="60s"
## Maximum number of filters allowed in a single query
#+experimental_max_filter_num_per_query=20
## Time window merge distance
@@ -96,14 +97,6 @@ metadata_cache_ttl = "10m"
## TTI of the metadata cache.
metadata_cache_tti = "5m"
## The heartbeat options.
[heartbeat]
## Interval for sending heartbeat messages to the metasrv.
interval = "3s"
## Interval for retrying to send heartbeat messages to the metasrv.
retry_interval = "3s"
## The logging options.
[logging]
## The directory to store the log files. If set to empty, logs will not be written to files.

View File

@@ -6,6 +6,11 @@ default_timezone = "UTC"
## @toml2docs:none-default
default_column_prefix = "greptime"
## The user provider for authentication.
## Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd"
## @toml2docs:none-default
#+ user_provider = "static_user_provider:file:/path/to/users"
## Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).
## Set to 0 to disable the limit. Default: "0" (unlimited)
## @toml2docs:none-default
@@ -23,14 +28,6 @@ default_column_prefix = "greptime"
## The number of threads to execute the runtime for global write operations.
#+ compact_rt_size = 4
## The heartbeat options.
[heartbeat]
## Interval for sending heartbeat messages to the metasrv.
interval = "18s"
## Interval for retrying to send heartbeat messages to the metasrv.
retry_interval = "3s"
## The HTTP server options.
[http]
## The address to bind the HTTP server.
@@ -100,7 +97,7 @@ watch = false
bind_addr = "127.0.0.1:4010"
## The address advertised to the metasrv, and used for connections from outside the host.
## If left empty or unset, the server will automatically use the IP address of the first network interface
## on the host, with the same port number as the one specified in `grpc.bind_addr`.
## on the host, with the same port number as the one specified in `internal_grpc.bind_addr`.
server_addr = "127.0.0.1:4010"
## The number of server worker threads.
runtime_size = 8
@@ -214,6 +211,17 @@ enable = true
enable = true
## Whether to store the data from Prometheus remote write in metric engine.
with_metric_engine = true
## Interval to flush pending rows batcher.
## Set to "0s" to disable batching mode in Prometheus Remote Write endpoint
#+pending_rows_flush_interval = "0s"
## Max rows per pending batch before triggering a flush.
#+max_batch_rows = 100000
## Max number of concurrent batch flushes.
#+max_concurrent_flushes = 256
## Capacity of the pending batch worker channel.
#+worker_channel_capacity = 65526
## Max inflight write requests before backpressure.
#+max_inflight_requests = 3000
## The metasrv client options.
[meta_client]

View File

@@ -79,7 +79,7 @@ node_max_idle_time = "24hours"
## The frontend heartbeat interval is 6 times of the base heartbeat interval.
## The flownode/datanode heartbeat interval is 1 times of the base heartbeat interval.
## e.g., If the base heartbeat interval is 3s, the frontend heartbeat interval is 18s, the flownode/datanode heartbeat interval is 3s.
## If you change this value, you need to change the heartbeat interval of the flownode/frontend/datanode accordingly.
## Heartbeat intervals are negotiated from metasrv during handshake; local node configs do not override this.
#+ heartbeat_interval = "3s"
## Whether to enable greptimedb telemetry. Enabled by default.
@@ -136,7 +136,7 @@ ca_cert_path = ""
bind_addr = "127.0.0.1:3002"
## The communication server address for the frontend and datanode to connect to metasrv.
## If left empty or unset, the server will automatically use the IP address of the first network interface
## on the host, with the same port number as the one specified in `bind_addr`.
## on the host, with the same port number as the one specified in `grpc.bind_addr`.
server_addr = "127.0.0.1:3002"
## The number of server worker threads.
runtime_size = 8

View File

@@ -6,6 +6,11 @@ default_timezone = "UTC"
## @toml2docs:none-default
default_column_prefix = "greptime"
## The user provider for authentication.
## Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd"
## @toml2docs:none-default
#+ user_provider = "static_user_provider:file:/path/to/users"
## Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).
## Set to 0 to disable the limit. Default: "0" (unlimited)
## @toml2docs:none-default
@@ -23,10 +28,7 @@ init_regions_in_background = false
## Parallelism of initializing regions.
init_regions_parallelism = 16
## The maximum current queries allowed to be executed. Zero means unlimited.
## NOTE: This setting affects scan_memory_limit's privileged tier allocation.
## When set, 70% of queries get privileged memory access (full scan_memory_limit).
## The remaining 30% get standard tier access (70% of scan_memory_limit).
## The maximum concurrent queries allowed to be executed. Zero means unlimited.
max_concurrent_queries = 0
## Enable telemetry to collect anonymous usage data. Enabled by default.
@@ -176,6 +178,17 @@ enable = true
enable = true
## Whether to store the data from Prometheus remote write in metric engine.
with_metric_engine = true
## Interval to flush pending rows batcher.
## Set to "0s" to disable batching mode in Prometheus Remote Write endpoint
#+pending_rows_flush_interval = "0s"
## Max rows per pending batch before triggering a flush.
#+max_batch_rows = 100000
## Max number of concurrent batch flushes.
#+max_concurrent_flushes = 256
## Capacity of the pending batch worker channel.
#+worker_channel_capacity = 65526
## Max inflight write requests before backpressure.
#+max_inflight_requests = 3000
## The WAL options.
[wal]
@@ -541,7 +554,9 @@ compress_manifest = false
## @toml2docs:none-default="Auto"
#+ max_background_purges = 8
## Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit.
## Memory budget for compaction tasks.
## Supports absolute size (e.g., "2GiB", "512MB") or percentage of system memory (e.g., "50%").
## Setting it to 0 or "unlimited" disables the limit.
## @toml2docs:none-default="0"
#+ experimental_compaction_memory_limit = "0"
@@ -615,9 +630,6 @@ manifest_cache_size = "256MB"
## Buffer size for SST writing.
sst_write_buffer_size = "8MB"
## Capacity of the channel to send data from parallel scan tasks to the main task.
parallel_scan_channel_size = 32
## Maximum number of SST files to scan concurrently.
max_concurrent_scan_files = 384
@@ -627,17 +639,21 @@ allow_stale_entries = false
## Memory limit for table scans across all queries.
## Supports absolute size (e.g., "2GB") or percentage of system memory (e.g., "20%").
## Setting it to 0 disables the limit.
## NOTE: Works with max_concurrent_queries for tiered memory allocation.
## - If max_concurrent_queries is set: 70% of queries get full access, 30% get 70% access.
## - If max_concurrent_queries is 0 (unlimited): first 20 queries get full access, rest get 70% access.
scan_memory_limit = "50%"
## Controls what happens when a scan cannot get memory immediately.
## "fail" (default) fails fast and is the recommended option for most users.
## "wait" / "wait(<duration>)" waits for memory to become available. This is mainly
## for advanced tuning in bursty workloads where temporary contention is common and
## higher latency is acceptable.
## "wait" means "wait(10s)", not unlimited waiting.
scan_memory_on_exhausted = "fail"
## Minimum time interval between two compactions.
## To align with the old behavior, the default value is 0 (no restrictions).
min_compaction_interval = "0m"
## Whether to enable experimental flat format as the default format.
default_experimental_flat_format = false
## Whether to enable flat format as the default SST format.
default_flat_format = true
## The options for index in Mito engine.
[region_engine.mito.index]

View File

@@ -7,7 +7,7 @@ RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
# Install dependencies
RUN ulimit -n 1024000 && yum groupinstall -y 'Development Tools'
RUN yum groupinstall -y 'Development Tools'
RUN yum install -y epel-release \
openssl \
openssl-devel \

View File

@@ -85,8 +85,8 @@ services:
command:
- metasrv
- start
- --rpc-bind-addr=0.0.0.0:3002
- --rpc-server-addr=metasrv:3002
- --grpc-bind-addr=0.0.0.0:3002
- --grpc-server-addr=metasrv:3002
- --store-addrs=etcd0:2379
- --http-addr=0.0.0.0:3000
healthcheck:
@@ -111,8 +111,8 @@ services:
- start
- --node-id=0
- --data-home=/greptimedb_data
- --rpc-bind-addr=0.0.0.0:3001
- --rpc-server-addr=datanode0:3001
- --grpc-bind-addr=0.0.0.0:3001
- --grpc-server-addr=datanode0:3001
- --metasrv-addrs=metasrv:3002
- --http-addr=0.0.0.0:5000
volumes:
@@ -141,7 +141,7 @@ services:
- start
- --metasrv-addrs=metasrv:3002
- --http-addr=0.0.0.0:4000
- --rpc-bind-addr=0.0.0.0:4001
- --grpc-bind-addr=0.0.0.0:4001
- --mysql-addr=0.0.0.0:4002
- --postgres-addr=0.0.0.0:4003
healthcheck:
@@ -166,8 +166,8 @@ services:
- start
- --node-id=0
- --metasrv-addrs=metasrv:3002
- --rpc-bind-addr=0.0.0.0:4004
- --rpc-server-addr=flownode0:4004
- --grpc-bind-addr=0.0.0.0:4004
- --grpc-server-addr=flownode0:4004
- --http-addr=0.0.0.0:4005
depends_on:
frontend0:

View File

@@ -67,6 +67,7 @@ snapshot-20250101/
- Self-contained (all information needed for restore)
- Immutable (content never changes after creation)
- Verifiable (checksums at file, chunk, and snapshot levels)
- Schema-only snapshots contain only `manifest.json` and `schema/`; `data/` is absent, `chunks` is empty, and later data append is rejected (use `--force` to recreate)
### Chunk
@@ -116,6 +117,8 @@ greptime export create \
--schema-only \
--to s3://my-bucket/snapshots/prod-schema-only
Schema-only snapshots cannot be resumed with data; use `--force` to recreate.
# Export with specific format (default: parquet)
greptime export create \
--format csv \
@@ -173,7 +176,9 @@ The manifest is a JSON file containing snapshot metadata and chunk index:
- `snapshot_id`: Unique identifier (UUID)
- `catalog`, `schemas`: Catalog and schema list
- `time_range`: Overall time range covered
- `schema_only`: Whether the snapshot contains schema only
- `chunks[]`: Array of chunk metadata
- `format`: Data format for exported files
- `checksum`: Snapshot-level SHA256 checksum
**Chunk metadata structure**:
@@ -182,7 +187,7 @@ Each chunk entry in the manifest contains:
- `id`: Chunk identifier (sequential number)
- `time_range`: Start and end timestamps
- `status`: Export status (Pending, Completed, Failed)
- `status`: Export status (Pending, InProgress, Completed, Failed)
- `files`: List of data files in the chunk directory
- `checksum`: Chunk-level checksum for integrity verification
@@ -292,9 +297,9 @@ Checksums are verified during import before data is written to the database.
**Resume capability**:
- Manifest tracks chunk status (Pending, Completed, Failed)
- Manifest tracks chunk status (Pending, InProgress, Completed, Failed)
- Export/import automatically resumes when executed on existing snapshot
- Skips completed chunks, retries failed chunks, processes pending chunks
- Skips completed chunks, retries failed/in-progress chunks, processes pending chunks
- Works across process restarts
- Use `--force` (export only) to delete existing snapshot and start over

View File

@@ -0,0 +1,190 @@
---
Feature Name: Flow Batching Sequence-Based Incremental Query Plan (Lite)
Tracking Issue: TBD
Date: 2026-03-16
Author: @discord9
---
# Summary
This RFC proposes a correctness-first incremental query mode for Flow batching.
Flow queries can read only rows with `seq > checkpoint` and advance checkpoints using per-region correctness watermarks.
When incremental reads are stale or correctness cannot be proven, Flow falls back to full recomputation.
# Motivation
Flow batching currently recomputes old data in the same time window on every round. An incremental query mode avoids this repeated work and can therefore improve Flow performance.
# Goals
1. Add opt-in incremental reads (`seq > given_seq`) for Flow.
2. Return per-region correctness watermarks for checkpoint advancement.
3. Keep existing query behavior unchanged unless explicitly enabled.
4. Define deterministic fallback for stale or unprovable incremental reads.
# Non-Goals
1. No business-schema changes (no synthetic watermark columns in result rows).
2. No global throughput optimization in v1 (correctness first).
3. No observational watermark output when correctness is unprovable.
# Proposal
## 1) Query options
Introduce three `QueryContext` extension keys:
- `flow.incremental_after_seqs`
- `flow.incremental_mode`
- `flow.return_region_seq`
These options are opt-in and only affect Flow incremental execution paths.
## 2) Scan mapping
When incremental mode is enabled:
- map `after_seq` to `memtable_min_sequence` (exclusive lower bound)
- keep existing snapshot upper-bound behavior (`memtable_max_sequence`)
Important limitation in v1:
- incremental filtering is correctness-proven only for memtable rows
- SST files do not preserve detailed row-level sequence metadata; they only expose coarser file-level sequence information
- therefore a `seq > checkpoint` read must not be assumed to give precise incremental pruning across memtable->SST flush boundaries
If required incremental parameters are missing or invalid, return an argument error.
## 3) Stale protection
Add a dedicated stale error:
- `IncrementalQueryStale { region_id, given_seq, min_readable_seq }`
Behavior:
- if `given_seq < min_readable_seq`, return stale error
- if `given_seq == min_readable_seq`, query is valid and reads `seq > given_seq`
- if `given_seq > min_readable_seq`, query is also valid and reads `seq > given_seq`
`IncrementalQueryStale` also covers the case where rows newer than the checkpoint have crossed a memtable->SST flush boundary and sequence-precise incremental exclusion can no longer be proven.
In other words, the flush-boundary case is not a separate fallback category in v1; it is one concrete way an incremental cursor becomes stale.
## 4) Watermark return
Extend query metrics with an optional per-region watermark map:
- `region_latest_sequences: Vec<(region_id: u64, latest_sequence: u64)>`
Rules:
- only the terminal metrics of a successful query can advance checkpoints
- for a multi-region query, the watermark must be a complete map or absent
- if correctness is unprovable, business rows may still be returned but the watermark is absent
## 5) Flow state machine
Checkpoint and watermark state are kept only in flownode memory in v1; they are not persisted as durable flow metadata.
Cold start or flownode restart therefore always re-enters through a full snapshot read.
Only after that full query succeeds with a complete correctness watermark may Flow switch back to incremental mode.
Flow starts in full mode, then transitions:
1. Full query succeeds with correctness watermark -> enter incremental mode
2. Incremental query succeeds with correctness watermark -> advance checkpoint
3. Incremental stale/failure -> fallback to full mode
4. Full query without correctness watermark -> remain in full mode
```mermaid
stateDiagram-v2
[*] --> FullSnapshot: Flow starts
state FullSnapshot {
[*] --> RunFull
RunFull --> RunFull: Full query succeeds but watermark is unprovable<br/>no region_latest_sequences returned
}
FullSnapshot --> Incremental: Full query succeeds and correctness watermark is returned<br/>(checkpoint updated)
state Incremental {
[*] --> RunInc
RunInc --> RunInc: Incremental succeeds<br/>(checkpoint advances)
}
Incremental --> FullSnapshot: IncrementalQueryStale<br/>(cursor too old, fallback required)
Incremental --> FullSnapshot: Incremental fails<br/> and fallback policy is triggered
FullSnapshot --> [*]: Flow stops
Incremental --> [*]: Flow stops
```
### Fallback Policy
Fallback to full mode is deterministic and is triggered by any of the following:
1. `IncrementalQueryStale` is returned.
2. Incremental query fails with execution errors.
3. Incremental query succeeds but watermark is absent or incomplete for participating regions.
Policy behavior:
1. Do not advance any checkpoint in the failed/incomplete round.
2. Switch to full mode for the affected flow/window in the next round.
3. Return to incremental mode only after a full query succeeds with a complete correctness watermark map.
### Persistence and recovery model
The v1 design is intentionally correctness-first and keeps the progress cursor lightweight:
1. Watermarks/checkpoints live only in flownode memory; v1 does not persist them separately.
2. On cold start, the flow re-establishes progress by running a successful full-query snapshot read, then resumes incremental mode only after that round returns a complete correctness watermark map.
3. Sequence-precise incremental correctness is currently limited to rows still visible in memtables.
4. Once relevant rows have been flushed into SST, the system cannot use `seq > checkpoint` alone to prove precise incremental exclusion, because SST lacks detailed row-level sequence metadata.
5. In that case the correct behavior is to fall back to full recomputation, not to continue a best-effort incremental scan.
# Distributed and Compatibility Requirements
1. Distributed path must preserve region-level snapshot/read-bound semantics end-to-end.
2. `snapshot_seqs` transport and `flow.*` options must both be carried correctly.
- `snapshot_seqs` means the per-region snapshot upper-bound map: `region_id -> sequence`.
3. New metrics fields must be backward-compatible (old clients ignore unknown fields).
# Rollout Plan
## Phase 1 (MVP, correctness first)
1. Add extension constants and parsing.
2. Add incremental scan mapping and stale detection.
3. Add watermark metrics field and terminal-watermark checkpoint update path.
4. Complete standalone and distributed passthrough.
## Phase 2 (performance and observability)
1. Improve batching key strategy with sequence/watermark context.
2. Optimize watermark serialization overhead.
3. Add metrics: incremental hit rate, fallback rate, fallback window size.
# Testing Plan
1. Unit tests for incremental bounds and stale detection.
2. Query-path tests for extension mapping and watermark semantics.
3. Flow integration tests for full->incremental->fallback transitions.
4. Distributed tests for end-to-end snapshot/watermark propagation.
5. Compatibility tests for old/new client-server combinations.
# Risks
1. Boundary semantic mismatch (`<` vs `<=`) may cause correctness bugs.
2. Incomplete distributed propagation can silently invalidate watermark safety.
3. Frequent fallback can reduce throughput before phase-2 optimizations.
4. Memtable->SST flushes may force more full recomputation than expected until finer-grained SST sequence tracking exists.
# Alternatives
1. Put watermark into business rows (rejected: schema pollution).
2. Add new dedicated Flight message type in v1 (deferred to reduce scope).
# Conclusion
This plan enables a practical, correctness-first incremental path for Flow batching.
It reuses existing sequence scan capability, adds strict stale handling, and advances checkpoints only from correctness-proven per-region watermarks.

View File

@@ -0,0 +1,136 @@
---
Feature Name: Remote Dynamic Filter Propagation
Tracking Issue: N/A
Date: 2026-04-08
Author: @discord9
---
# RFC: Remote Dynamic Filter Propagation
# Summary
This RFC proposes a remote dynamic filter propagation mechanism for distributed queries. It lets frontend-produced dynamic filters reach remote datanode scans through a lightweight control plane, while preserving one rule: remote dynamic filters are an optimization only, never a correctness dependency.
# Motivation
Today, dynamic filters can improve local execution, but they do not automatically propagate to remote datanode scans in distributed queries. As a result, the frontend may already know that a probe-side scan can be narrowed, while the remote scan still runs with a weaker predicate and loses pruning opportunities.
We want a minimal design that:
- propagates dynamic filter updates to remote scans,
- keeps filter identity and lifecycle stable across register/update/unregister,
- and safely degrades when encoding, routing, RPC, or apply logic fails.
# Details
The high-level flow is:
1. A join on the frontend produces an alive dynamic filter.
2. `MergeScanExec` identifies the remote subscribers, generates a stable `filter_id`, and registers the alive filter into a query-scoped registry.
3. The initial remote read establishes the corresponding registration on the datanode side.
4. The frontend registry watches for dynamic filter updates via `wait_update` or generation changes.
5. Later updates and unregister messages are sent through the existing region unary RPC path.
6. The datanode applies these updates to query-scoped remote filter state and scan wrappers.
7. Query finish, cancel, or no-consumer conditions trigger unregister and cleanup.
## Identity
The logical identity of a remote dynamic filter is `query_id + filter_id`.
Region and scan metadata are routing information, not part of filter-state identity. `filter_id` only needs to be stable and unique within a single query.
The current recommendation is to derive `filter_id` from:
- `region_id`
- `producer-local ordinal`
- `canonicalized children fingerprint`
The following should not be included:
- `partition`
- transport metadata
- memory addresses or temporary runtime object ids
## Transport
This design reuses the existing region unary gRPC path:
- `RegionRequest.body.remote_dyn_filter`
- `RemoteDynFilterRequest.oneof action`
- `update`
- `unregister`
The initial remote read is responsible for register and scan setup. The unary RPC path is only for later `update` and `unregister` messages.
## Frontend registry
The frontend uses a query-engine runtime map:
- implementation near `src/query/src/dist_plan/remote_dyn_filter_registry.rs`
- storage model: `query_id -> Arc<RemoteDynFilterRegistry>`
This registry should not live on a single `MergeScanExec`, and it should not be stored in `QueryContext.mutable_session_data`. It is a query execution runtime object that owns watcher tasks, cleanup tail, and fanout state.
The registry lifecycle has three states:
- `Active`: accepts registrations and sends updates
- `Closing`: query ended; stop new registrations, send final cleanup messages, drain in-flight RPCs
- `Closed`: watchers stopped, state removable from the runtime map
The registry may outlive the main query execution briefly for cleanup, but it is not intended to be a long-lived global object.
## Propagation policy
Remote dynamic filters should remain a selective optimization, not an automatic fanout for every filter update.
The frontend may skip remote propagation when the encoded filter becomes too large, fanout cost is too high, or the expected pruning benefit is too small. In those cases, execution should continue with local-only dynamic filtering semantics.
## Responsibilities
On the frontend:
- the join produces alive dynamic filters,
- `MergeScanExec` bridges producers to remote subscribers,
- the registry watches updates and fans out RPCs.
On the datanode:
- the unary handler receives `update` and `unregister`,
- query-scoped remote filter state is keyed by `query_id + filter_id`,
- remote wrappers apply updates through existing predicate and scan refresh paths.
## Failure semantics
All failures must degrade safely:
- encode failure -> local-only filter
- RPC failure -> log/metric and degrade
- early update or missing target -> explicit buffer, drop+metric, or retry policy
- decode or remap failure -> disable remote optimization only
# Alternatives
## Registry on `MergeScanExec`
Rejected because lifecycle and cleanup would become fragmented across multiple bridge or exec instances in the same query.
## Registry in `QueryContext.mutable_session_data`
Rejected because this is the wrong ownership model. The registry is not session metadata; it is a query runtime object with watcher tasks and cleanup behavior.
## Long-lived global manager
Rejected for now because it is heavier than necessary. A query-engine runtime map is sufficient for the current design.
# Drawbacks
- The design introduces extra query runtime state and cleanup logic on both frontend and datanode.
- The initial version only covers the current minimal filter form and leaves larger membership propagation to later work.
- A clear policy is still required for updates that arrive before scan registration.
# Unresolved questions
1. Should children fingerprint canonicalization become a shared helper?
2. What is the strict timing relationship between `is_complete` and final unregister?
3. Does the runtime map need a background sweep task, or is explicit reap enough?
4. How should large build-side membership evolve beyond `IN` in later work?

18
flake.lock generated
View File

@@ -8,11 +8,11 @@
"rust-analyzer-src": "rust-analyzer-src"
},
"locked": {
"lastModified": 1770794449,
"narHash": "sha256-1nFkhcZx9+Sdw5OXwJqp5TxvGncqRqLeK781v0XV3WI=",
"lastModified": 1774250935,
"narHash": "sha256-mWID0WFgTnd9hbEeaPNX+YYWF70JN3r7zBouEqERJOE=",
"owner": "nix-community",
"repo": "fenix",
"rev": "b19d93fdf9761e6101f8cb5765d638bacebd9a1b",
"rev": "64d7705e8c37d650cfb1aa99c24a8ce46597f29e",
"type": "github"
},
"original": {
@@ -41,11 +41,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1770617025,
"narHash": "sha256-1jZvgZoAagZZB6NwGRv2T2ezPy+X6EFDsJm+YSlsvEs=",
"lastModified": 1774244481,
"narHash": "sha256-4XfMXU0DjN83o6HWZoKG9PegCvKvIhNUnRUI19vzTcQ=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "2db38e08fdadcc0ce3232f7279bab59a15b94482",
"rev": "4590696c8693fea477850fe379a01544293ca4e2",
"type": "github"
},
"original": {
@@ -65,11 +65,11 @@
"rust-analyzer-src": {
"flake": false,
"locked": {
"lastModified": 1770702974,
"narHash": "sha256-CbvWu72rpGHK5QynoXwuOnVzxX7njF2LYgk8wRSiAQ0=",
"lastModified": 1774221325,
"narHash": "sha256-aEIdkqB8gtQZtEbogdUb5iyfcZpKIlD3FkG8ANu73/I=",
"owner": "rust-lang",
"repo": "rust-analyzer",
"rev": "07a594815f7c1d6e7e39f21ddeeedb75b21795f4",
"rev": "b42b63f390a4dab14e6efa34a70e67f5b087cc62",
"type": "github"
},
"original": {

View File

@@ -20,7 +20,7 @@
lib = nixpkgs.lib;
rustToolchain = fenix.packages.${system}.fromToolchainName {
name = (lib.importTOML ./rust-toolchain.toml).toolchain.channel;
sha256 = "sha256-GCGEXGZeJySLND0KU5TdtTrqFV76TF3UdvAHSUegSsk=";
sha256 = "sha256-rboGKQLH4eDuiY01SINOqmXUFUNr9F4awoFZGzib17o=";
};
in
{

File diff suppressed because it is too large Load Diff

View File

@@ -142,3 +142,87 @@
rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])
)` | `timeseries` | Elapsed time to persist trigger alert records. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p99` |
| Save Alert Failure Rate | `rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval])` | `timeseries` | Rate of failures when persisting trigger alert records. | `prometheus` | `none` | `__auto` |
# Hotspot
| Title | Query | Type | Description | Datasource | Unit | Legend Format |
| --- | --- | --- | --- | --- | --- | --- |
| Hotspot Regions | `WITH table_stats AS (
SELECT
table_id,
COUNT(*) AS region_count,
SUM(disk_size) AS total_disk_size,
SUM(region_rows) as total_region_rows
FROM information_schema.region_statistics
WHERE region_role = 'Leader'
GROUP BY table_id
HAVING COUNT(*) > 1
)
SELECT
t.table_schema,
t.table_name,
r.region_id,
t.table_id,
r.region_number,
p.partition_description,
ROUND(
r.disk_size * 100.0
/ NULLIF(ts.total_disk_size, 0),
2
) AS disk_size_share_percent,
r.disk_size,
ROUND(
r.region_rows * 100.0
/ NULLIF(ts.total_region_rows, 0),
2
) AS region_rows_share_percent,
r.region_rows
FROM information_schema.region_statistics r
JOIN table_stats ts
ON r.table_id = ts.table_id
JOIN information_schema.tables t
ON r.table_id = t.table_id
LEFT JOIN information_schema.partitions p
ON p.table_schema = t.table_schema
AND p.table_name = t.table_name
AND p.greptime_partition_id = r.region_id
WHERE r.region_role = 'Leader'
ORDER BY region_rows_share_percent DESC
LIMIT 100;` | `table` | | `mysql` | -- | -- |
| Datanode Load(Write) | `greptime_datanode_history_load` | `timeseries` | Write load of each datanode over time. | `prometheus` | `binBps` | `datanode-{{datanode_id}}({{instance}})` |
| Datanode Load(Write) Distribution | `greptime_datanode_history_load` | `piechart` | Distribution of write load across datanodes. | `prometheus` | `binBps` | `datanode-{{datanode_id}}({{instance}})` |
| Datanode Data Distribution | `WITH leader_regions AS (
SELECT
CONCAT(
'datanode-',
p.peer_id,
' (',
p.peer_addr,
')'
) AS datanode,
r.disk_size
FROM information_schema.region_statistics r
JOIN information_schema.region_peers p
ON r.region_id = p.region_id
WHERE r.region_role = 'Leader'
AND p.is_leader = 'Yes'
)
SELECT
datanode,
COUNT(*) AS leader_region_count,
SUM(disk_size) AS data_size
FROM leader_regions
GROUP BY datanode
ORDER BY data_size DESC;` | `piechart` | Distribution of leader regions and data size across datanodes. | `mysql` | `bytes` | -- |

View File

@@ -1153,3 +1153,65 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: __auto
- title: Hotspot
panels:
- title: Hotspot Regions
type: table
queries:
- expr: "WITH table_stats AS (\n SELECT\n table_id,\n COUNT(*) AS region_count,\n SUM(disk_size) AS total_disk_size,\n SUM(region_rows) as total_region_rows\n FROM information_schema.region_statistics\n WHERE region_role = 'Leader'\n GROUP BY table_id\n HAVING COUNT(*) > 1\n)\n\nSELECT\n t.table_schema,\n t.table_name,\n\n r.region_id,\n t.table_id,\n r.region_number,\n\n p.partition_description,\n\n\n ROUND(\n r.disk_size * 100.0\n / NULLIF(ts.total_disk_size, 0),\n 2\n ) AS disk_size_share_percent,\n\n r.disk_size,\n \n ROUND(\n r.region_rows * 100.0\n / NULLIF(ts.total_region_rows, 0),\n 2\n ) AS region_rows_share_percent,\n r.region_rows\n\nFROM information_schema.region_statistics r\n\nJOIN table_stats ts\n ON r.table_id = ts.table_id\n\nJOIN information_schema.tables t\n ON r.table_id = t.table_id\n\nLEFT JOIN information_schema.partitions p\n ON p.table_schema = t.table_schema\n AND p.table_name = t.table_name\n AND p.greptime_partition_id = r.region_id\n\nWHERE r.region_role = 'Leader'\n\nORDER BY region_rows_share_percent DESC\nLIMIT 100;"
datasource:
type: mysql
uid: ${information_schema}
- title: Datanode Load(Write)
type: timeseries
description: Write load of each datanode over time.
unit: binBps
queries:
- expr: greptime_datanode_history_load
datasource:
type: prometheus
uid: ${metrics}
legendFormat: datanode-{{datanode_id}}({{instance}})
- title: Datanode Load(Write) Distribution
type: piechart
description: Distribution of write load across datanodes.
unit: binBps
queries:
- expr: greptime_datanode_history_load
datasource:
type: prometheus
uid: ${metrics}
legendFormat: datanode-{{datanode_id}}({{instance}})
- title: Datanode Data Distribution
type: piechart
description: Distribution of leader regions and data size across datanodes.
unit: bytes
queries:
- expr: |-
WITH leader_regions AS (
SELECT
CONCAT(
'datanode-',
p.peer_id,
' (',
p.peer_addr,
')'
) AS datanode,
r.disk_size
FROM information_schema.region_statistics r
JOIN information_schema.region_peers p
ON r.region_id = p.region_id
WHERE r.region_role = 'Leader'
AND p.is_leader = 'Yes'
)
SELECT
datanode,
COUNT(*) AS leader_region_count,
SUM(disk_size) AS data_size
FROM leader_regions
GROUP BY datanode
ORDER BY data_size DESC;
datasource:
type: mysql
uid: ${information_schema}

File diff suppressed because it is too large Load Diff

View File

@@ -142,3 +142,87 @@
rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])
)` | `timeseries` | Elapsed time to persist trigger alert records. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p99` |
| Save Alert Failure Rate | `rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval])` | `timeseries` | Rate of failures when persisting trigger alert records. | `prometheus` | `none` | `__auto` |
# Hotspot
| Title | Query | Type | Description | Datasource | Unit | Legend Format |
| --- | --- | --- | --- | --- | --- | --- |
| Hotspot Regions | `WITH table_stats AS (
SELECT
table_id,
COUNT(*) AS region_count,
SUM(disk_size) AS total_disk_size,
SUM(region_rows) as total_region_rows
FROM information_schema.region_statistics
WHERE region_role = 'Leader'
GROUP BY table_id
HAVING COUNT(*) > 1
)
SELECT
t.table_schema,
t.table_name,
r.region_id,
t.table_id,
r.region_number,
p.partition_description,
ROUND(
r.disk_size * 100.0
/ NULLIF(ts.total_disk_size, 0),
2
) AS disk_size_share_percent,
r.disk_size,
ROUND(
r.region_rows * 100.0
/ NULLIF(ts.total_region_rows, 0),
2
) AS region_rows_share_percent,
r.region_rows
FROM information_schema.region_statistics r
JOIN table_stats ts
ON r.table_id = ts.table_id
JOIN information_schema.tables t
ON r.table_id = t.table_id
LEFT JOIN information_schema.partitions p
ON p.table_schema = t.table_schema
AND p.table_name = t.table_name
AND p.greptime_partition_id = r.region_id
WHERE r.region_role = 'Leader'
ORDER BY region_rows_share_percent DESC
LIMIT 100;` | `table` | | `mysql` | -- | -- |
| Datanode Load(Write) | `greptime_datanode_history_load` | `timeseries` | Write load of each datanode over time. | `prometheus` | `binBps` | `datanode-{{datanode_id}}({{instance}})` |
| Datanode Load(Write) Distribution | `greptime_datanode_history_load` | `piechart` | Distribution of write load across datanodes. | `prometheus` | `binBps` | `datanode-{{datanode_id}}({{instance}})` |
| Datanode Data Distribution | `WITH leader_regions AS (
SELECT
CONCAT(
'datanode-',
p.peer_id,
' (',
p.peer_addr,
')'
) AS datanode,
r.disk_size
FROM information_schema.region_statistics r
JOIN information_schema.region_peers p
ON r.region_id = p.region_id
WHERE r.region_role = 'Leader'
AND p.is_leader = 'Yes'
)
SELECT
datanode,
COUNT(*) AS leader_region_count,
SUM(disk_size) AS data_size
FROM leader_regions
GROUP BY datanode
ORDER BY data_size DESC;` | `piechart` | Distribution of leader regions and data size across datanodes. | `mysql` | `bytes` | -- |

View File

@@ -1153,3 +1153,65 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: __auto
- title: Hotspot
panels:
- title: Hotspot Regions
type: table
queries:
- expr: "WITH table_stats AS (\n SELECT\n table_id,\n COUNT(*) AS region_count,\n SUM(disk_size) AS total_disk_size,\n SUM(region_rows) as total_region_rows\n FROM information_schema.region_statistics\n WHERE region_role = 'Leader'\n GROUP BY table_id\n HAVING COUNT(*) > 1\n)\n\nSELECT\n t.table_schema,\n t.table_name,\n\n r.region_id,\n t.table_id,\n r.region_number,\n\n p.partition_description,\n\n\n ROUND(\n r.disk_size * 100.0\n / NULLIF(ts.total_disk_size, 0),\n 2\n ) AS disk_size_share_percent,\n\n r.disk_size,\n \n ROUND(\n r.region_rows * 100.0\n / NULLIF(ts.total_region_rows, 0),\n 2\n ) AS region_rows_share_percent,\n r.region_rows\n\nFROM information_schema.region_statistics r\n\nJOIN table_stats ts\n ON r.table_id = ts.table_id\n\nJOIN information_schema.tables t\n ON r.table_id = t.table_id\n\nLEFT JOIN information_schema.partitions p\n ON p.table_schema = t.table_schema\n AND p.table_name = t.table_name\n AND p.greptime_partition_id = r.region_id\n\nWHERE r.region_role = 'Leader'\n\nORDER BY region_rows_share_percent DESC\nLIMIT 100;"
datasource:
type: mysql
uid: ${information_schema}
- title: Datanode Load(Write)
type: timeseries
description: Write load of each datanode over time.
unit: binBps
queries:
- expr: greptime_datanode_history_load
datasource:
type: prometheus
uid: ${metrics}
legendFormat: datanode-{{datanode_id}}({{instance}})
- title: Datanode Load(Write) Distribution
type: piechart
description: Distribution of write load across datanodes.
unit: binBps
queries:
- expr: greptime_datanode_history_load
datasource:
type: prometheus
uid: ${metrics}
legendFormat: datanode-{{datanode_id}}({{instance}})
- title: Datanode Data Distribution
type: piechart
description: Distribution of leader regions and data size across datanodes.
unit: bytes
queries:
- expr: |-
WITH leader_regions AS (
SELECT
CONCAT(
'datanode-',
p.peer_id,
' (',
p.peer_addr,
')'
) AS datanode,
r.disk_size
FROM information_schema.region_statistics r
JOIN information_schema.region_peers p
ON r.region_id = p.region_id
WHERE r.region_role = 'Leader'
AND p.is_leader = 'Yes'
)
SELECT
datanode,
COUNT(*) AS leader_region_count,
SUM(disk_size) AS data_size
FROM leader_regions
GROUP BY datanode
ORDER BY data_size DESC;
datasource:
type: mysql
uid: ${information_schema}

View File

@@ -1,2 +1,2 @@
[toolchain]
channel = "nightly-2025-10-01"
channel = "nightly-2026-03-21"

View File

@@ -444,7 +444,7 @@ impl TryFrom<ConcreteDataType> for ColumnDataTypeWrapper {
JsonFormat::Jsonb => Some(ColumnDataTypeExtension {
type_ext: Some(TypeExt::JsonType(JsonTypeExtension::JsonBinary.into())),
}),
JsonFormat::Native(native_type) => {
JsonFormat::Json2(native_type) => {
if native_type.is_null() {
None
} else {
@@ -919,6 +919,7 @@ pub fn encode_json_value(value: JsonValue) -> v1::JsonValue {
.collect::<Vec<_>>();
Some(json_value::Value::Object(JsonObject { entries }))
}
JsonVariant::Variant(x) => Some(json_value::Value::Variant(x)),
};
v1::JsonValue { value }
}
@@ -952,6 +953,7 @@ fn decode_json_value(value: &v1::JsonValue) -> JsonValueRef<'_> {
})
.collect::<BTreeMap<_, _>>()
.into(),
json_value::Value::Variant(x) => x.as_slice().into(),
}
}

View File

@@ -12,8 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#![feature(assert_matches)]
use std::assert_matches::assert_matches;
use std::assert_matches;
use std::sync::Arc;
use api::v1::greptime_request::Request;

View File

@@ -9,6 +9,6 @@ catalog.workspace = true
common-error.workspace = true
common-macro.workspace = true
common-meta.workspace = true
moka.workspace = true
moka = { workspace = true, features = ["future"] }
partition.workspace = true
snafu.workspace = true

View File

@@ -20,6 +20,6 @@ mod table_cache;
pub use builder::{
CatalogManagerConfigurator, CatalogManagerConfiguratorRef, KvBackendCatalogManagerBuilder,
};
pub use client::{CachedKvBackend, CachedKvBackendBuilder, MetaKvBackend};
pub use client::{CachedKvBackend, CachedKvBackendBuilder, new_read_only_meta_kv_backend};
pub use manager::KvBackendCatalogManager;
pub use table_cache::{TableCache, TableCacheRef, new_table_cache};

View File

@@ -21,7 +21,10 @@ use std::time::Duration;
use common_error::ext::BoxedError;
use common_meta::cache_invalidator::KvCacheInvalidator;
use common_meta::error::Error::CacheNotGet;
use common_meta::error::{CacheNotGetSnafu, Error, ExternalSnafu, GetKvCacheSnafu, Result};
use common_meta::error::{
CacheNotGetSnafu, Error, ExternalSnafu, GetKvCacheSnafu, Result, UnsupportedSnafu,
};
use common_meta::kv_backend::read_only::ReadOnlyKvBackend;
use common_meta::kv_backend::txn::{Txn, TxnResponse};
use common_meta::kv_backend::{KvBackend, KvBackendRef, TxnService};
use common_meta::rpc::KeyValue;
@@ -357,19 +360,35 @@ impl CachedKvBackend {
}
#[derive(Debug)]
pub struct MetaKvBackend {
pub client: Arc<MetaClient>,
pub(crate) struct MetaKvBackend {
client: Arc<MetaClient>,
}
impl MetaKvBackend {
/// Constructs a [MetaKvBackend].
pub fn new(client: Arc<MetaClient>) -> MetaKvBackend {
fn new(client: Arc<MetaClient>) -> MetaKvBackend {
MetaKvBackend { client }
}
}
pub fn new_read_only_meta_kv_backend(client: Arc<MetaClient>) -> KvBackendRef {
Arc::new(ReadOnlyKvBackend::new(Arc::new(MetaKvBackend::new(client))))
}
#[async_trait::async_trait]
impl TxnService for MetaKvBackend {
type Error = Error;
async fn txn(&self, _txn: Txn) -> Result<TxnResponse> {
UnsupportedSnafu {
operation: "MetaKvBackend txn",
}
.fail()
}
fn max_txn_ops(&self) -> usize {
usize::MAX
}
}
/// Implement `KvBackend` trait for `MetaKvBackend` instead of opendal's `Accessor` since
@@ -465,6 +484,9 @@ mod tests {
use std::sync::atomic::{AtomicU32, Ordering};
use async_trait::async_trait;
use common_meta::kv_backend::memory::MemoryKvBackend;
use common_meta::kv_backend::read_only::ReadOnlyKvBackend;
use common_meta::kv_backend::txn::{Txn, TxnOp};
use common_meta::kv_backend::{KvBackend, TxnService};
use common_meta::rpc::KeyValue;
use common_meta::rpc::store::{
@@ -473,8 +495,9 @@ mod tests {
PutResponse, RangeRequest, RangeResponse,
};
use dashmap::DashMap;
use meta_client::client::MetaClientBuilder;
use super::CachedKvBackend;
use super::{CachedKvBackend, new_read_only_meta_kv_backend};
#[derive(Default)]
pub struct SimpleKvBackend {
@@ -579,6 +602,62 @@ mod tests {
}
}
#[tokio::test]
async fn test_cached_kv_backend_rejects_writes_with_read_only_inner() {
let inner = Arc::new(MemoryKvBackend::<common_meta::error::Error>::new());
let cached_kv = CachedKvBackend::wrap(Arc::new(ReadOnlyKvBackend::new(inner)));
let err = cached_kv
.put(PutRequest {
key: b"k1".to_vec(),
value: b"v1".to_vec(),
prev_kv: false,
})
.await
.unwrap_err();
assert!(matches!(
err,
common_meta::error::Error::ReadOnlyKvBackend { .. }
));
}
#[tokio::test]
async fn test_read_only_meta_kv_backend_rejects_writes() {
let meta_client = Arc::new(MetaClientBuilder::frontend_default_options().build());
let backend = new_read_only_meta_kv_backend(meta_client);
let err = backend
.put(PutRequest {
key: b"k1".to_vec(),
value: b"v1".to_vec(),
prev_kv: false,
})
.await
.unwrap_err();
assert!(matches!(
err,
common_meta::error::Error::ReadOnlyKvBackend { .. }
));
}
#[tokio::test]
async fn test_read_only_meta_kv_backend_does_not_emulate_txn() {
let meta_client = Arc::new(MetaClientBuilder::frontend_default_options().build());
let backend = new_read_only_meta_kv_backend(meta_client);
let result = backend
.txn(Txn::new().and_then(vec![TxnOp::Get(b"k1".to_vec())]))
.await;
let err = match result {
Ok(_) => panic!("expected unsupported txn error"),
Err(err) => err,
};
assert!(matches!(err, common_meta::error::Error::Unsupported { .. }));
}
async fn add_some_vals(kv_backend: &impl KvBackend) {
kv_backend
.put(PutRequest {

View File

@@ -65,11 +65,13 @@ fn init_factory(
fn invalidator<'a>(
cache: &'a Cache<TableName, TableRef>,
ident: &'a CacheIdent,
idents: &'a [&CacheIdent],
) -> BoxFuture<'a, MetaResult<()>> {
Box::pin(async move {
if let CacheIdent::TableName(table_name) = ident {
cache.invalidate(table_name).await
for ident in idents {
if let CacheIdent::TableName(table_name) = ident {
cache.invalidate(table_name).await
}
}
Ok(())
})

View File

@@ -12,9 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#![feature(assert_matches)]
#![feature(try_blocks)]
use std::any::Any;
use std::fmt::{Debug, Formatter};
use std::sync::Arc;

View File

@@ -132,15 +132,13 @@ impl CatalogManager for MemoryCatalogManager {
table_name: &str,
_query_ctx: Option<&QueryContext>,
) -> Result<Option<TableRef>> {
let result = try {
self.catalogs
.read()
.unwrap()
.get(catalog)?
.get(schema)?
.get(table_name)
.cloned()?
};
let catalogs = self.catalogs.read().unwrap();
let result = catalogs
.get(catalog)
.and_then(|c| c.get(schema))
.and_then(|s| s.get(table_name))
.cloned();
Ok(result)
}
@@ -149,8 +147,8 @@ impl CatalogManager for MemoryCatalogManager {
.catalogs
.read()
.unwrap()
.iter()
.flat_map(|(_, schema_entries)| schema_entries.values())
.values()
.flat_map(|schema_entries| schema_entries.values())
.flat_map(|tables| tables.values())
.find(|t| t.table_info().ident.table_id == table_id)
.map(|t| t.table_info()))

View File

@@ -58,6 +58,8 @@ pub enum QueryStatement {
Sql(Statement),
// The optional string is the alias of the PromQL query.
Promql(EvalStmt, Option<String>),
/// Logical plan with original query string
Plan(String),
}
impl Display for QueryStatement {
@@ -71,6 +73,7 @@ impl Display for QueryStatement {
write!(f, "{}", eval_stmt)
}
}
QueryStatement::Plan(query) => write!(f, "{}", query),
}
}
}
@@ -170,7 +173,7 @@ impl ProcessManager {
let mut processes = vec![];
if let Some(remote_frontend_selector) = self.frontend_selector.as_ref() {
let frontends = remote_frontend_selector
.select(|node| node.peer.addr != self.server_addr)
.select(|peer| peer.addr != self.server_addr)
.await
.context(error::InvokeFrontendSnafu)?;
for mut f in frontends {
@@ -208,7 +211,7 @@ impl ProcessManager {
.frontend_selector
.as_ref()
.context(error::MetaClientMissingSnafu)?
.select(|node| node.peer.addr == server_addr)
.select(|peer| peer.addr == server_addr)
.await
.context(error::InvokeFrontendSnafu)?;
ensure!(
@@ -369,6 +372,9 @@ impl SlowQueryTimer {
QueryStatement::Sql(stmt) => {
slow_query_event.query = stmt.to_string();
}
QueryStatement::Plan(query) => {
slow_query_event.query = query.clone();
}
}
match self.record_type {
@@ -395,7 +401,7 @@ impl SlowQueryTimer {
impl Drop for SlowQueryTimer {
fn drop(&mut self) {
// Calculate the elaspsed duration since the timer is created.
// Calculate the elapsed duration since the timer is created.
let elapsed = self.start.elapsed();
if elapsed > self.threshold {
// Only capture a portion of slow queries based on sample_ratio.

View File

@@ -139,12 +139,16 @@ impl DataSource for SystemTableDataSource {
&self,
request: ScanRequest,
) -> std::result::Result<SendableRecordBatchStream, BoxedError> {
let projected_schema = match &request.projection {
let projection = request
.projection_input
.as_ref()
.map(|input| input.projection.clone());
let projected_schema = match projection.as_ref() {
Some(projection) => self.try_project(projection)?,
None => self.table.schema(),
};
let projection = request.projection.clone();
let stream = self
.table
.to_stream(request)

View File

@@ -16,6 +16,7 @@ use std::sync::{Arc, Weak};
use common_catalog::consts::INFORMATION_SCHEMA_FLOW_TABLE_ID;
use common_error::ext::BoxedError;
use common_meta::ddl::create_flow::FlowType;
use common_meta::key::FlowId;
use common_meta::key::flow::FlowMetadataManager;
use common_meta::key::flow::flow_info::FlowInfoValue;
@@ -71,6 +72,7 @@ pub const CREATED_TIME: &str = "created_time";
pub const UPDATED_TIME: &str = "updated_time";
pub const LAST_EXECUTION_TIME: &str = "last_execution_time";
pub const SOURCE_TABLE_NAMES: &str = "source_table_names";
pub const FLOWNODE_ADDRS: &str = "flownode_addrs";
/// The `information_schema.flows` table provides information about flows in databases.
#[derive(Debug)]
@@ -95,7 +97,8 @@ impl InformationSchemaFlows {
}
}
/// for complex fields(including [`SOURCE_TABLE_IDS`], [`FLOWNODE_IDS`] and [`OPTIONS`]), it will be serialized to json string for now
/// for complex fields(including [`SOURCE_TABLE_IDS`], [`FLOWNODE_IDS`], [`OPTIONS`] and
/// [`FLOWNODE_ADDRS`]), it will be serialized to json string for now
/// TODO(discord9): use a better way to store complex fields like json type
pub(crate) fn schema() -> SchemaRef {
Arc::new(Schema::new(
@@ -119,6 +122,7 @@ impl InformationSchemaFlows {
true,
),
(SOURCE_TABLE_NAMES, CDT::string_datatype(), true),
(FLOWNODE_ADDRS, CDT::string_datatype(), true),
]
.into_iter()
.map(|(name, ty, nullable)| ColumnSchema::new(name, ty, nullable))
@@ -165,6 +169,10 @@ impl InformationSchemaFlows {
expire_after: flow_info.expire_after(),
eval_interval: flow_info.eval_interval(),
comment,
flow_options: sql::statements::OptionMap::from_filtered_string_map(
flow_info.options(),
&[FlowType::FLOW_TYPE_KEY],
),
query,
};
@@ -239,6 +247,7 @@ struct InformationSchemaFlowsBuilder {
updated_time: TimestampMillisecondVectorBuilder,
last_execution_time: TimestampMillisecondVectorBuilder,
source_table_names: StringVectorBuilder,
flownode_addr_groups: StringVectorBuilder,
}
impl InformationSchemaFlowsBuilder {
@@ -269,6 +278,7 @@ impl InformationSchemaFlowsBuilder {
updated_time: TimestampMillisecondVectorBuilder::with_capacity(INIT_CAPACITY),
last_execution_time: TimestampMillisecondVectorBuilder::with_capacity(INIT_CAPACITY),
source_table_names: StringVectorBuilder::with_capacity(INIT_CAPACITY),
flownode_addr_groups: StringVectorBuilder::with_capacity(INIT_CAPACITY),
}
}
@@ -378,6 +388,21 @@ impl InformationSchemaFlowsBuilder {
.get(&flow_id)
.map(|v| TimestampMillisecond::new(*v))
}));
let flownode_addrs = self
.flow_metadata_manager
.flownode_addrs(flow_id)
.await
.map_err(BoxedError::new)
.context(InternalSnafu)?;
if flownode_addrs.is_empty() {
self.flownode_addr_groups.push(None);
} else {
let flownode_addrs_json =
serde_json::to_string(&flownode_addrs).with_context(|_| JsonSnafu {
input: format!("{:?}", flownode_addrs),
})?;
self.flownode_addr_groups.push(Some(&flownode_addrs_json));
}
let mut source_table_names = vec![];
let catalog_manager = self
@@ -413,6 +438,7 @@ impl InformationSchemaFlowsBuilder {
Arc::new(self.updated_time.finish()),
Arc::new(self.last_execution_time.finish()),
Arc::new(self.source_table_names.finish()),
Arc::new(self.flownode_addr_groups.finish()),
];
RecordBatch::new(self.schema.clone(), columns).context(CreateRecordBatchSnafu)
}

View File

@@ -267,7 +267,7 @@ impl InformationSchemaRegionPeersBuilder {
];
if !predicates.eval(&row) {
return;
continue;
}
self.table_catalogs.push(Some(table_catalog));
@@ -331,3 +331,87 @@ impl DfPartitionStream for InformationSchemaRegionPeers {
))
}
}
// Unit tests for `InformationSchemaRegionPeersBuilder::add_region_peers`.
#[cfg(test)]
mod tests {
use api::v1::meta::Peer;
use arrow::array::AsArray;
use common_meta::rpc::router::{Region, RegionRoute};
use datafusion::common::ScalarValue;
use datafusion::logical_expr::{BinaryExpr, Expr, Operator, col};
use store_api::storage::{RegionId, ScanRequest};
use super::*;
/// Builds a `RegionRoute` for region `(table_id, region_number)` whose leader peer
/// has id `peer_id` and address `127.0.0.1:(3000 + peer_id)`. The route has no
/// followers and every optional leader/policy field is left unset.
fn new_region_route(table_id: u32, region_number: u32, peer_id: u64) -> RegionRoute {
RegionRoute {
region: Region {
id: RegionId::new(table_id, region_number),
..Default::default()
},
leader_peer: Some(Peer {
id: peer_id,
addr: format!("127.0.0.1:{}", 3000 + peer_id),
}),
follower_peers: vec![],
leader_state: None,
leader_down_since: None,
write_route_policy: None,
}
}
/// Regression test: a predicate that rejects an early region must not stop the
/// remaining regions of the same table from being evaluated.
#[test]
fn test_add_region_peers_predicate_filters_correctly() {
let schema = InformationSchemaRegionPeers::schema();
// The catalog manager is a dangling `Weak`; this test only calls
// `add_region_peers` and `finish` on the builder.
let mut builder = InformationSchemaRegionPeersBuilder::new(
schema,
"greptime".to_string(),
Weak::<KvBackendCatalogManager>::new(),
);
let table_id = 1;
// 3 regions: region_number 0, 1, 2
let routes = vec![
new_region_route(table_id, 0, 1),
new_region_route(table_id, 1, 2),
new_region_route(table_id, 2, 3),
];
// Build a predicate that matches only the last region (region_number=2).
// With the old `return` bug, encountering the first non-matching region
// (region_number=0) would exit add_region_peers entirely, so region_number=2
// would never be found.
let target_region_id = RegionId::new(table_id, 2).as_u64();
// Equivalent to the filter `region_id = <target_region_id>`.
let filter = Expr::BinaryExpr(BinaryExpr::new(
Box::new(col(REGION_ID)),
Operator::Eq,
Box::new(Expr::Literal(
ScalarValue::UInt64(Some(target_region_id)),
None,
)),
));
let request = ScanRequest {
filters: vec![filter],
..Default::default()
};
let predicates = Predicates::from_scan_request(&Some(request));
builder.add_region_peers(
"greptime",
"public",
"test_table",
&predicates,
table_id,
&routes,
);
let batch = builder.finish().unwrap();
// Should have exactly 1 row for the matching region
assert_eq!(batch.num_rows(), 1);
// Verify it's the correct region
// NOTE(review): assumes column index 3 of the schema is the region id column — confirm
// against `InformationSchemaRegionPeers::schema()`.
let region_id_col = batch
.column(3)
.as_primitive::<arrow::datatypes::UInt64Type>();
assert_eq!(region_id_col.value(0), target_region_id);
}
}

View File

@@ -63,7 +63,7 @@ impl InformationTable for InformationSchemaSstsManifest {
}
fn to_stream(&self, request: ScanRequest) -> Result<SendableRecordBatchStream> {
let schema = if let Some(p) = &request.projection {
let schema = if let Some(p) = request.projection_indices() {
Arc::new(self.schema.try_project(p).context(ProjectSchemaSnafu)?)
} else {
self.schema.clone()
@@ -117,7 +117,7 @@ impl InformationTable for InformationSchemaSstsStorage {
}
fn to_stream(&self, request: ScanRequest) -> Result<SendableRecordBatchStream> {
let schema = if let Some(p) = &request.projection {
let schema = if let Some(p) = request.projection_indices() {
Arc::new(self.schema.try_project(p).context(ProjectSchemaSnafu)?)
} else {
self.schema.clone()
@@ -172,7 +172,7 @@ impl InformationTable for InformationSchemaSstsIndexMeta {
}
fn to_stream(&self, request: ScanRequest) -> Result<SendableRecordBatchStream> {
let schema = if let Some(p) = &request.projection {
let schema = if let Some(p) = request.projection_indices() {
Arc::new(self.schema.try_project(p).context(ProjectSchemaSnafu)?)
} else {
self.schema.clone()

View File

@@ -372,22 +372,16 @@ impl InformationSchemaTablesBuilder {
self.table_types.push(Some(table_type_text));
self.table_ids.push(Some(table_id));
let data_length = region_stats.iter().map(|stat| stat.sst_size).sum();
let table_rows = region_stats.iter().map(|stat| stat.num_rows).sum();
let index_length = region_stats.iter().map(|stat| stat.index_size).sum();
let data_length: u64 = region_stats.iter().map(|stat| stat.sst_size).sum();
let table_rows: u64 = region_stats.iter().map(|stat| stat.num_rows).sum();
let index_length: u64 = region_stats.iter().map(|stat| stat.index_size).sum();
// It's not precise, but it is acceptable for long-term data storage.
let avg_row_length = if table_rows > 0 {
let total_data_length = data_length
+ region_stats
.iter()
.map(|stat| stat.memtable_size)
.sum::<u64>();
total_data_length / table_rows
} else {
0
};
let total_data_length: u64 = data_length
+ region_stats
.iter()
.map(|stat| stat.memtable_size)
.sum::<u64>();
let avg_row_length = total_data_length.checked_div(table_rows).unwrap_or(0);
self.data_length.push(Some(data_length));
self.index_length.push(Some(index_length));

View File

@@ -74,12 +74,10 @@ impl PGCatalogProvider {
)
.expect("Failed to initialize PgCatalogSchemaProvider");
let mut table_ids = HashMap::new();
let mut table_id = PG_CATALOG_TABLE_ID_START;
for name in PG_CATALOG_TABLES {
table_ids.insert(*name, table_id);
table_id += 1;
}
let table_ids: HashMap<_, _> = (PG_CATALOG_TABLE_ID_START..)
.zip(PG_CATALOG_TABLES.iter())
.map(|(id, name)| (*name, id))
.collect();
let mut provider = Self {
catalog_name,

View File

@@ -15,7 +15,6 @@
use std::collections::HashMap;
use std::sync::Arc;
use bytes::Bytes;
use common_catalog::format_full_table_name;
use common_query::logical_plan::{SubstraitPlanDecoderRef, rename_logical_plan_columns};
use datafusion::common::{ResolvedTableReference, TableReference};
@@ -151,7 +150,7 @@ impl DfTableSourceProvider {
let catalog_list = Arc::new(DummyCatalogList::new(self.catalog_manager.clone()));
let logical_plan = self
.plan_decoder
.decode(Bytes::from(view_info.view_info.clone()), catalog_list, true)
.decode(view_info.view_info.clone().into(), catalog_list, false)
.await
.context(DecodePlanSnafu {
name: &table.table_info().name,
@@ -191,7 +190,7 @@ impl DfTableSourceProvider {
plan_columns
.iter()
.map(|c| c.as_str())
.zip(columns.into_iter())
.zip(columns)
.collect(),
)
.context(ProjectViewColumnsSnafu)?

View File

@@ -44,6 +44,7 @@ common-version.workspace = true
common-wal.workspace = true
datatypes.workspace = true
etcd-client.workspace = true
fs2.workspace = true
futures.workspace = true
humantime.workspace = true
meta-client.workspace = true
@@ -65,6 +66,8 @@ store-api.workspace = true
table.workspace = true
tokio.workspace = true
tracing-appender.workspace = true
url.workspace = true
uuid.workspace = true
[dev-dependencies]
common-meta = { workspace = true, features = ["testing"] }
@@ -72,4 +75,3 @@ common-test-util.workspace = true
common-version.workspace = true
serde.workspace = true
tempfile.workspace = true
url.workspace = true

View File

@@ -220,18 +220,8 @@ impl PrefixedAzblobConnection {
name: "AzBlob",
required: [
(&self.azblob_container, "container"),
(&self.azblob_root, "root"),
(&self.azblob_account_name, "account name"),
(&self.azblob_endpoint, "endpoint"),
],
custom_validator: |missing: &mut Vec<&str>| {
// account_key is only required if sas_token is not provided
if self.azblob_sas_token.is_none()
&& self.azblob_account_key.is_empty()
{
missing.push("account key (when sas_token is not provided)");
}
}
]
)
}
}

View File

@@ -153,17 +153,11 @@ impl StoreConfig {
BackendImpl::PostgresStore => {
let table_name = &self.meta_table_name;
let tls_config = self.tls_config();
let pool = meta_srv::utils::postgres::create_postgres_pool(
Ok(meta_srv::utils::postgres::build_postgres_kv_backend(
store_addrs,
None,
tls_config,
)
.await
.map_err(BoxedError::new)?;
let schema_name = self.meta_schema_name.as_deref();
Ok(common_meta::kv_backend::rds::PgStore::with_pg_pool(
pool,
schema_name,
self.meta_schema_name.as_deref(),
table_name,
max_txn_ops,
self.auto_create_schema,
@@ -175,12 +169,9 @@ impl StoreConfig {
BackendImpl::MysqlStore => {
let table_name = &self.meta_table_name;
let tls_config = self.tls_config();
let pool =
meta_srv::utils::mysql::create_mysql_pool(store_addrs, tls_config.as_ref())
.await
.map_err(BoxedError::new)?;
Ok(common_meta::kv_backend::rds::MySqlStore::with_mysql_pool(
pool,
Ok(meta_srv::utils::mysql::build_mysql_kv_backend(
store_addrs,
tls_config.as_ref(),
table_name,
max_txn_ops,
)

View File

@@ -13,7 +13,12 @@
// limitations under the License.
mod export;
pub mod export_v2;
mod import;
pub mod import_v2;
pub(crate) mod path;
pub mod snapshot_storage;
pub(crate) mod sql;
mod storage_export;
use clap::Subcommand;
@@ -22,15 +27,24 @@ use common_error::ext::BoxedError;
use crate::Tool;
use crate::data::export::ExportCommand;
use crate::data::export_v2::ExportV2Command;
use crate::data::import::ImportCommand;
use crate::data::import_v2::ImportV2Command;
pub(crate) const COPY_PATH_PLACEHOLDER: &str = "<PATH/TO/FILES>";
/// Command for data operations including exporting data from and importing data into GreptimeDB.
#[derive(Subcommand)]
pub enum DataCommand {
/// Export data (V1 - legacy).
Export(ExportCommand),
/// Import data (V1 - legacy).
Import(ImportCommand),
/// Export V2 - JSON-based schema export with manifest support.
#[clap(subcommand)]
ExportV2(ExportV2Command),
/// Import V2 - Import from V2 snapshot.
ImportV2(ImportV2Command),
}
impl DataCommand {
@@ -38,6 +52,8 @@ impl DataCommand {
match self {
DataCommand::Export(cmd) => cmd.build().await,
DataCommand::Import(cmd) => cmd.build().await,
DataCommand::ExportV2(cmd) => cmd.build().await,
DataCommand::ImportV2(cmd) => cmd.build().await,
}
}
}

View File

@@ -107,13 +107,16 @@ pub struct ExportCommand {
#[clap(long, value_parser = humantime::parse_duration)]
timeout: Option<Duration>,
/// The proxy server address to connect, if set, will override the system proxy.
/// The proxy server address to connect.
///
/// The default behavior will use the system proxy if neither `proxy` nor `no_proxy` is set.
/// If set, it overrides the system proxy unless `--no-proxy` is specified.
/// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used.
#[clap(long)]
proxy: Option<String>,
/// Disable proxy server, if set, will not use any proxy.
/// Disable all proxy usage (ignores `--proxy` and system proxy).
///
/// When set and `--proxy` is not provided, this explicitly disables system proxy.
#[clap(long)]
no_proxy: bool,
@@ -173,6 +176,7 @@ impl ExportCommand {
// Treats `None` as `0s` to disable server-side default timeout.
self.timeout.unwrap_or_default(),
proxy,
self.no_proxy,
);
Ok(Box::new(Export {
@@ -454,8 +458,10 @@ impl Export {
/// build operator with preference for file system
async fn build_prefer_fs_operator(&self) -> Result<ObjectStore> {
if self.storage_type.is_remote_storage() && self.ddl_local_dir.is_some() {
let root = self.ddl_local_dir.as_ref().unwrap().clone();
if self.storage_type.is_remote_storage()
&& let Some(ddl_local_dir) = &self.ddl_local_dir
{
let root = ddl_local_dir.clone();
let op = new_fs_object_store(&root).map_err(|e| Error::Other {
source: e,
location: snafu::location!(),
@@ -1078,7 +1084,7 @@ mod tests {
#[tokio::test]
async fn test_export_command_build_with_azblob_empty_account_name() {
// Test Azure Blob with empty account_name
// account_name is optional for Azure Blob validation
let cmd = ExportCommand::parse_from([
"export",
"--addr",
@@ -1086,30 +1092,19 @@ mod tests {
"--azblob",
"--azblob-container",
"test-container",
"--azblob-root",
"test-root",
"--azblob-account-name",
"", // Empty account name
"--azblob-account-key",
MOCK_AZBLOB_ACCOUNT_KEY_B64,
"--azblob-endpoint",
"https://account.blob.core.windows.net",
]);
let result = cmd.build().await;
assert!(result.is_err());
if let Err(err) = result {
assert!(
err.to_string().contains("AzBlob account name must be set"),
"Actual error: {}",
err
);
}
assert!(result.is_ok(), "Empty account_name should succeed");
}
#[tokio::test]
async fn test_export_command_build_with_azblob_missing_account_key() {
// Missing account key
// account_key is optional for Azure Blob validation
let cmd = ExportCommand::parse_from([
"export",
"--addr",
@@ -1117,24 +1112,12 @@ mod tests {
"--azblob",
"--azblob-container",
"test-container",
"--azblob-root",
"test-root",
"--azblob-account-name",
"test-account",
"--azblob-endpoint",
"https://account.blob.core.windows.net",
]);
let result = cmd.build().await;
assert!(result.is_err());
if let Err(err) = result {
assert!(
err.to_string()
.contains("AzBlob account key (when sas_token is not provided) must be set"),
"Actual error: {}",
err
);
}
assert!(result.is_ok(), "Missing account_key should succeed");
}
// ==================== Gap 3: Boundary cases ====================
@@ -1232,21 +1215,58 @@ mod tests {
"--azblob",
"--azblob-container",
"test-container",
"--azblob-root",
"test-root",
"--azblob-account-name",
"test-account",
"--azblob-account-key",
MOCK_AZBLOB_ACCOUNT_KEY_B64,
"--azblob-endpoint",
"https://account.blob.core.windows.net",
// No sas_token
]);
let result = cmd.build().await;
assert!(result.is_ok(), "Minimal AzBlob config should succeed");
}
#[tokio::test]
async fn test_export_command_build_with_azblob_missing_endpoint() {
    // Only the container is supplied, so validation must report the missing endpoint.
    let args = [
        "export",
        "--addr",
        "127.0.0.1:4000",
        "--azblob",
        "--azblob-container",
        "test-container",
    ];
    let cmd = ExportCommand::parse_from(args);

    let result = cmd.build().await;
    assert!(result.is_err());
    if let Err(err) = result {
        let message = err.to_string();
        assert!(
            message.contains("AzBlob endpoint must be set"),
            "Actual error: {}",
            err
        );
    }
}
#[tokio::test]
async fn test_export_command_build_with_azblob_missing_container() {
    // Only the endpoint is supplied, so validation must report the missing container.
    let args = [
        "export",
        "--addr",
        "127.0.0.1:4000",
        "--azblob",
        "--azblob-endpoint",
        "https://account.blob.core.windows.net",
    ];
    let cmd = ExportCommand::parse_from(args);

    let result = cmd.build().await;
    assert!(result.is_err());
    if let Err(err) = result {
        let message = err.to_string();
        assert!(
            message.contains("AzBlob container must be set"),
            "Actual error: {}",
            err
        );
    }
}
#[tokio::test]
async fn test_export_command_build_with_local_and_s3() {
// Both output-dir and S3 - S3 should take precedence
@@ -1281,7 +1301,7 @@ mod tests {
#[tokio::test]
async fn test_export_command_build_with_azblob_only_sas_token() {
// Azure Blob with sas_token but no account_key - should succeed
// Azure Blob with sas_token but no credentials - should still succeed
let cmd = ExportCommand::parse_from([
"export",
"--addr",
@@ -1289,15 +1309,10 @@ mod tests {
"--azblob",
"--azblob-container",
"test-container",
"--azblob-root",
"test-root",
"--azblob-account-name",
"test-account",
"--azblob-endpoint",
"https://account.blob.core.windows.net",
"--azblob-sas-token",
"test-sas-token",
// No account_key
]);
let result = cmd.build().await;
@@ -1318,10 +1333,6 @@ mod tests {
"--azblob",
"--azblob-container",
"test-container",
"--azblob-root",
"test-root",
"--azblob-account-name",
"test-account",
"--azblob-account-key",
"", // Empty account_key is OK if sas_token is provided
"--azblob-endpoint",

View File

@@ -0,0 +1,52 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Export V2 module.
//!
//! This module provides the V2 implementation of database export functionality,
//! featuring:
//! - JSON-based schema export (version-agnostic)
//! - Manifest-based snapshot management
//! - Support for multiple storage backends (S3, OSS, GCS, Azure Blob, local FS)
//! - Resume capability for interrupted exports
//!
//! # Example
//!
//! ```bash
//! # Export schema only
//! greptime cli data export-v2 create \
//! --addr 127.0.0.1:4000 \
//! --to file:///tmp/snapshot \
//! --schema-only
//!
//! # Export with time range
//! greptime cli data export-v2 create \
//! --addr 127.0.0.1:4000 \
//! --to s3://bucket/snapshots/prod-20250101 \
//! --start-time 2025-01-01T00:00:00Z \
//! --end-time 2025-01-31T23:59:59Z
//! ```
mod chunker;
mod command;
mod coordinator;
pub(crate) mod data;
pub mod error;
pub mod extractor;
pub mod manifest;
pub mod schema;
pub use command::ExportV2Command;
#[cfg(test)]
mod tests;

View File

@@ -0,0 +1,103 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::time::Duration;
use chrono::Duration as ChronoDuration;
use crate::data::export_v2::manifest::{ChunkMeta, TimeRange};
/// Splits `time_range` into consecutive chunks that are each at most `window` long.
///
/// - If either bound of the range is missing, a single chunk covering the whole
///   range is returned.
/// - If `start == end`, a single chunk created via `ChunkMeta::skipped` is returned.
/// - If `start > end`, no chunks are returned.
/// - If `window` is zero or cannot be converted into a chrono duration, the range
///   is kept as a single chunk.
///
/// Chunk ids start at 1 and increase by one per chunk. The last chunk is truncated
/// so that it ends exactly at `end`.
pub fn generate_chunks(time_range: &TimeRange, window: Duration) -> Vec<ChunkMeta> {
    // Fallback used whenever the range cannot (or should not) be split.
    let single_chunk = || vec![ChunkMeta::new(1, time_range.clone())];

    let (start, end) = match (time_range.start, time_range.end) {
        (Some(start), Some(end)) => (start, end),
        _ => return single_chunk(),
    };

    if start > end {
        return Vec::new();
    }
    if start == end {
        return vec![ChunkMeta::skipped(1, time_range.clone())];
    }

    let step = match ChronoDuration::from_std(window) {
        Ok(step) if step > ChronoDuration::zero() => step,
        _ => return single_chunk(),
    };

    let mut chunks = Vec::new();
    let mut lower = start;
    let mut next_id = 1;
    while lower < end {
        // Clamp to `end`, both for the final partial window and when adding the
        // window would overflow the timestamp.
        let upper = match lower.checked_add_signed(step) {
            Some(candidate) if candidate < end => candidate,
            _ => end,
        };
        chunks.push(ChunkMeta::new(
            next_id,
            TimeRange::new(Some(lower), Some(upper)),
        ));
        next_id += 1;
        lower = upper;
    }

    chunks
}
#[cfg(test)]
mod tests {
    use chrono::{TimeZone, Utc};

    use super::*;
    use crate::data::export_v2::manifest::ChunkStatus;

    /// Chunk window shared by all tests: one hour.
    const ONE_HOUR: Duration = Duration::from_secs(3600);

    /// Returns 2025-01-01 at `hour`:00:00 UTC.
    fn at_hour(hour: u32) -> chrono::DateTime<Utc> {
        Utc.with_ymd_and_hms(2025, 1, 1, hour, 0, 0).unwrap()
    }

    #[test]
    fn test_generate_chunks_unbounded() {
        // An unbounded range cannot be split and yields exactly one chunk.
        let range = TimeRange::unbounded();

        let chunks = generate_chunks(&range, ONE_HOUR);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].time_range, range);
    }

    #[test]
    fn test_generate_chunks_split() {
        // A three hour range with a one hour window yields three chunks.
        let start = at_hour(0);
        let end = at_hour(3);
        let range = TimeRange::new(Some(start), Some(end));

        let chunks = generate_chunks(&range, ONE_HOUR);

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].time_range.start, Some(start));
        assert_eq!(chunks[2].time_range.end, Some(at_hour(3)));
    }

    #[test]
    fn test_generate_chunks_empty_range() {
        // start == end yields a single chunk that is already marked as skipped.
        let instant = at_hour(0);
        let range = TimeRange::new(Some(instant), Some(instant));

        let chunks = generate_chunks(&range, ONE_HOUR);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].status, ChunkStatus::Skipped);
        assert_eq!(chunks[0].time_range, range);
    }

    #[test]
    fn test_generate_chunks_invalid_range_is_empty() {
        // start > end yields no chunks at all.
        let range = TimeRange::new(Some(at_hour(1)), Some(at_hour(0)));

        let chunks = generate_chunks(&range, ONE_HOUR);

        assert!(chunks.is_empty());
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,166 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use common_telemetry::info;
use crate::common::ObjectStoreConfig;
use crate::data::export_v2::data::{CopyOptions, build_copy_target, execute_copy_database};
use crate::data::export_v2::error::Result;
use crate::data::export_v2::manifest::{ChunkStatus, DataFormat, Manifest, TimeRange};
use crate::data::path::data_dir_for_schema_chunk;
use crate::data::snapshot_storage::{SnapshotStorage, StorageScheme};
use crate::database::DatabaseClient;
/// Borrowed bundle of everything `export_chunk` needs to export a single chunk.
struct ExportContext<'a> {
/// Snapshot storage used to create chunk directories and to list exported files.
storage: &'a dyn SnapshotStorage,
/// Client used to execute the `COPY DATABASE` statements.
database_client: &'a DatabaseClient,
/// URI of the snapshot; parsed for its storage scheme and used to build copy targets.
snapshot_uri: &'a str,
/// Object store configuration passed on to `build_copy_target`.
storage_config: &'a ObjectStoreConfig,
/// Catalog to export from (taken from the manifest).
catalog: &'a str,
/// Schemas to export, one `COPY DATABASE` per schema (taken from the manifest).
schemas: &'a [String],
/// Data format copied into `CopyOptions`.
format: DataFormat,
/// Parallelism value copied into `CopyOptions`.
parallelism: usize,
}
/// Exports every chunk of `manifest` that is not yet `Completed` or `Skipped`.
///
/// Chunks are processed in manifest order. For each chunk the manifest is
/// persisted twice: once after the chunk is marked in progress, and once after
/// the export attempt has been recorded as completed/skipped or failed.
///
/// # Errors
///
/// Returns the first error encountered, either from persisting the manifest or
/// from exporting a chunk. A chunk export error is only returned after the
/// failed state has been written to the manifest; remaining chunks are not
/// attempted.
pub async fn export_data(
    storage: &dyn SnapshotStorage,
    database_client: &DatabaseClient,
    snapshot_uri: &str,
    storage_config: &ObjectStoreConfig,
    manifest: &mut Manifest,
    parallelism: usize,
) -> Result<()> {
    // An empty chunk list simply results in zero iterations.
    for idx in 0..manifest.chunks.len() {
        let already_done = matches!(
            manifest.chunks[idx].status,
            ChunkStatus::Completed | ChunkStatus::Skipped
        );
        if already_done {
            continue;
        }

        // Persist the "in progress" state before doing any work.
        let (chunk_id, time_range) = mark_chunk_in_progress(manifest, idx);
        manifest.touch();
        storage.write_manifest(manifest).await?;

        // The context borrows from the manifest, so keep it scoped to the export
        // call; the manifest is mutated again right afterwards.
        let outcome = {
            let context = ExportContext {
                storage,
                database_client,
                snapshot_uri,
                storage_config,
                catalog: &manifest.catalog,
                schemas: &manifest.schemas,
                format: manifest.format,
                parallelism,
            };
            export_chunk(&context, chunk_id, time_range).await
        };

        // Record the outcome in the manifest, deferring any export error until
        // the manifest has been written.
        let failure = match outcome {
            Ok(files) => {
                mark_chunk_completed(manifest, idx, files);
                None
            }
            Err(err) => {
                mark_chunk_failed(manifest, idx, err.to_string());
                Some(err)
            }
        };

        manifest.touch();
        storage.write_manifest(manifest).await?;

        if let Some(err) = failure {
            return Err(err);
        }
    }

    Ok(())
}
/// Marks the chunk at `idx` as in progress and returns its id and a copy of its
/// time range.
fn mark_chunk_in_progress(manifest: &mut Manifest, idx: usize) -> (u32, TimeRange) {
    manifest.chunks[idx].mark_in_progress();

    let chunk = &manifest.chunks[idx];
    (chunk.id, chunk.time_range.clone())
}
/// Records a successful export of the chunk at `idx`.
///
/// A chunk that produced no files is marked as skipped; otherwise it is marked
/// as completed with the given `files`.
fn mark_chunk_completed(manifest: &mut Manifest, idx: usize, files: Vec<String>) {
    let chunk = &mut manifest.chunks[idx];

    if files.is_empty() {
        chunk.mark_skipped();
        return;
    }
    chunk.mark_completed(files, None);
}
/// Marks the chunk at `idx` as failed, recording `error` as the reason.
fn mark_chunk_failed(manifest: &mut Manifest, idx: usize, error: String) {
    manifest.chunks[idx].mark_failed(error);
}
/// Exports one chunk for every schema in `context` and returns the files produced.
///
/// For each schema a `COPY DATABASE` statement is executed against the chunk's
/// copy target. When the snapshot URI uses the `File` scheme, the chunk directory
/// is created first. Once all schemas are copied, the chunk's files are listed
/// (sorted) and returned.
///
/// # Errors
///
/// Fails on the first error from parsing the snapshot URI, creating a directory,
/// building the copy target, executing the copy, or listing the files.
async fn export_chunk(
    context: &ExportContext<'_>,
    chunk_id: u32,
    time_range: TimeRange,
) -> Result<Vec<String>> {
    // Directories are only created up front for the `File` scheme.
    let is_local_fs = matches!(
        StorageScheme::from_uri(context.snapshot_uri)?,
        StorageScheme::File
    );
    let copy_options = CopyOptions {
        time_range,
        format: context.format,
        parallelism: context.parallelism,
    };

    for schema in context.schemas {
        let chunk_dir = data_dir_for_schema_chunk(schema, chunk_id);
        if is_local_fs {
            context.storage.create_dir_all(&chunk_dir).await?;
        }

        let copy_target = build_copy_target(
            context.snapshot_uri,
            context.storage_config,
            schema,
            chunk_id,
        )?;
        execute_copy_database(
            context.database_client,
            context.catalog,
            schema,
            &copy_target,
            &copy_options,
        )
        .await?;
    }

    let exported = list_chunk_files(context.storage, context.schemas, chunk_id).await?;
    info!("Collected {} files for chunk {}", exported.len(), chunk_id);
    Ok(exported)
}
/// Lists the files stored under the chunk directory of every schema, recursively,
/// and returns them as one sorted list.
async fn list_chunk_files(
    storage: &dyn SnapshotStorage,
    schemas: &[String],
    chunk_id: u32,
) -> Result<Vec<String>> {
    let mut collected = Vec::new();

    for schema in schemas {
        let chunk_dir = data_dir_for_schema_chunk(schema, chunk_id);
        let listed = storage.list_files_recursive(&chunk_dir).await?;
        collected.extend(listed);
    }

    // Sort across all schemas.
    collected.sort();
    Ok(collected)
}

View File

@@ -0,0 +1,538 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use common_base::secrets::{ExposeSecret, SecretString};
use common_telemetry::info;
use object_store::util::{join_path, normalize_path};
use snafu::ResultExt;
use url::Url;
use crate::common::ObjectStoreConfig;
use crate::data::export_v2::error::{DatabaseSnafu, InvalidUriSnafu, Result, UrlParseSnafu};
use crate::data::export_v2::manifest::{DataFormat, TimeRange};
use crate::data::path::data_dir_for_schema_chunk;
use crate::data::snapshot_storage::StorageScheme;
use crate::data::sql::{escape_sql_identifier, escape_sql_literal};
use crate::database::DatabaseClient;
/// Options for a `COPY DATABASE ... TO` statement.
///
/// `execute_copy_database` passes these to `build_with_options` and embeds the
/// result in the statement's `WITH (...)` clause.
pub(super) struct CopyOptions {
/// Data format of the export.
pub(super) format: DataFormat,
/// Time range of the chunk being exported.
/// NOTE(review): presumably rendered as start/end time options — confirm in `build_with_options`.
pub(super) time_range: TimeRange,
/// Parallelism setting for the copy.
/// NOTE(review): how this is rendered is decided by `build_with_options` — confirm there.
pub(super) parallelism: usize,
}
/// Destination of a `COPY DATABASE ... TO` statement.
pub(super) struct CopyTarget {
/// Location string embedded (escaped) as the quoted target of the statement.
pub(super) location: String,
/// SQL fragment appended verbatim after the `WITH (...)` clause; empty for `file://` URIs.
pub(super) connection: String,
/// Values handed to `mask_secrets` when producing a loggable version of the SQL.
secrets: Vec<Option<String>>,
}
/// Source of a `COPY DATABASE ... FROM` statement.
pub(crate) struct CopySource {
/// Location string embedded (escaped) as the quoted source of the statement.
pub(crate) location: String,
/// SQL fragment appended verbatim after the `WITH (...)` clause; empty for `file://` URIs.
pub(crate) connection: String,
/// Values handed to `mask_secrets` when producing a loggable version of the SQL.
secrets: Vec<Option<String>>,
}
impl CopyTarget {
/// Returns `sql` passed through `mask_secrets` with this target's `secrets`,
/// so the statement can be logged.
fn mask_sql(&self, sql: &str) -> String {
mask_secrets(sql, &self.secrets)
}
}
impl CopySource {
/// Returns `sql` passed through `mask_secrets` with this source's `secrets`,
/// so the statement can be logged.
fn mask_sql(&self, sql: &str) -> String {
mask_secrets(sql, &self.secrets)
}
}
pub(super) fn build_copy_target(
snapshot_uri: &str,
storage: &ObjectStoreConfig,
schema: &str,
chunk_id: u32,
) -> Result<CopyTarget> {
let location = build_copy_location(snapshot_uri, storage, schema, chunk_id)?;
Ok(CopyTarget {
location: location.location,
connection: location.connection,
secrets: location.secrets,
})
}
pub(crate) fn build_copy_source(
snapshot_uri: &str,
storage: &ObjectStoreConfig,
schema: &str,
chunk_id: u32,
) -> Result<CopySource> {
let location = build_copy_location(snapshot_uri, storage, schema, chunk_id)?;
Ok(CopySource {
location: location.location,
connection: location.connection,
secrets: location.secrets,
})
}
/// Scheme-independent result of `build_copy_location`, shared by
/// `build_copy_target` and `build_copy_source`.
struct CopyLocation {
/// Full location of the chunk data: a normalized local path for `file://`
/// URIs, otherwise a `<scheme>://<bucket>/<path>` string.
location: String,
/// Connection clause for the storage backend; empty for `file://` URIs.
connection: String,
/// Secret values returned alongside the connection clause; empty for `file://` URIs.
secrets: Vec<Option<String>>,
}
/// Resolves where the data of `schema`/`chunk_id` lives under `snapshot_uri`.
///
/// The chunk's relative directory (from `data_dir_for_schema_chunk`) is appended
/// to the snapshot root:
/// - `File`: the URI is converted to a local path and the joined path is normalized;
///   no connection clause or secrets are produced.
/// - `S3` / `Oss` / `Gcs` / `Azblob`: the bucket and root are extracted from the
///   URI, the location is rendered as `<scheme>://<bucket>/<root + chunk dir>`, and
///   the connection clause plus secrets come from the matching `build_*_connection`
///   helper.
///
/// # Errors
///
/// Fails if the URI cannot be parsed, its scheme is not recognized, a `file://`
/// URI is not an absolute path, the bucket/root cannot be extracted, or the GCS
/// connection cannot be built.
fn build_copy_location(
    snapshot_uri: &str,
    storage: &ObjectStoreConfig,
    schema: &str,
    chunk_id: u32,
) -> Result<CopyLocation> {
    let url = Url::parse(snapshot_uri).context(UrlParseSnafu)?;
    let scheme = StorageScheme::from_uri(snapshot_uri)?;
    let suffix = data_dir_for_schema_chunk(schema, chunk_id);

    // Every arm yields the same triple; the struct is assembled once below.
    let (location, connection, secrets) = match scheme {
        StorageScheme::File => {
            let root = url.to_file_path().map_err(|_| {
                InvalidUriSnafu {
                    uri: snapshot_uri,
                    reason: "file:// URI must use an absolute path like file:///tmp/backup",
                }
                .build()
            })?;
            let local_path = normalize_path(&format!("{}/{}", root.to_string_lossy(), suffix));
            (local_path, String::new(), Vec::new())
        }
        StorageScheme::S3 => {
            let (bucket, root) = extract_bucket_root(&url, snapshot_uri)?;
            let remote = format!("s3://{}/{}", bucket, join_root(&root, &suffix));
            let (clause, masked) = build_s3_connection(storage);
            (remote, clause, masked)
        }
        StorageScheme::Oss => {
            let (bucket, root) = extract_bucket_root(&url, snapshot_uri)?;
            let remote = format!("oss://{}/{}", bucket, join_root(&root, &suffix));
            let (clause, masked) = build_oss_connection(storage);
            (remote, clause, masked)
        }
        StorageScheme::Gcs => {
            let (bucket, root) = extract_bucket_root(&url, snapshot_uri)?;
            let remote = format!("gcs://{}/{}", bucket, join_root(&root, &suffix));
            // Unlike the other backends, building the GCS connection is fallible.
            let (clause, masked) = build_gcs_connection(storage, snapshot_uri)?;
            (remote, clause, masked)
        }
        StorageScheme::Azblob => {
            let (bucket, root) = extract_bucket_root(&url, snapshot_uri)?;
            let remote = format!("azblob://{}/{}", bucket, join_root(&root, &suffix));
            let (clause, masked) = build_azblob_connection(storage);
            (remote, clause, masked)
        }
    };

    Ok(CopyLocation {
        location,
        connection,
        secrets,
    })
}
/// Runs `COPY DATABASE "<catalog>"."<schema>" TO ...` for one export target.
///
/// Identifiers and the location are escaped before being placed in the
/// statement. The statement is logged with the target's secrets redacted;
/// the unredacted text is what is sent to the database.
///
/// # Errors
///
/// Returns a `Database` error when the statement fails.
pub(super) async fn execute_copy_database(
    database_client: &DatabaseClient,
    catalog: &str,
    schema: &str,
    target: &CopyTarget,
    options: &CopyOptions,
) -> Result<()> {
    let sql = format!(
        r#"COPY DATABASE "{}"."{}" TO '{}' WITH ({}){};"#,
        escape_sql_identifier(catalog),
        escape_sql_identifier(schema),
        escape_sql_literal(&target.location),
        build_with_options(options),
        target.connection
    );

    // Only the redacted rendering may reach the log.
    info!("Executing sql: {}", target.mask_sql(&sql));

    let outcome = database_client.sql_in_public(&sql).await;
    outcome.context(DatabaseSnafu)?;
    Ok(())
}
/// Runs `COPY DATABASE "<catalog>"."<schema>" FROM ...` for one import source.
///
/// The only `WITH` option emitted is `FORMAT`. The statement is logged with
/// the source's secrets redacted; the unredacted text is what is executed.
///
/// # Errors
///
/// Returns a `Database` error when the statement fails.
pub(crate) async fn execute_copy_database_from(
    database_client: &DatabaseClient,
    catalog: &str,
    schema: &str,
    source: &CopySource,
    format: DataFormat,
) -> Result<()> {
    let quoted_catalog = escape_sql_identifier(catalog);
    let quoted_schema = escape_sql_identifier(schema);
    let quoted_location = escape_sql_literal(&source.location);
    let sql = format!(
        r#"COPY DATABASE "{}"."{}" FROM '{}' WITH (FORMAT='{}'){};"#,
        quoted_catalog, quoted_schema, quoted_location, format, source.connection
    );

    // Only the redacted rendering may reach the log.
    let loggable = source.mask_sql(&sql);
    info!("Executing sql: {}", loggable);

    database_client
        .sql_in_public(&sql)
        .await
        .context(DatabaseSnafu)?;
    Ok(())
}
/// Renders the comma-separated body of the `WITH (...)` clause for an export.
///
/// `FORMAT` always comes first and `PARALLELISM` always last; `START_TIME`
/// and `END_TIME` are emitted as RFC 3339 literals only for the bounds that
/// are set.
fn build_with_options(options: &CopyOptions) -> String {
    let start_time = options
        .time_range
        .start
        .map(|start| format!("START_TIME='{}'", escape_sql_literal(&start.to_rfc3339())));
    let end_time = options
        .time_range
        .end
        .map(|end| format!("END_TIME='{}'", escape_sql_literal(&end.to_rfc3339())));

    std::iter::once(format!("FORMAT='{}'", options.format))
        .chain(start_time)
        .chain(end_time)
        .chain(std::iter::once(format!(
            "PARALLELISM={}",
            options.parallelism
        )))
        .collect::<Vec<_>>()
        .join(", ")
}
/// Splits a bucket-style URL into `(bucket, root)`.
///
/// The bucket (or container) is the URL host; the root is the URL path with
/// all leading and trailing `/` removed, so it may be empty.
///
/// # Errors
///
/// Returns `InvalidUri` when the URL has no host or an empty one.
fn extract_bucket_root(url: &Url, snapshot_uri: &str) -> Result<(String, String)> {
    let bucket = match url.host_str() {
        Some(host) if !host.is_empty() => host.to_string(),
        _ => {
            return InvalidUriSnafu {
                uri: snapshot_uri,
                reason: "URI must include bucket/container in host",
            }
            .fail();
        }
    };
    let root = url.path().trim_matches('/').to_string();
    Ok((bucket, root))
}
/// Joins `root` and `suffix` with `join_path` and drops any leading `/`, so
/// the result can be appended after `<scheme>://<bucket>/`.
fn join_root(root: &str, suffix: &str) -> String {
    let joined = join_path(root, suffix);
    joined.trim_start_matches('/').to_owned()
}
/// Builds the ` CONNECTION (...)` clause for an S3 snapshot.
///
/// Options are emitted in the fixed order `ACCESS_KEY_ID`,
/// `SECRET_ACCESS_KEY`, `REGION`, `ENDPOINT`, each only when configured.
/// Returns the clause (empty when nothing is configured) together with the
/// two credential values for later redaction.
fn build_s3_connection(storage: &ObjectStoreConfig) -> (String, Vec<Option<String>>) {
    /// Renders one `NAME='escaped value'` option.
    fn quoted(name: &str, value: &str) -> String {
        format!("{}='{}'", name, escape_sql_literal(value))
    }

    let s3 = &storage.s3;
    let access_key_id = expose_optional_secret(&s3.s3_access_key_id);
    let secret_access_key = expose_optional_secret(&s3.s3_secret_access_key);

    let mut options: Vec<String> = Vec::new();
    if let Some(value) = &access_key_id {
        options.push(quoted("ACCESS_KEY_ID", value));
    }
    if let Some(value) = &secret_access_key {
        options.push(quoted("SECRET_ACCESS_KEY", value));
    }
    if let Some(region) = &s3.s3_region {
        options.push(quoted("REGION", region));
    }
    if let Some(endpoint) = &s3.s3_endpoint {
        options.push(quoted("ENDPOINT", endpoint));
    }

    let connection = match options.as_slice() {
        [] => String::new(),
        parts => format!(" CONNECTION ({})", parts.join(", ")),
    };
    (connection, vec![access_key_id, secret_access_key])
}
/// Builds the ` CONNECTION (...)` clause for an OSS snapshot.
///
/// Options are emitted in the fixed order `ACCESS_KEY_ID`,
/// `ACCESS_KEY_SECRET`, `ENDPOINT`; credentials are skipped when absent and
/// the endpoint is skipped when it is an empty string. Returns the clause
/// (empty when nothing applies) and the two credential values for redaction.
fn build_oss_connection(storage: &ObjectStoreConfig) -> (String, Vec<Option<String>>) {
    let access_key_id = expose_optional_secret(&storage.oss.oss_access_key_id);
    let access_key_secret = expose_optional_secret(&storage.oss.oss_access_key_secret);
    let endpoint = Some(storage.oss.oss_endpoint.as_str()).filter(|value| !value.is_empty());

    let options: Vec<String> = [
        ("ACCESS_KEY_ID", access_key_id.as_deref()),
        ("ACCESS_KEY_SECRET", access_key_secret.as_deref()),
        ("ENDPOINT", endpoint),
    ]
    .into_iter()
    .filter_map(|(name, value)| {
        value.map(|value| format!("{}='{}'", name, escape_sql_literal(value)))
    })
    .collect();

    let connection = if options.is_empty() {
        String::new()
    } else {
        format!(" CONNECTION ({})", options.join(", "))
    };
    (connection, vec![access_key_id, access_key_secret])
}
/// Builds the ` CONNECTION (...)` clause for a GCS snapshot.
///
/// Emits `CREDENTIAL` when an inline credential is configured, then `SCOPE`
/// and `ENDPOINT` when those strings are non-empty. Both the credential path
/// and the inline credential are returned for redaction, in that order.
///
/// # Errors
///
/// Returns `InvalidUri` when only `gcs_credential_path` is configured: the
/// path is never sent in the statement, so it is rejected up front.
fn build_gcs_connection(
    storage: &ObjectStoreConfig,
    snapshot_uri: &str,
) -> Result<(String, Vec<Option<String>>)> {
    let gcs = &storage.gcs;
    let credential_path = expose_optional_secret(&gcs.gcs_credential_path);
    let credential = expose_optional_secret(&gcs.gcs_credential);

    if let (None, Some(_)) = (&credential, &credential_path) {
        return InvalidUriSnafu {
            uri: snapshot_uri,
            reason: "gcs_credential_path is not supported for server-side COPY; provide gcs_credential or rely on server-side ADC",
        }
        .fail();
    }

    let mut options: Vec<String> = Vec::new();
    options.extend(
        credential
            .as_deref()
            .map(|value| format!("CREDENTIAL='{}'", escape_sql_literal(value))),
    );
    if !gcs.gcs_scope.is_empty() {
        options.push(format!("SCOPE='{}'", escape_sql_literal(&gcs.gcs_scope)));
    }
    if !gcs.gcs_endpoint.is_empty() {
        options.push(format!(
            "ENDPOINT='{}'",
            escape_sql_literal(&gcs.gcs_endpoint)
        ));
    }

    let connection = match options.as_slice() {
        [] => String::new(),
        parts => format!(" CONNECTION ({})", parts.join(", ")),
    };
    Ok((connection, vec![credential_path, credential]))
}
/// Builds the ` CONNECTION (...)` clause for an Azure Blob snapshot.
///
/// Options are emitted in the fixed order `ACCOUNT_NAME`, `ACCOUNT_KEY`,
/// `SAS_TOKEN`, `ENDPOINT`; optional values are skipped when absent and the
/// endpoint is skipped when empty. The account name, account key and SAS
/// token are all returned for redaction.
fn build_azblob_connection(storage: &ObjectStoreConfig) -> (String, Vec<Option<String>>) {
    let azblob = &storage.azblob;
    let account_name = expose_optional_secret(&azblob.azblob_account_name);
    let account_key = expose_optional_secret(&azblob.azblob_account_key);
    let sas_token = azblob.azblob_sas_token.clone();
    let endpoint = Some(azblob.azblob_endpoint.as_str()).filter(|value| !value.is_empty());

    let candidates = [
        ("ACCOUNT_NAME", account_name.as_deref()),
        ("ACCOUNT_KEY", account_key.as_deref()),
        ("SAS_TOKEN", sas_token.as_deref()),
        ("ENDPOINT", endpoint),
    ];
    let mut options: Vec<String> = Vec::new();
    for (name, value) in candidates {
        if let Some(value) = value {
            options.push(format!("{}='{}'", name, escape_sql_literal(value)));
        }
    }

    let connection = if options.is_empty() {
        String::new()
    } else {
        format!(" CONNECTION ({})", options.join(", "))
    };
    (connection, vec![account_name, account_key, sas_token])
}
/// Copies the plain-text value out of an optional secret, if one is set.
fn expose_optional_secret(secret: &Option<SecretString>) -> Option<String> {
    match secret {
        Some(wrapped) => Some(wrapped.expose_secret().to_owned()),
        None => None,
    }
}
fn mask_secrets(sql: &str, secrets: &[Option<String>]) -> String {
let mut masked = sql.to_string();
for secret in secrets {
if let Some(secret) = secret
&& !secret.is_empty()
{
let escaped = escape_sql_literal(secret);
if escaped != *secret {
masked = masked.replace(&escaped, "[REDACTED]");
}
masked = masked.replace(secret, "[REDACTED]");
}
}
masked
}
// Unit tests for the connection-clause builders, secret masking and
// `file://` location resolution.
#[cfg(test)]
mod tests {
use common_base::secrets::SecretString;
use common_test_util::temp_dir::create_temp_dir;
use super::*;
use crate::common::{PrefixedAzblobConnection, PrefixedGcsConnection, PrefixedOssConnection};
// A non-empty OSS endpoint must be rendered as an `ENDPOINT` option.
#[test]
fn test_build_oss_connection_includes_endpoint() {
let storage = ObjectStoreConfig {
oss: PrefixedOssConnection {
oss_endpoint: "https://oss.example.com".to_string(),
oss_access_key_id: Some(SecretString::from("key_id".to_string())),
oss_access_key_secret: Some(SecretString::from("key_secret".to_string())),
..Default::default()
},
..Default::default()
};
let (connection, _) = build_oss_connection(&storage);
assert!(connection.contains("ENDPOINT='https://oss.example.com'"));
}
// An inline GCS credential, scope and endpoint are all rendered, and no
// `CREDENTIAL_PATH` option is ever emitted.
#[test]
fn test_build_gcs_connection_uses_scope_and_inline_credential() {
let storage = ObjectStoreConfig {
gcs: PrefixedGcsConnection {
gcs_scope: "scope-a".to_string(),
gcs_endpoint: "https://storage.googleapis.com".to_string(),
gcs_credential: Some(SecretString::from("credential-json".to_string())),
..Default::default()
},
..Default::default()
};
let (connection, _) = build_gcs_connection(&storage, "gcs://bucket/root").unwrap();
assert!(connection.contains("CREDENTIAL='credential-json'"));
assert!(connection.contains("SCOPE='scope-a'"));
assert!(connection.contains("ENDPOINT='https://storage.googleapis.com'"));
assert!(!connection.contains("CREDENTIAL_PATH"));
}
// Configuring only a credential path (no inline credential) is an error.
#[test]
fn test_build_gcs_connection_rejects_credential_path_only() {
let storage = ObjectStoreConfig {
gcs: PrefixedGcsConnection {
gcs_scope: "scope-a".to_string(),
gcs_credential_path: Some(SecretString::from("/tmp/creds.json".to_string())),
..Default::default()
},
..Default::default()
};
let error = build_gcs_connection(&storage, "gcs://bucket/root")
.expect_err("credential_path-only should be rejected")
.to_string();
assert!(error.contains("gcs_credential_path is not supported"));
}
// A non-empty Azure Blob endpoint must be rendered as an `ENDPOINT` option.
#[test]
fn test_build_azblob_connection_includes_endpoint() {
let storage = ObjectStoreConfig {
azblob: PrefixedAzblobConnection {
azblob_account_name: Some(SecretString::from("account".to_string())),
azblob_account_key: Some(SecretString::from("key".to_string())),
azblob_endpoint: "https://blob.example.com".to_string(),
..Default::default()
},
..Default::default()
};
let (connection, _) = build_azblob_connection(&storage);
assert!(connection.contains("ENDPOINT='https://blob.example.com'"));
}
// The SAS token appears in the real clause but is redacted once the
// returned secrets are applied with `mask_secrets`.
#[test]
fn test_build_azblob_connection_redacts_sas_token() {
let storage = ObjectStoreConfig {
azblob: PrefixedAzblobConnection {
azblob_account_name: Some(SecretString::from("account".to_string())),
azblob_account_key: Some(SecretString::from("key".to_string())),
azblob_sas_token: Some("sig=secret-token".to_string()),
..Default::default()
},
..Default::default()
};
let (connection, secrets) = build_azblob_connection(&storage);
let masked = mask_secrets(&connection, &secrets);
assert!(connection.contains("SAS_TOKEN='sig=secret-token'"));
assert!(masked.contains("SAS_TOKEN='[REDACTED]'"));
assert!(!masked.contains("sig=secret-token"));
}
// A secret containing a quote is stored SQL-escaped (`''`) in the
// statement; masking must catch that escaped spelling too.
#[test]
fn test_mask_secrets_redacts_sql_escaped_literals() {
let sql =
"COPY DATABASE \"greptime\".\"public\" TO 's3://bucket' CONNECTION (SECRET='ab''cd');";
let masked = mask_secrets(sql, &[Some("ab'cd".to_string())]);
assert!(!masked.contains("ab'cd"));
assert!(!masked.contains("ab''cd"));
assert!(masked.contains("SECRET='[REDACTED]'"));
}
// A `file://` URI with a percent-encoded space must resolve to the decoded
// local path, not to the encoded URI text.
#[test]
fn test_build_copy_target_decodes_file_uri_path() {
let storage = ObjectStoreConfig::default();
let snapshot_root = create_temp_dir("my backup");
let snapshot_uri = Url::from_file_path(snapshot_root.path())
.expect("absolute platform path should convert to file:// URI")
.to_string();
let expected = normalize_path(&format!(
"{}/{}",
snapshot_root.path().to_string_lossy(),
data_dir_for_schema_chunk("public", 7)
));
let target = build_copy_target(&snapshot_uri, &storage, "public", 7)
.expect("file:// copy target should be built");
assert!(snapshot_uri.contains("%20"));
assert!(!target.location.contains("%20"));
assert!(target.location.contains("my backup"));
assert_eq!(target.location, expected);
}
}

View File

@@ -0,0 +1,223 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use snafu::{Location, Snafu};
// Error type of this module. Every variant records the `Location` where it
// was raised; variants wrapping another error keep it as the snafu `source`.
#[derive(Snafu)]
#[snafu(visibility(pub))]
#[stack_trace_debug]
pub enum Error {
// A URI was rejected; `reason` explains why.
#[snafu(display("Invalid URI '{}': {}", uri, reason))]
InvalidUri {
uri: String,
reason: String,
#[snafu(implicit)]
location: Location,
},
// The URI scheme is not one of the supported storage schemes.
#[snafu(display("Unsupported storage scheme: {}", scheme))]
UnsupportedScheme {
scheme: String,
#[snafu(implicit)]
location: Location,
},
// An object-store call failed; `operation` names the call.
#[snafu(display("Storage operation '{}' failed", operation))]
StorageOperation {
operation: String,
#[snafu(source)]
error: object_store::Error,
#[snafu(implicit)]
location: Location,
},
// Manifest JSON could not be deserialized.
#[snafu(display("Failed to parse manifest"))]
ManifestParse {
#[snafu(source)]
error: serde_json::Error,
#[snafu(implicit)]
location: Location,
},
// Manifest could not be serialized to JSON.
#[snafu(display("Failed to serialize manifest"))]
ManifestSerialize {
#[snafu(source)]
error: serde_json::Error,
#[snafu(implicit)]
location: Location,
},
// Bytes of a text file were not valid UTF-8.
#[snafu(display("Failed to decode text file as UTF-8"))]
TextDecode {
#[snafu(source)]
error: std::string::FromUtf8Error,
#[snafu(implicit)]
location: Location,
},
// A resume was requested with a `schema_only` flag that differs from the
// existing snapshot.
#[snafu(display(
"Cannot resume snapshot with a different schema_only mode (existing: {}, requested: {}). Use --force to recreate.",
existing_schema_only,
requested_schema_only
))]
SchemaOnlyModeMismatch {
existing_schema_only: bool,
requested_schema_only: bool,
#[snafu(implicit)]
location: Location,
},
// A resume was requested with a setting (`field`) that differs from the
// existing snapshot.
#[snafu(display(
"Cannot resume snapshot with different {} (existing: {}, requested: {}). Use --force to recreate.",
field,
existing,
requested
))]
ResumeConfigMismatch {
field: String,
existing: String,
requested: String,
#[snafu(implicit)]
location: Location,
},
// A time argument could not be parsed; `input` is the rejected text.
#[snafu(display("Failed to parse time: invalid format: {}", input))]
TimeParseInvalidFormat {
input: String,
#[snafu(implicit)]
location: Location,
},
// The end of a time range precedes its start.
#[snafu(display("Failed to parse time: end_time is before start_time"))]
TimeParseEndBeforeStart {
#[snafu(implicit)]
location: Location,
},
// A chunk time window was given without both time bounds.
#[snafu(display(
"chunk_time_window requires both --start-time and --end-time to be specified"
))]
ChunkTimeWindowRequiresBounds {
#[snafu(implicit)]
location: Location,
},
// `--schema-only` was combined with data-export arguments, listed in `args`.
#[snafu(display("--schema-only cannot be used with data export arguments: {}", args))]
SchemaOnlyArgsNotAllowed {
args: String,
#[snafu(implicit)]
location: Location,
},
// A query returned no result set.
#[snafu(display("Empty result from query"))]
EmptyResult {
#[snafu(implicit)]
location: Location,
},
// A query result cell did not hold the expected value type.
#[snafu(display("Unexpected value type in query result"))]
UnexpectedValueType {
#[snafu(implicit)]
location: Location,
},
// Wraps an error from the crate-level error type; its status code is
// forwarded unchanged by `status_code`.
#[snafu(display("Database error"))]
Database {
#[snafu(source)]
error: crate::error::Error,
#[snafu(implicit)]
location: Location,
},
// No snapshot exists at `uri`.
#[snafu(display("Snapshot not found at '{}'", uri))]
SnapshotNotFound {
uri: String,
#[snafu(implicit)]
location: Location,
},
// `schema` does not exist in `catalog`.
#[snafu(display("Schema '{}' not found in catalog '{}'", schema, catalog))]
SchemaNotFound {
catalog: String,
schema: String,
#[snafu(implicit)]
location: Location,
},
// A URL string could not be parsed.
#[snafu(display("Failed to parse URL"))]
UrlParse {
#[snafu(source)]
error: url::ParseError,
#[snafu(implicit)]
location: Location,
},
// The object-store client could not be constructed.
#[snafu(display("Failed to build object store"))]
BuildObjectStore {
#[snafu(source)]
error: object_store::Error,
#[snafu(implicit)]
location: Location,
},
// The manifest's `version` differs from the one this build expects.
#[snafu(display("Manifest version mismatch: expected {}, found {}", expected, found))]
ManifestVersionMismatch {
expected: u32,
found: u32,
#[snafu(implicit)]
location: Location,
},
}
/// Result alias whose error type is this module's [`Error`].
pub type Result<T> = std::result::Result<T, Error>;
impl ErrorExt for Error {
fn status_code(&self) -> StatusCode {
match self {
Error::InvalidUri { .. }
| Error::UnsupportedScheme { .. }
| Error::SchemaOnlyModeMismatch { .. }
| Error::ResumeConfigMismatch { .. }
| Error::ManifestVersionMismatch { .. }
| Error::SchemaOnlyArgsNotAllowed { .. } => StatusCode::InvalidArguments,
Error::TimeParseInvalidFormat { .. }
| Error::TimeParseEndBeforeStart { .. }
| Error::ChunkTimeWindowRequiresBounds { .. } => StatusCode::InvalidArguments,
Error::StorageOperation { .. }
| Error::ManifestParse { .. }
| Error::ManifestSerialize { .. }
| Error::TextDecode { .. }
| Error::BuildObjectStore { .. } => StatusCode::StorageUnavailable,
Error::EmptyResult { .. }
| Error::UnexpectedValueType { .. }
| Error::UrlParse { .. } => StatusCode::Internal,
Error::Database { error, .. } => error.status_code(),
Error::SnapshotNotFound { .. } => StatusCode::InvalidArguments,
Error::SchemaNotFound { .. } => StatusCode::DatabaseNotFound,
}
}
fn as_any(&self) -> &dyn Any {
self
}
}

View File

@@ -0,0 +1,254 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Schema extraction from information_schema.
//!
//! For V2 DDL-only snapshots, extractor only persists the schema index.
use std::collections::{HashMap, HashSet};
use serde_json::Value;
use snafu::ResultExt;
use crate::data::export_v2::error::{
DatabaseSnafu, EmptyResultSnafu, Result, SchemaNotFoundSnafu, UnexpectedValueTypeSnafu,
};
use crate::data::export_v2::schema::{SchemaDefinition, SchemaSnapshot};
use crate::data::sql::escape_sql_literal;
use crate::database::DatabaseClient;
/// System schemas that should be excluded from export.
///
/// Names are compared exactly (case-sensitively) against the `schema_name`
/// values read from `information_schema.schemata`.
const SYSTEM_SCHEMAS: &[&str] = &["information_schema", "pg_catalog"];
/// Extracts schema definitions from information_schema.
///
/// Borrows both the client and the catalog name for the lifetime `'a`.
pub struct SchemaExtractor<'a> {
/// Client used to run the `information_schema` queries.
client: &'a DatabaseClient,
/// Catalog whose schemas are listed and extracted.
catalog: &'a str,
}
impl<'a> SchemaExtractor<'a> {
    /// Creates an extractor bound to `client` and `catalog`.
    pub fn new(client: &'a DatabaseClient, catalog: &'a str) -> Self {
        Self { client, catalog }
    }

    /// Builds a snapshot containing the definition of each selected schema.
    ///
    /// With `Some(names)` the names are validated and canonicalized first;
    /// with `None` every non-system schema of the catalog is used.
    pub async fn extract(&self, schemas: Option<&[String]>) -> Result<SchemaSnapshot> {
        let mut snapshot = SchemaSnapshot::new();
        let schema_names = if let Some(names) = schemas {
            self.validate_schemas(names).await?
        } else {
            self.get_all_schemas().await?
        };

        for schema_name in &schema_names {
            snapshot.add_schema(self.extract_schema_definition(schema_name).await?);
        }
        Ok(snapshot)
    }

    /// Lists the catalog's schemas, leaving out those in `SYSTEM_SCHEMAS`.
    async fn get_all_schemas(&self) -> Result<Vec<String>> {
        let sql = format!(
            "SELECT schema_name FROM information_schema.schemata \
             WHERE catalog_name = '{}'",
            escape_sql_literal(self.catalog)
        );
        let rows = self.query(&sql).await?;

        // Errors are kept by the filter so that the first bad row still
        // aborts the collection, in row order.
        rows.iter()
            .map(|row| extract_string(row, 0))
            .filter(|name| !matches!(name, Ok(name) if SYSTEM_SCHEMAS.contains(&name.as_str())))
            .collect()
    }

    /// Checks that every requested schema exists and returns the names as
    /// stored in the catalog, without duplicates.
    async fn validate_schemas(&self, schemas: &[String]) -> Result<Vec<String>> {
        let available = self.get_all_schemas().await?;
        dedupe_canonicalized_schemas(schemas, &available, self.catalog)
    }

    /// Reads the name and options of one schema (database).
    ///
    /// Fails with `SchemaNotFound` when the query returns no row. A missing
    /// or empty `options` column yields an empty option map.
    async fn extract_schema_definition(&self, schema: &str) -> Result<SchemaDefinition> {
        let sql = format!(
            "SELECT schema_name, options FROM information_schema.schemata \
             WHERE catalog_name = '{}' AND schema_name = '{}'",
            escape_sql_literal(self.catalog),
            escape_sql_literal(schema)
        );
        let rows = self.query(&sql).await?;
        let Some(row) = rows.first() else {
            return SchemaNotFoundSnafu {
                catalog: self.catalog,
                schema,
            }
            .fail();
        };

        let name = extract_string(row, 0)?;
        let options = match extract_optional_string(row, 1) {
            Some(raw) => parse_options(&raw),
            None => HashMap::new(),
        };
        Ok(SchemaDefinition {
            catalog: self.catalog.to_string(),
            name,
            options,
        })
    }

    /// Runs `sql` and returns its rows; a response without a result set is
    /// reported as `EmptyResult`.
    async fn query(&self, sql: &str) -> Result<Vec<Vec<Value>>> {
        let response = self.client.sql_in_public(sql).await.context(DatabaseSnafu)?;
        match response {
            Some(rows) => Ok(rows),
            None => EmptyResultSnafu.fail(),
        }
    }
}
/// Returns the string stored at `index` of `row`.
///
/// Anything else — a missing cell, `null`, or a non-string value — is an
/// `UnexpectedValueType` error.
fn extract_string(row: &[Value], index: usize) -> Result<String> {
    if let Some(Value::String(text)) = row.get(index) {
        return Ok(text.clone());
    }
    UnexpectedValueTypeSnafu.fail()
}
/// Returns the non-empty string stored at `index` of `row`, or `None` when
/// the cell is missing, not a string, or an empty string.
fn extract_optional_string(row: &[Value], index: usize) -> Option<String> {
    let Value::String(text) = row.get(index)? else {
        return None;
    };
    if text.is_empty() {
        None
    } else {
        Some(text.clone())
    }
}
/// Parses a schema options string into a key/value map.
///
/// Three spellings are accepted, tried in this order:
/// 1. a JSON object whose values are all strings;
/// 2. one `'key'='value'` pair per line;
/// 3. whitespace-separated `key=value` tokens (tokens without `=` are ignored).
///
/// Blank lines are skipped, and a key seen again later overwrites the earlier value.
fn parse_options(options_str: &str) -> HashMap<String, String> {
    if let Ok(json_options) = serde_json::from_str::<HashMap<String, String>>(options_str) {
        return json_options;
    }

    let mut options = HashMap::new();
    let lines = options_str
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty());
    for line in lines {
        match parse_quoted_option_line(line) {
            Some((key, value)) => {
                options.insert(key, value);
            }
            None => options.extend(
                line.split_whitespace()
                    .filter_map(|token| token.split_once('='))
                    .map(|(key, value)| (key.to_string(), value.to_string())),
            ),
        }
    }
    options
}
/// Parses a single `'key'='value'` line.
///
/// The line must start with `'`, contain the separator `'='`, and end with
/// `'`. The key ends at the first separator, so the value may itself contain
/// `'='`. Returns `None` for any other shape.
fn parse_quoted_option_line(line: &str) -> Option<(String, String)> {
    line.strip_prefix('\'')
        .and_then(|unquoted| unquoted.split_once("'='"))
        .and_then(|(key, tail)| {
            tail.strip_suffix('\'')
                .map(|value| (key.to_string(), value.to_string()))
        })
}
/// Maps each requested schema name to its spelling in `available`, matching
/// ASCII case-insensitively, and drops repeated requests.
///
/// The output keeps the order in which schemas were first requested.
///
/// # Errors
///
/// Returns `SchemaNotFound` for the first requested name with no match.
fn dedupe_canonicalized_schemas(
    requested: &[String],
    available: &[String],
    catalog: &str,
) -> Result<Vec<String>> {
    let mut already_added = HashSet::new();
    let mut canonicalized = Vec::new();
    for schema in requested {
        let matched = available
            .iter()
            .find(|candidate| candidate.eq_ignore_ascii_case(schema));
        match matched {
            Some(canonical) => {
                if already_added.insert(canonical.to_ascii_lowercase()) {
                    canonicalized.push(canonical.clone());
                }
            }
            None => return SchemaNotFoundSnafu { catalog, schema }.fail(),
        }
    }
    Ok(canonicalized)
}
// Unit tests for option parsing, row-value extraction and schema-name
// canonicalization.
#[cfg(test)]
mod tests {
use serde_json::Value;
use super::*;
// Options given as a JSON object of strings are parsed directly.
#[test]
fn test_parse_options_json() {
let opts = r#"{"ttl": "30d", "custom": "value"}"#;
let parsed = parse_options(opts);
assert_eq!(parsed.get("ttl"), Some(&"30d".to_string()));
assert_eq!(parsed.get("custom"), Some(&"value".to_string()));
}
// Whitespace-separated `key=value` tokens on one line are all picked up.
#[test]
fn test_parse_options_key_value() {
let opts = "ttl=30d custom=value";
let parsed = parse_options(opts);
assert_eq!(parsed.get("ttl"), Some(&"30d".to_string()));
assert_eq!(parsed.get("custom"), Some(&"value".to_string()));
}
// One `'key'='value'` pair per line; quoted values may contain spaces.
#[test]
fn test_parse_options_schema_display_format() {
let opts = "'ttl'='30d'\n'custom'='value with spaces'\n";
let parsed = parse_options(opts);
assert_eq!(parsed.get("ttl"), Some(&"30d".to_string()));
assert_eq!(parsed.get("custom"), Some(&"value with spaces".to_string()));
}
// A JSON null where a string is required is an error, not an empty string.
#[test]
fn test_extract_string_rejects_null() {
let row = vec![Value::Null];
assert!(extract_string(&row, 0).is_err());
}
// Requests differing only in ASCII case collapse to the stored spelling,
// in first-requested order.
#[test]
fn test_dedupe_canonicalized_schemas() {
let available = vec!["public".to_string(), "test_db".to_string()];
let requested = vec![
"PUBLIC".to_string(),
"public".to_string(),
"Test_Db".to_string(),
];
let canonicalized = dedupe_canonicalized_schemas(&requested, &available, "greptime")
.expect("schemas should be canonicalized");
assert_eq!(canonicalized, vec!["public", "test_db"]);
}
}

View File

@@ -0,0 +1,569 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Manifest data structures for Export/Import V2.
use std::time::Duration;
use std::{fmt, str};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use uuid::Uuid;
use crate::data::export_v2::chunker::generate_chunks;
use crate::data::export_v2::error::{
ChunkTimeWindowRequiresBoundsSnafu, Result as ExportResult, TimeParseEndBeforeStartSnafu,
TimeParseInvalidFormatSnafu,
};
/// Current manifest format version.
///
/// Written to the `version` field of every newly created [`Manifest`].
pub const MANIFEST_VERSION: u32 = 1;
/// Manifest file name within snapshot directory.
pub const MANIFEST_FILE: &str = "manifest.json";
/// Time range for data export (half-open interval: [start, end)).
///
/// Either bound may be absent; an absent bound is omitted from the
/// serialized form instead of being written as `null`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct TimeRange {
/// Start time (inclusive). None means earliest available data.
#[serde(skip_serializing_if = "Option::is_none")]
pub start: Option<DateTime<Utc>>,
/// End time (exclusive). None means current time.
#[serde(skip_serializing_if = "Option::is_none")]
pub end: Option<DateTime<Utc>>,
}
impl TimeRange {
/// Creates a new time range with specified bounds.
pub fn new(start: Option<DateTime<Utc>>, end: Option<DateTime<Utc>>) -> Self {
Self { start, end }
}
/// Creates an unbounded time range (all data).
pub fn unbounded() -> Self {
Self {
start: None,
end: None,
}
}
/// Returns true if this time range is unbounded.
pub fn is_unbounded(&self) -> bool {
self.start.is_none() && self.end.is_none()
}
/// Returns true if both bounds are specified.
pub fn is_bounded(&self) -> bool {
self.start.is_some() && self.end.is_some()
}
/// Parses a time range from optional RFC3339 strings.
pub fn parse(start: Option<&str>, end: Option<&str>) -> ExportResult<Self> {
let start = start.map(parse_time).transpose()?;
let end = end.map(parse_time).transpose()?;
if let (Some(start), Some(end)) = (start, end)
&& end < start
{
return TimeParseEndBeforeStartSnafu.fail();
}
Ok(Self::new(start, end))
}
}
/// Parses an RFC3339 timestamp and converts it to UTC.
///
/// Any parse failure is reported as `TimeParseInvalidFormat` carrying the
/// rejected input.
fn parse_time(input: &str) -> ExportResult<DateTime<Utc>> {
    match DateTime::parse_from_rfc3339(input) {
        Ok(parsed) => Ok(parsed.with_timezone(&Utc)),
        Err(_) => TimeParseInvalidFormatSnafu { input }.fail(),
    }
}
impl Default for TimeRange {
fn default() -> Self {
Self::unbounded()
}
}
/// Status of a chunk during export/import.
///
/// Serialized in snake_case (for example `in_progress`). The default status
/// is `Pending`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
#[serde(rename_all = "snake_case")]
pub enum ChunkStatus {
/// Chunk is pending export.
#[default]
Pending,
/// Chunk export is in progress.
InProgress,
/// Chunk export completed successfully.
Completed,
/// Chunk had no data to export.
Skipped,
/// Chunk export failed.
Failed,
}
/// Metadata for a single chunk of exported data.
///
/// `checksum` and `error` are omitted from the serialized form when unset;
/// `files` falls back to an empty list when missing on deserialization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkMeta {
/// Chunk identifier (sequential number starting from 1).
pub id: u32,
/// Time range covered by this chunk.
pub time_range: TimeRange,
/// Export status.
pub status: ChunkStatus,
/// List of data files in this chunk (relative paths from snapshot root).
#[serde(default)]
pub files: Vec<String>,
/// SHA256 checksum of all files in this chunk (aggregated).
#[serde(skip_serializing_if = "Option::is_none")]
pub checksum: Option<String>,
/// Error message if status is Failed.
#[serde(skip_serializing_if = "Option::is_none")]
pub error: Option<String>,
}
impl ChunkMeta {
    /// Creates a `Pending` chunk with no files, checksum or error.
    pub fn new(id: u32, time_range: TimeRange) -> Self {
        Self {
            id,
            time_range,
            status: ChunkStatus::Pending,
            files: Vec::new(),
            checksum: None,
            error: None,
        }
    }

    /// Creates a chunk that is already in the `Skipped` state.
    pub fn skipped(id: u32, time_range: TimeRange) -> Self {
        Self {
            status: ChunkStatus::Skipped,
            ..Self::new(id, time_range)
        }
    }

    /// Moves the chunk to `InProgress` and clears any previous error.
    pub fn mark_in_progress(&mut self) {
        self.error = None;
        self.status = ChunkStatus::InProgress;
    }

    /// Moves the chunk to `Completed`, recording its files and checksum and
    /// clearing any previous error.
    pub fn mark_completed(&mut self, files: Vec<String>, checksum: Option<String>) {
        self.error = None;
        self.checksum = checksum;
        self.files = files;
        self.status = ChunkStatus::Completed;
    }

    /// Moves the chunk to `Skipped` (no data files were produced) and resets
    /// files, checksum and error.
    pub fn mark_skipped(&mut self) {
        self.error = None;
        self.checksum = None;
        self.files.clear();
        self.status = ChunkStatus::Skipped;
    }

    /// Moves the chunk to `Failed` and stores the error message. Files and
    /// checksum are left as they were.
    pub fn mark_failed(&mut self, error: String) {
        self.error = Some(error);
        self.status = ChunkStatus::Failed;
    }
}
/// Supported data formats for export.
///
/// Variant names are lowercased both in the serialized form and as
/// command-line values. The default is `Parquet`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default, clap::ValueEnum)]
#[serde(rename_all = "lowercase")]
#[value(rename_all = "lowercase")]
pub enum DataFormat {
/// Apache Parquet format (default, recommended for production).
#[default]
Parquet,
/// CSV format (human-readable).
Csv,
/// JSON format (structured text).
Json,
}
impl fmt::Display for DataFormat {
    /// Writes the lowercase format name (`parquet`, `csv` or `json`).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let name = match self {
            DataFormat::Parquet => "parquet",
            DataFormat::Csv => "csv",
            DataFormat::Json => "json",
        };
        f.write_str(name)
    }
}
impl str::FromStr for DataFormat {
    type Err = String;

    /// Parses a format name, ignoring letter case.
    ///
    /// Unknown names produce an error message that quotes the original input
    /// and lists the accepted values.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let normalized = s.to_lowercase();
        let parsed = match normalized.as_str() {
            "parquet" => DataFormat::Parquet,
            "csv" => DataFormat::Csv,
            "json" => DataFormat::Json,
            _ => {
                return Err(format!(
                    "invalid format '{}': expected one of parquet, csv, json",
                    s
                ));
            }
        };
        Ok(parsed)
    }
}
/// Snapshot manifest containing all metadata.
///
/// The manifest is stored as `manifest.json` in the snapshot root directory.
/// It contains:
/// - Snapshot identification (UUID, timestamps)
/// - Scope (catalog, schemas, time range)
/// - Export configuration (format, schema_only)
/// - Chunk metadata for resume support
/// - Integrity checksums
///
/// On deserialization a missing `chunks` field becomes an empty list; an
/// unset `checksum` is omitted when serializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Manifest {
/// Manifest format version for compatibility checking.
pub version: u32,
/// Unique snapshot identifier.
pub snapshot_id: Uuid,
/// Catalog name.
pub catalog: String,
/// List of schemas included in this snapshot.
pub schemas: Vec<String>,
/// Overall time range covered by this snapshot.
pub time_range: TimeRange,
/// Whether this is a schema-only snapshot (no data).
pub schema_only: bool,
/// Data format used for export.
pub format: DataFormat,
/// Chunk metadata (empty for schema-only snapshots).
#[serde(default)]
pub chunks: Vec<ChunkMeta>,
/// Snapshot-level SHA256 checksum (aggregated from all chunks).
#[serde(skip_serializing_if = "Option::is_none")]
pub checksum: Option<String>,
/// Creation timestamp.
pub created_at: DateTime<Utc>,
/// Last updated timestamp.
pub updated_at: DateTime<Utc>,
}
impl Manifest {
pub fn new_for_export(
catalog: String,
schemas: Vec<String>,
schema_only: bool,
time_range: TimeRange,
format: DataFormat,
chunk_time_window: Option<Duration>,
) -> ExportResult<Self> {
if chunk_time_window.is_some() && !time_range.is_bounded() {
return ChunkTimeWindowRequiresBoundsSnafu.fail();
}
let mut manifest = if schema_only {
Self::new_schema_only(catalog, schemas)
} else {
Self::new_full(catalog, schemas, time_range, format)
};
if !schema_only {
manifest.chunks = match chunk_time_window {
Some(window) => generate_chunks(&manifest.time_range, window),
None => generate_single_chunk(&manifest.time_range),
};
manifest.touch();
}
Ok(manifest)
}
/// Creates a new manifest for schema-only export.
pub fn new_schema_only(catalog: String, schemas: Vec<String>) -> Self {
let now = Utc::now();
Self {
version: MANIFEST_VERSION,
snapshot_id: Uuid::new_v4(),
catalog,
schemas,
time_range: TimeRange::unbounded(),
schema_only: true,
format: DataFormat::Parquet,
chunks: vec![],
checksum: None,
created_at: now,
updated_at: now,
}
}
/// Creates a new manifest for full export with time range and format.
pub fn new_full(
catalog: String,
schemas: Vec<String>,
time_range: TimeRange,
format: DataFormat,
) -> Self {
let now = Utc::now();
Self {
version: MANIFEST_VERSION,
snapshot_id: Uuid::new_v4(),
catalog,
schemas,
time_range,
schema_only: false,
format,
chunks: vec![],
checksum: None,
created_at: now,
updated_at: now,
}
}
/// Returns true if all chunks are completed (or if schema-only).
pub fn is_complete(&self) -> bool {
self.schema_only
|| (!self.chunks.is_empty()
&& self
.chunks
.iter()
.all(|c| matches!(c.status, ChunkStatus::Completed | ChunkStatus::Skipped)))
}
/// Returns the number of pending chunks.
pub fn pending_count(&self) -> usize {
self.chunks
.iter()
.filter(|c| c.status == ChunkStatus::Pending)
.count()
}
/// Returns the number of in-progress chunks.
pub fn in_progress_count(&self) -> usize {
self.chunks
.iter()
.filter(|c| c.status == ChunkStatus::InProgress)
.count()
}
/// Returns the number of completed chunks.
pub fn completed_count(&self) -> usize {
self.chunks
.iter()
.filter(|c| c.status == ChunkStatus::Completed)
.count()
}
/// Returns the number of skipped chunks.
pub fn skipped_count(&self) -> usize {
self.chunks
.iter()
.filter(|c| c.status == ChunkStatus::Skipped)
.count()
}
/// Returns the number of failed chunks.
pub fn failed_count(&self) -> usize {
self.chunks
.iter()
.filter(|c| c.status == ChunkStatus::Failed)
.count()
}
/// Updates the `updated_at` timestamp to now.
pub fn touch(&mut self) {
self.updated_at = Utc::now();
}
/// Adds a chunk to the manifest.
pub fn add_chunk(&mut self, chunk: ChunkMeta) {
self.chunks.push(chunk);
self.touch();
}
/// Updates a chunk by id.
pub fn update_chunk(&mut self, id: u32, updater: impl FnOnce(&mut ChunkMeta)) {
if let Some(chunk) = self.chunks.iter_mut().find(|c| c.id == id) {
updater(chunk);
self.touch();
}
}
}
/// Plans the chunk list for an export that is not split by a time window.
///
/// - `start == end`: one chunk with id 1, created through `ChunkMeta::skipped`.
/// - `start > end`: no chunks at all.
/// - anything else, including ranges with a missing bound: one chunk with
///   id 1 covering the whole range.
fn generate_single_chunk(time_range: &TimeRange) -> Vec<ChunkMeta> {
    match (time_range.start, time_range.end) {
        (Some(start), Some(end)) if start == end => {
            vec![ChunkMeta::skipped(1, time_range.clone())]
        }
        (Some(start), Some(end)) if start > end => Vec::new(),
        _ => vec![ChunkMeta::new(1, time_range.clone())],
    }
}
#[cfg(test)]
mod tests {
    use std::time::Duration;

    use chrono::{TimeZone, Utc};

    use super::*;

    /// UTC instant at `2025-01-<day> <hour>:00:00`.
    fn jan_2025(day: u32, hour: u32) -> chrono::DateTime<Utc> {
        Utc.with_ymd_and_hms(2025, 1, day, hour, 0, 0).unwrap()
    }

    /// Full-export manifest over an unbounded range, as used by several tests.
    fn unbounded_full_manifest() -> Manifest {
        Manifest::new_full(
            "greptime".to_string(),
            vec!["public".to_string()],
            TimeRange::unbounded(),
            DataFormat::Parquet,
        )
    }

    #[test]
    fn test_time_range_serialization() {
        // An unbounded range serializes to an empty object and back.
        let encoded = serde_json::to_string(&TimeRange::unbounded()).unwrap();
        assert_eq!(encoded, "{}");

        let decoded: TimeRange = serde_json::from_str("{}").unwrap();
        assert!(decoded.is_unbounded());
    }

    #[test]
    fn test_manifest_schema_only() {
        let catalog = "greptime".to_string();
        let schemas = vec!["public".to_string()];
        let manifest = Manifest::new_schema_only(catalog, schemas);

        assert_eq!(manifest.version, MANIFEST_VERSION);
        assert!(manifest.schema_only);
        assert!(manifest.chunks.is_empty());
        assert!(manifest.is_complete());
    }

    #[test]
    fn test_generate_single_chunk_zero_width_range_is_skipped() {
        let instant = jan_2025(1, 0);
        let chunks = generate_single_chunk(&TimeRange::new(Some(instant), Some(instant)));

        assert_eq!(chunks.len(), 1);
        let only = &chunks[0];
        assert_eq!(only.status, ChunkStatus::Skipped);
        assert_eq!(only.time_range.start, Some(instant));
        assert_eq!(only.time_range.end, Some(instant));
    }

    #[test]
    fn test_generate_single_chunk_invalid_range_is_empty() {
        // Start is one hour after end.
        let inverted = TimeRange::new(Some(jan_2025(1, 1)), Some(jan_2025(1, 0)));
        assert!(generate_single_chunk(&inverted).is_empty());
    }

    #[test]
    fn test_manifest_full() {
        let manifest = unbounded_full_manifest();

        assert!(!manifest.schema_only);
        assert!(manifest.chunks.is_empty());
        // A full manifest without chunks is never complete.
        assert!(!manifest.is_complete());
    }

    #[test]
    fn test_data_format_parsing() {
        let accepted = [
            ("parquet", DataFormat::Parquet),
            ("CSV", DataFormat::Csv),
            ("JSON", DataFormat::Json),
        ];
        for (text, expected) in accepted {
            assert_eq!(text.parse::<DataFormat>().unwrap(), expected);
        }
        assert!("invalid".parse::<DataFormat>().is_err());
    }

    #[test]
    fn test_chunk_status_transitions() {
        let mut chunk = ChunkMeta::new(1, TimeRange::unbounded());
        assert_eq!(chunk.status, ChunkStatus::Pending);

        chunk.mark_in_progress();
        assert_eq!(chunk.status, ChunkStatus::InProgress);

        let files = vec!["file1.parquet".to_string()];
        let checksum = Some("abc123".to_string());
        chunk.mark_completed(files, checksum);
        assert_eq!(chunk.status, ChunkStatus::Completed);
        assert_eq!(chunk.files.len(), 1);

        chunk.mark_skipped();
        assert_eq!(chunk.status, ChunkStatus::Skipped);
        assert!(chunk.files.is_empty());
    }

    #[test]
    fn test_manifest_is_complete_when_chunks_are_completed_or_skipped() {
        let mut manifest = unbounded_full_manifest();
        for id in [1, 2] {
            manifest.add_chunk(ChunkMeta::new(id, TimeRange::unbounded()));
        }

        manifest.update_chunk(1, |chunk| {
            chunk.mark_completed(vec!["a.parquet".to_string()], None)
        });
        manifest.update_chunk(2, |chunk| chunk.mark_skipped());

        assert!(manifest.is_complete());
        assert_eq!(manifest.completed_count(), 1);
        assert_eq!(manifest.skipped_count(), 1);
    }

    #[test]
    fn test_manifest_chunk_time_window_none_single_chunk() {
        let one_day = TimeRange::new(Some(jan_2025(1, 0)), Some(jan_2025(2, 0)));

        let manifest = Manifest::new_for_export(
            "greptime".to_string(),
            vec!["public".to_string()],
            false,
            one_day.clone(),
            DataFormat::Parquet,
            None,
        )
        .unwrap();

        assert_eq!(manifest.chunks.len(), 1);
        assert_eq!(manifest.chunks[0].time_range, one_day);
    }

    #[test]
    fn test_time_range_parse_requires_order() {
        let start = Some("2025-01-02T00:00:00Z");
        let end = Some("2025-01-01T00:00:00Z");
        assert!(TimeRange::parse(start, end).is_err());
    }

    #[test]
    fn test_new_for_export_with_chunk_window_requires_bounded_range() {
        // Only the end is bounded, so a chunk window must be rejected.
        let half_open = TimeRange::new(None, Some(jan_2025(2, 0)));

        let outcome = Manifest::new_for_export(
            "greptime".to_string(),
            vec!["public".to_string()],
            false,
            half_open,
            DataFormat::Parquet,
            Some(Duration::from_secs(3600)),
        );

        assert!(outcome.is_err());
    }
}

View File

@@ -0,0 +1,98 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Minimal schema index structures for Export/Import V2.
//!
//! The canonical schema representation is the per-schema DDL file under
//! `schema/ddl/`. `schemas.json` only records which schemas exist in a snapshot.
use std::collections::HashMap;
use serde::{Deserialize, Serialize};
/// Name of the schema directory inside a snapshot (`schema`).
pub const SCHEMA_DIR: &str = "schema";
/// Name of the DDL directory inside the schema directory (`schema/ddl/`),
/// which holds the per-schema DDL files.
pub const DDL_DIR: &str = "ddl";
/// File name of the schema index (`schemas.json`), which records which
/// schemas exist in a snapshot.
pub const SCHEMAS_FILE: &str = "schemas.json";
/// Schema (database) definition.
///
/// One entry of the schema index: it identifies a schema by catalog and name
/// and carries its options.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct SchemaDefinition {
/// Catalog name.
pub catalog: String,
/// Schema (database) name.
pub name: String,
/// Schema options (if any).
///
/// Left out of the serialized form when the map is empty, and read back as
/// an empty map when the key is absent.
#[serde(default, skip_serializing_if = "HashMap::is_empty")]
pub options: HashMap<String, String>,
}
/// Minimal schema index stored in a snapshot.
///
/// It is a plain list of `SchemaDefinition` entries; the `Default` value is
/// the empty index.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub struct SchemaSnapshot {
/// Schema (database) definitions, in insertion order.
pub schemas: Vec<SchemaDefinition>,
}
impl SchemaSnapshot {
    /// Creates a snapshot that contains no schemas.
    pub fn new() -> Self {
        Default::default()
    }

    /// Appends `schema` to the end of the schema list.
    pub fn add_schema(&mut self, schema: SchemaDefinition) {
        self.schemas.push(schema);
    }

    /// Returns a copy of this snapshot restricted to the schemas named in
    /// `schemas`.
    ///
    /// Names are compared exactly, the original order is kept, and names that
    /// do not occur in the snapshot are ignored.
    pub fn filter_schemas(&self, schemas: &[String]) -> Self {
        let mut kept = Vec::new();
        for definition in &self.schemas {
            let wanted = schemas.iter().any(|name| *name == definition.name);
            if wanted {
                kept.push(definition.clone());
            }
        }
        Self { schemas: kept }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_schema_snapshot_filter() {
        // Two schemas in the same catalog; only one of them is requested.
        let mut snapshot = SchemaSnapshot::new();
        for name in ["public", "private"] {
            snapshot.add_schema(SchemaDefinition {
                catalog: "greptime".to_string(),
                name: name.to_string(),
                options: HashMap::new(),
            });
        }

        let wanted = vec!["public".to_string()];
        let filtered = snapshot.filter_schemas(&wanted);

        let names: Vec<&str> = filtered
            .schemas
            .iter()
            .map(|definition| definition.name.as_str())
            .collect();
        assert_eq!(names, ["public"]);
    }
}

View File

@@ -0,0 +1,885 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::env;
use std::time::Duration;
use clap::Parser;
use common_error::ext::BoxedError;
use serde_json::Value;
use snafu::ResultExt;
use tempfile::tempdir;
use url::Url;
use super::command::ExportCreateCommand;
use crate::common::ObjectStoreConfig;
use crate::data::export_v2::manifest::ChunkStatus;
use crate::data::import_v2::ImportV2Command;
use crate::data::snapshot_storage::{OpenDalStorage, SnapshotStorage};
use crate::data::sql::escape_sql_identifier;
use crate::database::DatabaseClient;
use crate::error::{FileIoSnafu, InvalidArgumentsSnafu, OtherSnafu, Result};
/// Runs `SELECT COUNT(*)` on `table` within `schema` and returns the count.
///
/// The table name is passed through `escape_sql_identifier`. An
/// `InvalidArguments` error is returned when the result has no rows, the
/// first row has no columns, or the first value is not a JSON number that
/// fits in `u64`.
async fn query_count(database_client: &DatabaseClient, schema: &str, table: &str) -> Result<u64> {
    let sql = format!("SELECT COUNT(*) FROM {}", escape_sql_identifier(table));
    // All failure paths share the same error variant and differ only in text.
    let invalid = |msg: String| InvalidArgumentsSnafu { msg }.build();

    let result = database_client.sql(&sql, schema).await?;
    let count_cell = result
        .as_ref()
        .and_then(|rows| rows.first())
        .ok_or_else(|| invalid(format!("empty result for query: {sql}")))?
        .first()
        .ok_or_else(|| invalid(format!("no first column for query: {sql}")))?;

    if let Value::Number(number) = count_cell {
        number
            .as_u64()
            .ok_or_else(|| invalid(format!("count is not u64 for query: {sql}")))
    } else {
        Err(invalid(format!("unexpected count type for query: {sql}")))
    }
}
/// End-to-end check that a schema-only export survives an import round trip.
///
/// Flow: create a database with two mito tables, a metric-engine physical
/// table, a logical table on top of it and a view; export it with
/// `--schema-only`; drop the database; import the snapshot; export again into
/// a second directory; compare what `read_schema()` returns for both
/// snapshots.
///
/// The test is `#[ignore]`d. It issues SQL against the server named by
/// `GREPTIME_ADDR` (default `127.0.0.1:4000`), using `GREPTIME_CATALOG`
/// (default `greptime`) and the optional `GREPTIME_AUTH_BASIC`.
#[tokio::test]
#[ignore]
async fn export_import_v2_schema_parity_e2e() -> Result<()> {
let addr = env::var("GREPTIME_ADDR").unwrap_or_else(|_| "127.0.0.1:4000".to_string());
let catalog = env::var("GREPTIME_CATALOG").unwrap_or_else(|_| "greptime".to_string());
let auth_basic = env::var("GREPTIME_AUTH_BASIC").ok();
let schema = "test_db_schema_parity";
let database_client = DatabaseClient::new(
addr.clone(),
catalog.clone(),
auth_basic.clone(),
Duration::from_secs(60),
None,
false,
);
// Start from a clean database so leftovers of an earlier run cannot interfere.
database_client
.sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
.await?;
database_client
.sql_in_public(&format!("CREATE DATABASE {schema}"))
.await?;
// Mito table with a column default and table options.
database_client
.sql(
"CREATE TABLE metrics (\
ts TIMESTAMP TIME INDEX, \
host STRING PRIMARY KEY, \
cpu DOUBLE DEFAULT 0.0, \
region_name STRING \
) ENGINE = mito WITH (ttl='7d', 'compaction.type'='twcs')",
schema,
)
.await?;
// Mito table with a NOT NULL column that carries a comment.
database_client
.sql(
"CREATE TABLE logs (\
ts TIMESTAMP TIME INDEX, \
app STRING PRIMARY KEY, \
msg STRING NOT NULL COMMENT 'log message' \
) ENGINE = mito",
schema,
)
.await?;
// Metric-engine physical table ...
database_client
.sql(
"CREATE TABLE metrics_physical (\
ts TIMESTAMP TIME INDEX, \
host STRING, \
region_name STRING, \
cpu DOUBLE DEFAULT 0.0, \
PRIMARY KEY (host, region_name) \
) ENGINE = metric WITH (physical_metric_table='true')",
schema,
)
.await?;
// ... and a logical table that refers to it, so the import has to handle
// the dependency between the two.
database_client
.sql(
"CREATE TABLE metrics_logical (\
ts TIMESTAMP TIME INDEX, \
host STRING, \
region_name STRING, \
cpu DOUBLE DEFAULT 0.0, \
PRIMARY KEY (host, region_name) \
) ENGINE = metric WITH (on_physical_table='metrics_physical')",
schema,
)
.await?;
// A view over the mito table.
database_client
.sql(
"CREATE VIEW metrics_view AS SELECT * FROM metrics WHERE cpu > 0.5",
schema,
)
.await?;
// First export: schema only, into a temporary directory addressed by URL.
let src_dir = tempdir().context(FileIoSnafu)?;
let src_uri = Url::from_directory_path(src_dir.path())
.map_err(|_| {
InvalidArgumentsSnafu {
msg: "invalid temp dir path".to_string(),
}
.build()
})?
.to_string();
let mut export_args = vec![
"export-v2-create",
"--addr",
&addr,
"--to",
&src_uri,
"--catalog",
&catalog,
"--schemas",
schema,
"--schema-only",
];
if let Some(auth) = &auth_basic {
export_args.push("--auth-basic");
export_args.push(auth);
}
let export_cmd = ExportCreateCommand::parse_from(export_args);
export_cmd
.build()
.await
.context(OtherSnafu)?
.do_work()
.await
.context(OtherSnafu)?;
// Remove the original database so the import has to recreate everything.
database_client
.sql_in_public(&format!("DROP DATABASE {schema}"))
.await?;
let mut import_args = vec![
"import-v2",
"--addr",
&addr,
"--from",
&src_uri,
"--catalog",
&catalog,
"--schemas",
schema,
];
if let Some(auth) = &auth_basic {
import_args.push("--auth-basic");
import_args.push(auth);
}
let import_cmd = ImportV2Command::parse_from(import_args);
import_cmd
.build()
.await
.context(OtherSnafu)?
.do_work()
.await
.context(OtherSnafu)?;
// Second export: the re-imported database, into a separate directory.
let dst_dir = tempdir().context(FileIoSnafu)?;
let dst_uri = Url::from_directory_path(dst_dir.path())
.map_err(|_| {
InvalidArgumentsSnafu {
msg: "invalid temp dir path".to_string(),
}
.build()
})?
.to_string();
let mut export_args = vec![
"export-v2-create",
"--addr",
&addr,
"--to",
&dst_uri,
"--catalog",
&catalog,
"--schemas",
schema,
"--schema-only",
];
if let Some(auth) = &auth_basic {
export_args.push("--auth-basic");
export_args.push(auth);
}
let export_cmd = ExportCreateCommand::parse_from(export_args);
export_cmd
.build()
.await
.context(OtherSnafu)?
.do_work()
.await
.context(OtherSnafu)?;
// Compare the schema information of both snapshots.
// NOTE(review): only the result of `read_schema()` is compared; if that is
// just the schema index, table/view DDL parity is not asserted here — confirm.
let storage_config = ObjectStoreConfig::default();
let src_storage = OpenDalStorage::from_uri(&src_uri, &storage_config)
.map_err(BoxedError::new)
.context(OtherSnafu)?;
let dst_storage = OpenDalStorage::from_uri(&dst_uri, &storage_config)
.map_err(BoxedError::new)
.context(OtherSnafu)?;
let src_schema_snapshot = src_storage
.read_schema()
.await
.map_err(BoxedError::new)
.context(OtherSnafu)?;
let dst_schema_snapshot = dst_storage
.read_schema()
.await
.map_err(BoxedError::new)
.context(OtherSnafu)?;
assert_eq!(src_schema_snapshot, dst_schema_snapshot);
// Clean up; not reached when an earlier step fails or the assertion panics.
database_client
.sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
.await?;
Ok(())
}
/// End-to-end check that `import-v2 --dry-run` succeeds on a schema-only
/// snapshot.
///
/// The database is exported and then imported with `--dry-run` while the
/// original database still exists. The only expectation is that every step
/// returns `Ok`; the test does not inspect the server state afterwards.
///
/// The test is `#[ignore]`d and uses the server named by `GREPTIME_ADDR`
/// (default `127.0.0.1:4000`).
#[tokio::test]
#[ignore]
async fn import_v2_ddl_dry_run_e2e() -> Result<()> {
let addr = env::var("GREPTIME_ADDR").unwrap_or_else(|_| "127.0.0.1:4000".to_string());
let catalog = env::var("GREPTIME_CATALOG").unwrap_or_else(|_| "greptime".to_string());
let auth_basic = env::var("GREPTIME_AUTH_BASIC").ok();
let schema = "test_db_ddl_dry_run";
let database_client = DatabaseClient::new(
addr.clone(),
catalog.clone(),
auth_basic.clone(),
Duration::from_secs(60),
None,
false,
);
// Start from a clean database.
database_client
.sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
.await?;
database_client
.sql_in_public(&format!("CREATE DATABASE {schema}"))
.await?;
// Two mito tables give the snapshot some DDL to process.
database_client
.sql(
"CREATE TABLE metrics (\
ts TIMESTAMP TIME INDEX, \
host STRING PRIMARY KEY, \
cpu DOUBLE DEFAULT 0.0, \
region_name STRING \
) ENGINE = mito WITH (ttl='7d', 'compaction.type'='twcs')",
schema,
)
.await?;
database_client
.sql(
"CREATE TABLE logs (\
ts TIMESTAMP TIME INDEX, \
app STRING PRIMARY KEY, \
msg STRING NOT NULL COMMENT 'log message' \
) ENGINE = mito",
schema,
)
.await?;
// Schema-only export into a temporary directory.
let src_dir = tempdir().context(FileIoSnafu)?;
let src_uri = Url::from_directory_path(src_dir.path())
.map_err(|_| {
InvalidArgumentsSnafu {
msg: "invalid temp dir path".to_string(),
}
.build()
})?
.to_string();
let mut export_args = vec![
"export-v2-create",
"--addr",
&addr,
"--to",
&src_uri,
"--catalog",
&catalog,
"--schemas",
schema,
"--schema-only",
];
if let Some(auth) = &auth_basic {
export_args.push("--auth-basic");
export_args.push(auth);
}
let export_cmd = ExportCreateCommand::parse_from(export_args);
export_cmd
.build()
.await
.context(OtherSnafu)?
.do_work()
.await
.context(OtherSnafu)?;
// Dry-run import of the snapshot; the source database has not been dropped.
let mut import_args = vec![
"import-v2",
"--addr",
&addr,
"--from",
&src_uri,
"--catalog",
&catalog,
"--schemas",
schema,
"--dry-run",
];
if let Some(auth) = &auth_basic {
import_args.push("--auth-basic");
import_args.push(auth);
}
let import_cmd = ImportV2Command::parse_from(import_args);
import_cmd
.build()
.await
.context(OtherSnafu)?
.do_work()
.await
.context(OtherSnafu)?;
// Clean up.
database_client
.sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
.await?;
Ok(())
}
/// End-to-end check that table data survives an export/import round trip.
///
/// Two rows are inserted, the database is exported (schema and data),
/// dropped and imported again; the row count of `metrics` after the import
/// must equal the count taken before the export.
///
/// The test is `#[ignore]`d and uses the server named by `GREPTIME_ADDR`
/// (default `127.0.0.1:4000`).
#[tokio::test]
#[ignore]
async fn export_import_v2_data_roundtrip_e2e() -> Result<()> {
let addr = env::var("GREPTIME_ADDR").unwrap_or_else(|_| "127.0.0.1:4000".to_string());
let catalog = env::var("GREPTIME_CATALOG").unwrap_or_else(|_| "greptime".to_string());
let auth_basic = env::var("GREPTIME_AUTH_BASIC").ok();
let schema = "test_db_data_roundtrip";
let database_client = DatabaseClient::new(
addr.clone(),
catalog.clone(),
auth_basic.clone(),
Duration::from_secs(60),
None,
false,
);
// Start from a clean database.
database_client
.sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
.await?;
database_client
.sql_in_public(&format!("CREATE DATABASE {schema}"))
.await?;
database_client
.sql(
"CREATE TABLE metrics (\
ts TIMESTAMP TIME INDEX, \
host STRING PRIMARY KEY, \
cpu DOUBLE \
) ENGINE=mito",
schema,
)
.await?;
database_client
.sql(
"INSERT INTO metrics (ts, host, cpu) VALUES \
('2025-01-01T00:00:00Z', 'h1', 1.0), \
('2025-01-01T01:00:00Z', 'h2', 2.0)",
schema,
)
.await?;
// Baseline row count, taken from the server rather than hard-coded.
let expected_rows = query_count(&database_client, schema, "metrics").await?;
// Full export (no `--schema-only`) into a temporary directory.
let src_dir = tempdir().context(FileIoSnafu)?;
let src_uri = Url::from_directory_path(src_dir.path())
.map_err(|_| {
InvalidArgumentsSnafu {
msg: "invalid temp dir path".to_string(),
}
.build()
})?
.to_string();
let mut export_args = vec![
"export-v2-create",
"--addr",
&addr,
"--to",
&src_uri,
"--catalog",
&catalog,
"--schemas",
schema,
];
if let Some(auth) = &auth_basic {
export_args.push("--auth-basic");
export_args.push(auth);
}
let export_cmd = ExportCreateCommand::parse_from(export_args);
export_cmd
.build()
.await
.context(OtherSnafu)?
.do_work()
.await
.context(OtherSnafu)?;
// Drop the source database so the import has to restore schema and data.
database_client
.sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
.await?;
let mut import_args = vec![
"import-v2",
"--addr",
&addr,
"--from",
&src_uri,
"--catalog",
&catalog,
"--schemas",
schema,
];
if let Some(auth) = &auth_basic {
import_args.push("--auth-basic");
import_args.push(auth);
}
let import_cmd = ImportV2Command::parse_from(import_args);
import_cmd
.build()
.await
.context(OtherSnafu)?
.do_work()
.await
.context(OtherSnafu)?;
// The restored table must hold as many rows as the original.
let actual_rows = query_count(&database_client, schema, "metrics").await?;
assert_eq!(actual_rows, expected_rows);
// Clean up.
database_client
.sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
.await?;
Ok(())
}
/// End-to-end check that `import-v2` refuses a snapshot with a failed chunk.
///
/// A database with one row is exported, the first chunk of the stored
/// manifest is rewritten to `ChunkStatus::Failed`, and the import is expected
/// to fail with an error whose message contains `Incomplete snapshot`.
///
/// The test is `#[ignore]`d and uses the server named by `GREPTIME_ADDR`
/// (default `127.0.0.1:4000`), `GREPTIME_CATALOG` (default `greptime`) and
/// the optional `GREPTIME_AUTH_BASIC`.
#[tokio::test]
#[ignore]
async fn import_v2_fails_on_incomplete_snapshot_e2e() -> Result<()> {
    let addr = env::var("GREPTIME_ADDR").unwrap_or_else(|_| "127.0.0.1:4000".to_string());
    let catalog = env::var("GREPTIME_CATALOG").unwrap_or_else(|_| "greptime".to_string());
    let auth_basic = env::var("GREPTIME_AUTH_BASIC").ok();
    let schema = "test_db_incomplete_snapshot";
    let database_client = DatabaseClient::new(
        addr.clone(),
        catalog.clone(),
        auth_basic.clone(),
        Duration::from_secs(60),
        None,
        false,
    );

    // Start from a clean database with a single row of data.
    database_client
        .sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
        .await?;
    database_client
        .sql_in_public(&format!("CREATE DATABASE {schema}"))
        .await?;
    database_client
        .sql(
            "CREATE TABLE metrics (\
            ts TIMESTAMP TIME INDEX, \
            host STRING PRIMARY KEY, \
            cpu DOUBLE \
            ) ENGINE=mito",
            schema,
        )
        .await?;
    database_client
        .sql(
            "INSERT INTO metrics (ts, host, cpu) VALUES ('2025-01-01T00:00:00Z', 'h1', 1.0)",
            schema,
        )
        .await?;

    // Full export into a temporary directory.
    let src_dir = tempdir().context(FileIoSnafu)?;
    let src_uri = Url::from_directory_path(src_dir.path())
        .map_err(|_| {
            InvalidArgumentsSnafu {
                msg: "invalid temp dir path".to_string(),
            }
            .build()
        })?
        .to_string();
    let mut export_args = vec![
        "export-v2-create",
        "--addr",
        &addr,
        "--to",
        &src_uri,
        "--catalog",
        &catalog,
        "--schemas",
        schema,
    ];
    if let Some(auth) = &auth_basic {
        export_args.push("--auth-basic");
        export_args.push(auth);
    }
    let export_cmd = ExportCreateCommand::parse_from(export_args);
    export_cmd
        .build()
        .await
        .context(OtherSnafu)?
        .do_work()
        .await
        .context(OtherSnafu)?;

    // Tamper with the stored manifest: mark the first chunk as failed.
    let storage_config = ObjectStoreConfig::default();
    let storage = OpenDalStorage::from_uri(&src_uri, &storage_config)
        .map_err(BoxedError::new)
        .context(OtherSnafu)?;
    let mut manifest = storage
        .read_manifest()
        .await
        .map_err(BoxedError::new)
        .context(OtherSnafu)?;
    // Without a chunk to corrupt the scenario under test would not exist, so
    // fail here with a clear message instead of silently skipping the step.
    let first_chunk = manifest
        .chunks
        .first_mut()
        .expect("exported manifest should contain at least one chunk");
    first_chunk.status = ChunkStatus::Failed;
    storage
        .write_manifest(&manifest)
        .await
        .map_err(BoxedError::new)
        .context(OtherSnafu)?;

    database_client
        .sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
        .await?;

    let mut import_args = vec![
        "import-v2",
        "--addr",
        &addr,
        "--from",
        &src_uri,
        "--catalog",
        &catalog,
        "--schemas",
        schema,
    ];
    if let Some(auth) = &auth_basic {
        import_args.push("--auth-basic");
        import_args.push(auth);
    }
    let import_cmd = ImportV2Command::parse_from(import_args);
    let import_result = import_cmd
        .build()
        .await
        .context(OtherSnafu)?
        .do_work()
        .await;

    // Clean up before asserting so that a database possibly created by the
    // import does not outlive the test, whatever the outcome was.
    database_client
        .sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
        .await?;

    let err = import_result.expect_err("import should fail on incomplete snapshot");
    assert!(err.to_string().contains("Incomplete snapshot"));
    Ok(())
}
/// End-to-end check that `import-v2 --schemas` restores only the selected
/// schema.
///
/// Two databases are exported into one snapshot and both are dropped. The
/// import then names only the first one: its row count must match the value
/// recorded before the export, and querying the second one must fail.
///
/// The test is `#[ignore]`d and uses the server named by `GREPTIME_ADDR`
/// (default `127.0.0.1:4000`).
#[tokio::test]
#[ignore]
async fn import_v2_schema_filter_data_e2e() -> Result<()> {
let addr = env::var("GREPTIME_ADDR").unwrap_or_else(|_| "127.0.0.1:4000".to_string());
let catalog = env::var("GREPTIME_CATALOG").unwrap_or_else(|_| "greptime".to_string());
let auth_basic = env::var("GREPTIME_AUTH_BASIC").ok();
let schema_a = "test_db_filter_a";
let schema_b = "test_db_filter_b";
let database_client = DatabaseClient::new(
addr.clone(),
catalog.clone(),
auth_basic.clone(),
Duration::from_secs(60),
None,
false,
);
// Create both databases from scratch with an identical `metrics` table.
for schema in [schema_a, schema_b] {
database_client
.sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
.await?;
database_client
.sql_in_public(&format!("CREATE DATABASE {schema}"))
.await?;
database_client
.sql(
"CREATE TABLE metrics (\
ts TIMESTAMP TIME INDEX, \
host STRING PRIMARY KEY, \
cpu DOUBLE \
) ENGINE=mito",
schema,
)
.await?;
}
// One distinguishable row per database.
database_client
.sql(
"INSERT INTO metrics (ts, host, cpu) VALUES ('2025-01-01T00:00:00Z', 'a1', 1.0)",
schema_a,
)
.await?;
database_client
.sql(
"INSERT INTO metrics (ts, host, cpu) VALUES ('2025-01-01T00:00:00Z', 'b1', 2.0)",
schema_b,
)
.await?;
// Baseline for the schema that will be imported.
let expected_rows_a = query_count(&database_client, schema_a, "metrics").await?;
// Export both schemas into the same snapshot (`--schemas` is given twice).
let src_dir = tempdir().context(FileIoSnafu)?;
let src_uri = Url::from_directory_path(src_dir.path())
.map_err(|_| {
InvalidArgumentsSnafu {
msg: "invalid temp dir path".to_string(),
}
.build()
})?
.to_string();
let mut export_args = vec![
"export-v2-create",
"--addr",
&addr,
"--to",
&src_uri,
"--catalog",
&catalog,
"--schemas",
schema_a,
"--schemas",
schema_b,
];
if let Some(auth) = &auth_basic {
export_args.push("--auth-basic");
export_args.push(auth);
}
let export_cmd = ExportCreateCommand::parse_from(export_args);
export_cmd
.build()
.await
.context(OtherSnafu)?
.do_work()
.await
.context(OtherSnafu)?;
// Drop both databases so anything present afterwards came from the import.
for schema in [schema_a, schema_b] {
database_client
.sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
.await?;
}
// Import only `schema_a` from the two-schema snapshot.
let mut import_args = vec![
"import-v2",
"--addr",
&addr,
"--from",
&src_uri,
"--catalog",
&catalog,
"--schemas",
schema_a,
];
if let Some(auth) = &auth_basic {
import_args.push("--auth-basic");
import_args.push(auth);
}
let import_cmd = ImportV2Command::parse_from(import_args);
import_cmd
.build()
.await
.context(OtherSnafu)?
.do_work()
.await
.context(OtherSnafu)?;
let actual_rows_a = query_count(&database_client, schema_a, "metrics").await?;
assert_eq!(actual_rows_a, expected_rows_a);
// `schema_b` was not requested, so the query against it is expected to fail.
// NOTE(review): any error satisfies this check, not only "database/table not
// found" — confirm that is acceptable.
let schema_b_query = database_client
.sql("SELECT COUNT(*) FROM metrics", schema_b)
.await;
assert!(schema_b_query.is_err(), "schema_b should not be imported");
// Clean up both databases.
for schema in [schema_a, schema_b] {
database_client
.sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
.await?;
}
Ok(())
}
/// End-to-end check that the import leaves out chunks marked as skipped.
///
/// Two rows one hour apart are exported over a two-hour range with a one-hour
/// chunk window, which must yield two chunks. The first chunk is then
/// rewritten to `ChunkStatus::Skipped` with an empty file list, and after the
/// import exactly one row is expected in `metrics`.
///
/// The test is `#[ignore]`d and uses the server named by `GREPTIME_ADDR`
/// (default `127.0.0.1:4000`).
#[tokio::test]
#[ignore]
async fn export_import_v2_skipped_chunk_e2e() -> Result<()> {
let addr = env::var("GREPTIME_ADDR").unwrap_or_else(|_| "127.0.0.1:4000".to_string());
let catalog = env::var("GREPTIME_CATALOG").unwrap_or_else(|_| "greptime".to_string());
let auth_basic = env::var("GREPTIME_AUTH_BASIC").ok();
let schema = "test_db_skipped_chunk";
let database_client = DatabaseClient::new(
addr.clone(),
catalog.clone(),
auth_basic.clone(),
Duration::from_secs(60),
None,
false,
);
// Start from a clean database.
database_client
.sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
.await?;
database_client
.sql_in_public(&format!("CREATE DATABASE {schema}"))
.await?;
database_client
.sql(
"CREATE TABLE metrics (\
ts TIMESTAMP TIME INDEX, \
host STRING PRIMARY KEY, \
cpu DOUBLE \
) ENGINE=mito",
schema,
)
.await?;
// Rows at 00:00 and 01:00; presumably one falls into each one-hour chunk —
// verify against the chunk boundary rules of the exporter.
database_client
.sql(
"INSERT INTO metrics (ts, host, cpu) VALUES \
('2025-01-01T00:00:00Z', 'h1', 1.0), \
('2025-01-01T01:00:00Z', 'h2', 2.0)",
schema,
)
.await?;
// Export 00:00-02:00 split by a one-hour chunk window.
let src_dir = tempdir().context(FileIoSnafu)?;
let src_uri = Url::from_directory_path(src_dir.path())
.map_err(|_| {
InvalidArgumentsSnafu {
msg: "invalid temp dir path".to_string(),
}
.build()
})?
.to_string();
let mut export_args = vec![
"export-v2-create",
"--addr",
&addr,
"--to",
&src_uri,
"--catalog",
&catalog,
"--schemas",
schema,
"--start-time",
"2025-01-01T00:00:00Z",
"--end-time",
"2025-01-01T02:00:00Z",
"--chunk-time-window",
"1h",
];
if let Some(auth) = &auth_basic {
export_args.push("--auth-basic");
export_args.push(auth);
}
let export_cmd = ExportCreateCommand::parse_from(export_args);
export_cmd
.build()
.await
.context(OtherSnafu)?
.do_work()
.await
.context(OtherSnafu)?;
// Rewrite the stored manifest: the first of the two chunks becomes a
// skipped chunk without files.
let storage_config = ObjectStoreConfig::default();
let storage = OpenDalStorage::from_uri(&src_uri, &storage_config)
.map_err(BoxedError::new)
.context(OtherSnafu)?;
let mut manifest = storage
.read_manifest()
.await
.map_err(BoxedError::new)
.context(OtherSnafu)?;
assert_eq!(manifest.chunks.len(), 2);
manifest.chunks[0].status = ChunkStatus::Skipped;
manifest.chunks[0].files.clear();
storage
.write_manifest(&manifest)
.await
.map_err(BoxedError::new)
.context(OtherSnafu)?;
// Drop the source database so the row count below reflects the import only.
database_client
.sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
.await?;
let mut import_args = vec![
"import-v2",
"--addr",
&addr,
"--from",
&src_uri,
"--catalog",
&catalog,
"--schemas",
schema,
];
if let Some(auth) = &auth_basic {
import_args.push("--auth-basic");
import_args.push(auth);
}
let import_cmd = ImportV2Command::parse_from(import_args);
import_cmd
.build()
.await
.context(OtherSnafu)?
.do_work()
.await
.context(OtherSnafu)?;
// Only the data of the remaining chunk should have been loaded.
let actual_rows = query_count(&database_client, schema, "metrics").await?;
assert_eq!(actual_rows, 1);
// Clean up.
database_client
.sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
.await?;
Ok(())
}

View File

@@ -81,13 +81,16 @@ pub struct ImportCommand {
#[clap(long, value_parser = humantime::parse_duration)]
timeout: Option<Duration>,
/// The proxy server address to connect, if set, will override the system proxy.
/// The proxy server address to connect.
///
/// The default behavior will use the system proxy if neither `proxy` nor `no_proxy` is set.
/// If set, it overrides the system proxy unless `--no-proxy` is specified.
/// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used.
#[clap(long)]
proxy: Option<String>,
/// Disable proxy server, if set, will not use any proxy.
/// Disable all proxy usage (ignores `--proxy` and system proxy).
///
/// When set and `--proxy` is not provided, this explicitly disables system proxy.
#[clap(long, default_value = "false")]
no_proxy: bool,
}
@@ -104,6 +107,7 @@ impl ImportCommand {
// Treats `None` as `0s` to disable server-side default timeout.
self.timeout.unwrap_or_default(),
proxy,
self.no_proxy,
);
Ok(Box::new(Import {
@@ -314,6 +318,7 @@ mod tests {
None,
Duration::from_secs(0),
None,
false,
),
input_dir: input_dir.to_string(),
parallelism: 1,

View File

@@ -0,0 +1,43 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Import V2 module.
//!
//! This module provides the V2 implementation of database import functionality,
//! featuring:
//! - DDL-based schema import
//! - Dry-run mode for verification
//!
//! # Example
//!
//! ```bash
//! # Dry-run import (verify without executing)
//! greptime cli data import-v2 \
//! --addr 127.0.0.1:4000 \
//! --from file:///tmp/snapshot \
//! --dry-run
//!
//! # Actual import
//! greptime cli data import-v2 \
//! --addr 127.0.0.1:4000 \
//! --from s3://bucket/snapshots/prod-20250101
//! ```
mod command;
pub(crate) mod coordinator;
pub mod error;
pub mod executor;
pub(crate) mod state;
pub use command::ImportV2Command;

View File

@@ -0,0 +1,974 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Import V2 CLI command.
use std::collections::HashSet;
use std::time::Duration;
use async_trait::async_trait;
use clap::Parser;
use common_error::ext::BoxedError;
use common_telemetry::info;
use snafu::{OptionExt, ResultExt};
use crate::Tool;
use crate::common::ObjectStoreConfig;
use crate::data::export_v2::data::{build_copy_source, execute_copy_database_from};
use crate::data::export_v2::manifest::{ChunkMeta, ChunkStatus, DataFormat, MANIFEST_VERSION};
use crate::data::import_v2::coordinator::{
ImportResumeConfig, ImportTaskExecutor, build_import_tasks, chunk_has_schema_files,
import_with_resume_session, prepare_import_resume,
};
use crate::data::import_v2::error::{
ChunkImportFailedSnafu, EmptyChunkManifestSnafu, ImportStatePathUnavailableSnafu,
IncompleteSnapshotSnafu, ManifestVersionMismatchSnafu, MissingChunkDataSnafu, Result,
SchemaNotInSnapshotSnafu, SnapshotStorageSnafu,
};
use crate::data::import_v2::executor::{DdlExecutor, DdlStatement};
use crate::data::import_v2::state::{ImportTaskKey, default_state_path};
use crate::data::path::{data_dir_for_schema_chunk, ddl_path_for_schema};
use crate::data::snapshot_storage::{OpenDalStorage, SnapshotStorage, validate_uri};
use crate::database::{DatabaseClient, parse_proxy_opts};
// NOTE: with `#[derive(Parser)]` the `///` comments on this struct and its
// fields are what clap prints as help text, so editing them changes the CLI
// output. Maintainer-only remarks belong in `//` comments like this one.
/// Import from a snapshot.
#[derive(Debug, Parser)]
pub struct ImportV2Command {
/// Server address to connect (e.g., 127.0.0.1:4000).
#[clap(long)]
addr: String,
/// Source snapshot location (e.g., s3://bucket/path, file:///tmp/backup).
// Checked with `validate_uri` in `build()` before any storage is opened.
#[clap(long)]
from: String,
/// Target catalog name.
#[clap(long, default_value = "greptime")]
catalog: String,
/// Schema list to import (default: all in snapshot).
/// Can be specified multiple times or comma-separated.
// An empty list is turned into `None` ("all schemas") in `build()`.
#[clap(long, value_delimiter = ',')]
schemas: Vec<String>,
/// Verify without importing (dry-run).
#[clap(long)]
dry_run: bool,
/// Basic authentication (user:password).
#[clap(long)]
auth_basic: Option<String>,
/// Request timeout.
// `build()` falls back to 60 seconds when this is not given.
#[clap(long, value_parser = humantime::parse_duration)]
timeout: Option<Duration>,
/// Proxy server address.
///
/// If set, it overrides the system proxy unless `--no-proxy` is specified.
/// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used.
#[clap(long)]
proxy: Option<String>,
/// Disable all proxy usage (ignores `--proxy` and system proxy).
///
/// When set and `--proxy` is not provided, this explicitly disables system proxy.
#[clap(long)]
no_proxy: bool,
/// Object store configuration for remote storage backends.
// Flattened: the options of `ObjectStoreConfig` appear as options of this
// command.
#[clap(flatten)]
storage: ObjectStoreConfig,
}
impl ImportV2Command {
    /// Validates the arguments and assembles the [`Import`] tool.
    ///
    /// The snapshot URI is checked first, then the snapshot storage is
    /// constructed, then the proxy options are parsed and the database client
    /// is created. Any failure is returned as a [`BoxedError`].
    pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
        // Reject a malformed snapshot location before building anything.
        validate_uri(&self.from)
            .context(SnapshotStorageSnafu)
            .map_err(BoxedError::new)?;

        // Storage backend used to read the snapshot.
        let storage = OpenDalStorage::from_uri(&self.from, &self.storage)
            .context(SnapshotStorageSnafu)
            .map_err(BoxedError::new)?;

        // Client for the target database; requests time out after 60 seconds
        // unless `--timeout` says otherwise.
        let proxy = parse_proxy_opts(self.proxy.clone(), self.no_proxy)?;
        let request_timeout = self.timeout.unwrap_or(Duration::from_secs(60));
        let database_client = DatabaseClient::new(
            self.addr.clone(),
            self.catalog.clone(),
            self.auth_basic.clone(),
            request_timeout,
            proxy,
            self.no_proxy,
        );

        // An empty `--schemas` list means "import every schema in the snapshot".
        let schemas = (!self.schemas.is_empty()).then(|| self.schemas.clone());

        let tool = Import {
            catalog: self.catalog.clone(),
            schemas,
            dry_run: self.dry_run,
            snapshot_uri: self.from.clone(),
            storage_config: self.storage.clone(),
            storage: Box::new(storage),
            database_client,
        };
        Ok(Box::new(tool))
    }
}
/// Import tool implementation.
///
/// Built by `ImportV2Command::build`; `run` reads the snapshot through
/// `storage` and talks to the target database through `database_client`.
pub struct Import {
// Target catalog; passed to the state-path helper and to the per-task copy.
catalog: String,
// Optional schema filter; `None` imports every schema in the manifest.
schemas: Option<Vec<String>>,
// When true, `run` prints the DDL and the data import plan and returns
// before executing anything.
dry_run: bool,
// Snapshot location and object-store settings, used to build the copy source
// for each import task.
snapshot_uri: String,
storage_config: ObjectStoreConfig,
// Snapshot reader used for the manifest, the DDL files and the file listing.
storage: Box<dyn SnapshotStorage>,
database_client: DatabaseClient,
}
#[async_trait]
impl Tool for Import {
    /// Runs the import and boxes any import error for the generic tool runner.
    async fn do_work(&self) -> std::result::Result<(), BoxedError> {
        let outcome = self.run().await;
        outcome.map_err(BoxedError::new)
    }
}
impl Import {
    /// Drives the whole import.
    ///
    /// Steps, in order: read and version-check the manifest, resolve the
    /// schema selection, read the DDL files, validate the data chunks (when
    /// the snapshot carries data), then either print the plan (dry-run) or
    /// execute DDL followed by the resumable data import.
    async fn run(&self) -> Result<()> {
        // 1. Read manifest
        let manifest = self
            .storage
            .read_manifest()
            .await
            .context(SnapshotStorageSnafu)?;
        info!(
            "Loading snapshot: {} (version: {}, schema_only: {})",
            manifest.snapshot_id, manifest.version, manifest.schema_only
        );

        // Refuse snapshots written with a different manifest version.
        if manifest.version != MANIFEST_VERSION {
            return ManifestVersionMismatchSnafu {
                expected: MANIFEST_VERSION,
                found: manifest.version,
            }
            .fail();
        }
        info!("Snapshot contains {} schema(s)", manifest.schemas.len());

        // 2. Determine schemas to import
        let schemas_to_import = if let Some(filter) = &self.schemas {
            canonicalize_schema_filter(filter, &manifest.schemas)?
        } else {
            manifest.schemas.clone()
        };
        info!("Importing schemas: {:?}", schemas_to_import);

        // 3. Read DDL statements
        let ddl_statements = self.read_ddl_statements(&schemas_to_import).await?;
        info!("Generated {} DDL statements", ddl_statements.len());

        // Data is only considered when the snapshot is not schema-only and
        // actually lists chunks. Validation runs before the dry-run exit so a
        // broken snapshot is reported in dry-run mode as well.
        let has_data = !manifest.schema_only && !manifest.chunks.is_empty();
        let data_tasks = if has_data {
            validate_data_snapshot(self.storage.as_ref(), &manifest.chunks, &schemas_to_import)
                .await?;
            build_import_tasks(&manifest.chunks, &schemas_to_import)
        } else {
            Vec::new()
        };

        // 4. Dry-run mode: print DDL and exit
        if self.dry_run {
            info!("Dry-run mode - DDL statements to execute:");
            println!();
            for (index, stmt) in ddl_statements.iter().enumerate() {
                println!("-- Statement {}", index + 1);
                println!("{};", stmt.sql);
                println!();
            }
            if has_data {
                for line in format_data_import_plan(&manifest.chunks, &schemas_to_import) {
                    println!("{line}");
                }
                println!();
            }
            return Ok(());
        }

        // A resume session (state file plus lock) is only needed when there is
        // data to import.
        let mut resume_session = if data_tasks.is_empty() {
            None
        } else {
            let snapshot_id = manifest.snapshot_id.to_string();
            let state_path = default_state_path(
                &snapshot_id,
                self.database_client.addr(),
                &self.catalog,
                &schemas_to_import,
            )
            .context(ImportStatePathUnavailableSnafu {
                snapshot_id: snapshot_id.clone(),
            })?;
            let resume_config = ImportResumeConfig {
                snapshot_id,
                target_addr: self.database_client.addr().to_string(),
                catalog: self.catalog.clone(),
                schemas: schemas_to_import.clone(),
                state_path,
                tasks: data_tasks,
            };
            Some(prepare_import_resume(resume_config).await?)
        };

        let skip_ddl = resume_session
            .as_ref()
            .is_some_and(|session| session.should_skip_ddl());

        // 5. Execute DDL unless a previous run already completed it.
        let ddl_executed = !skip_ddl;
        if skip_ddl {
            info!(
                "Existing import state has DDL marked completed; skipping DDL execution and resuming data import"
            );
        } else {
            let executor = DdlExecutor::new(&self.database_client);
            executor.execute_strict(&ddl_statements).await?;
            // Record DDL completion so a later resume goes straight to data.
            if let Some(session) = resume_session.as_mut() {
                session.mark_ddl_completed().await?;
            }
        }

        // 6. Import data chunk by chunk, resuming where a previous run stopped.
        if let Some(session) = resume_session {
            let executor = CopyDatabaseImportTaskExecutor {
                import: self,
                format: manifest.format,
            };
            import_with_resume_session(session, &executor).await?;
        }

        if ddl_executed {
            info!(
                "Import completed: {} DDL statements executed",
                ddl_statements.len()
            );
        } else {
            info!("Import completed: DDL execution skipped");
        }
        Ok(())
    }

    /// Reads the DDL file of every schema in `schemas` and splits it into
    /// statements, keeping the order of `schemas` and the order inside each
    /// file.
    async fn read_ddl_statements(&self, schemas: &[String]) -> Result<Vec<DdlStatement>> {
        let mut statements = Vec::new();
        for schema in schemas {
            let ddl_path = ddl_path_for_schema(schema);
            let ddl_text = self
                .storage
                .read_text(&ddl_path)
                .await
                .context(SnapshotStorageSnafu)?;
            for sql in parse_ddl_statements(&ddl_text) {
                statements.push(ddl_statement_for_schema(schema, sql));
            }
        }
        Ok(statements)
    }
}
// Task executor handed to `import_with_resume_session`: imports one
// (chunk, schema) task by building a copy source and calling
// `execute_copy_database_from`.
struct CopyDatabaseImportTaskExecutor<'a> {
// Borrowed import tool; supplies snapshot URI, storage config, client and catalog.
import: &'a Import,
// Data format taken from the snapshot manifest.
format: DataFormat,
}
#[async_trait]
impl ImportTaskExecutor for CopyDatabaseImportTaskExecutor<'_> {
async fn import_task(&self, task: &ImportTaskKey) -> Result<()> {
let source = build_copy_source(
&self.import.snapshot_uri,
&self.import.storage_config,
&task.schema,
task.chunk_id,
)
.context(ChunkImportFailedSnafu {
chunk_id: task.chunk_id,
schema: task.schema.clone(),
})?;
execute_copy_database_from(
&self.import.database_client,
&self.import.catalog,
&task.schema,
&source,
self.format,
)
.await
.context(ChunkImportFailedSnafu {
chunk_id: task.chunk_id,
schema: task.schema.clone(),
})
}
}
/// Splits the text of a DDL file into individual SQL statements.
///
/// Statements are separated by `;`. Semicolons inside single-quoted or
/// double-quoted text do not split, and a doubled quote character inside such
/// text is treated as an escaped quote. `-- line` comments and `/* block */`
/// comments are removed. Each returned statement is trimmed and has no
/// trailing `;`; empty statements are dropped.
///
/// A removed comment always leaves whitespace behind (the newline that ends a
/// line comment, or a single space for a block comment), so the tokens on
/// either side of a comment are never glued together: `CREATE/* c */TABLE`
/// yields `CREATE TABLE`, not `CREATETABLE`.
///
/// NOTE(review): backtick-quoted identifiers and backslash escapes are not
/// recognized here - confirm the exporter never emits them.
fn parse_ddl_statements(content: &str) -> Vec<String> {
    /// Moves the trimmed text in `current` into `statements` when it is not
    /// empty, then resets `current`.
    fn flush_statement(current: &mut String, statements: &mut Vec<String>) {
        let statement = current.trim();
        if !statement.is_empty() {
            statements.push(statement.to_string());
        }
        current.clear();
    }

    let mut statements = Vec::new();
    let mut current = String::new();
    let mut chars = content.chars().peekable();
    // The quote character (`'` or `"`) of the quoted text we are inside, if any.
    let mut open_quote: Option<char> = None;
    let mut in_line_comment = false;
    let mut in_block_comment = false;

    while let Some(ch) = chars.next() {
        if in_line_comment {
            // Drop everything up to the newline, but keep the newline itself
            // so the surrounding tokens stay separated.
            if ch == '\n' {
                in_line_comment = false;
                current.push('\n');
            }
            continue;
        }

        if in_block_comment {
            if ch == '*' && chars.peek() == Some(&'/') {
                chars.next();
                in_block_comment = false;
                // A comment counts as whitespace; without this space the
                // tokens around it would merge.
                current.push(' ');
            }
            continue;
        }

        if let Some(quote) = open_quote {
            current.push(ch);
            if ch == quote {
                if chars.peek() == Some(&quote) {
                    // Doubled quote: an escaped quote character, still inside
                    // the quoted text.
                    chars.next();
                    current.push(quote);
                } else {
                    open_quote = None;
                }
            }
            continue;
        }

        match ch {
            '-' if chars.peek() == Some(&'-') => {
                chars.next();
                in_line_comment = true;
            }
            '/' if chars.peek() == Some(&'*') => {
                chars.next();
                in_block_comment = true;
            }
            '\'' | '"' => {
                open_quote = Some(ch);
                current.push(ch);
            }
            ';' => flush_statement(&mut current, &mut statements),
            _ => current.push(ch),
        }
    }

    // The last statement may not be terminated by `;`.
    flush_statement(&mut current, &mut statements);
    statements
}
/// Wraps `sql` in a [`DdlStatement`].
///
/// Statements recognized by `is_schema_scoped_statement` are bound to
/// `schema` as their execution schema; every other statement gets no
/// execution schema.
fn ddl_statement_for_schema(schema: &str, sql: String) -> DdlStatement {
    if !is_schema_scoped_statement(&sql) {
        return DdlStatement::new(sql);
    }
    DdlStatement::with_execution_schema(sql, schema.to_string())
}
/// Tells whether `sql` is a `CREATE [OR REPLACE] [EXTERNAL] TABLE|VIEW`
/// statement.
///
/// Keywords are matched case-insensitively and on word boundaries. A
/// statement that has `OR` after `CREATE` but no `REPLACE` after that is
/// rejected.
fn is_schema_scoped_statement(sql: &str) -> bool {
    /// Returns the text following `keyword`, with leading whitespace removed,
    /// when `input` begins with that keyword.
    fn after_keyword<'a>(input: &'a str, keyword: &str) -> Option<&'a str> {
        if !starts_with_keyword(input, keyword) {
            return None;
        }
        input.get(keyword.len()..).map(str::trim_start)
    }

    let Some(mut rest) = after_keyword(sql.trim_start(), "CREATE") else {
        return false;
    };

    // Optional `OR REPLACE`; a lone `OR` is not accepted.
    if let Some(after_or) = after_keyword(rest, "OR") {
        match after_keyword(after_or, "REPLACE") {
            Some(after_replace) => rest = after_replace,
            None => return false,
        }
    }

    // Optional `EXTERNAL`.
    if let Some(after_external) = after_keyword(rest, "EXTERNAL") {
        rest = after_external;
    }

    starts_with_keyword(rest, "TABLE") || starts_with_keyword(rest, "VIEW")
}
/// Checks that `input` begins with `keyword` (ASCII case-insensitive) and
/// that the keyword ends on a word boundary.
///
/// The boundary holds when the input ends right after the keyword, or when
/// the following byte is neither an ASCII letter or digit nor `_`.
fn starts_with_keyword(input: &str, keyword: &str) -> bool {
    // `get` yields `None` when the input is too short or the cut would fall
    // inside a multi-byte character.
    let Some(head) = input.get(..keyword.len()) else {
        return false;
    };
    if !head.eq_ignore_ascii_case(keyword) {
        return false;
    }
    match input.as_bytes().get(keyword.len()) {
        None => true,
        Some(next) => !(next.is_ascii_alphanumeric() || *next == b'_'),
    }
}
/// Maps a user-supplied schema filter onto the schema names stored in the
/// snapshot manifest.
///
/// Each requested name is matched ASCII case-insensitively against
/// `manifest_schemas`; the first matching manifest name is used, so the
/// result carries the manifest's casing. Requests resolving to a name that
/// was already selected (compared in lowercase) are dropped. The output keeps
/// the order of `filter`.
///
/// Fails with a schema-not-in-snapshot error for the first requested name
/// that has no match.
fn canonicalize_schema_filter(
    filter: &[String],
    manifest_schemas: &[String],
) -> Result<Vec<String>> {
    let mut seen_lowercase = HashSet::new();
    let mut selected = Vec::new();
    for requested in filter {
        let matched = manifest_schemas
            .iter()
            .find(|candidate| candidate.eq_ignore_ascii_case(requested));
        let Some(canonical) = matched else {
            return SchemaNotInSnapshotSnafu {
                schema: requested.clone(),
            }
            .fail();
        };
        if seen_lowercase.insert(canonical.to_ascii_lowercase()) {
            selected.push(canonical.clone());
        }
    }
    Ok(selected)
}
/// Ensures every chunk is either `Completed` or `Skipped`.
///
/// Fails with an incomplete-snapshot error naming the first chunk, in slice
/// order, that has any other status.
fn validate_chunk_statuses(chunks: &[ChunkMeta]) -> Result<()> {
    for chunk in chunks {
        match chunk.status {
            ChunkStatus::Completed | ChunkStatus::Skipped => {}
            _ => {
                return IncompleteSnapshotSnafu {
                    chunk_id: chunk.id,
                    status: chunk.status,
                }
                .fail();
            }
        }
    }
    Ok(())
}
/// Renders the dry-run data import plan as SQL comment lines.
///
/// The output starts with a header line, followed for every chunk by a line
/// with its id and status and one line per schema in `schemas` that has files
/// in that chunk.
fn format_data_import_plan(chunks: &[ChunkMeta], schemas: &[String]) -> Vec<String> {
    let mut plan = Vec::new();
    plan.push("-- Data import plan:".to_string());
    for chunk in chunks {
        plan.push(format!("-- Chunk {}: {:?}", chunk.id, chunk.status));
        let copied_schemas = schemas
            .iter()
            .filter(|schema| chunk_has_schema_files(chunk, schema));
        plan.extend(copied_schemas.map(|schema| format!("-- {} -> COPY DATABASE FROM", schema)));
    }
    plan
}
/// Validates the data part of a snapshot before anything is imported.
///
/// Checks, in order: all chunk statuses are acceptable; then, for every chunk
/// that is not `Skipped`, its file list is not empty and, for each requested
/// schema it has files for, the matching data directory exists in storage.
/// The first failing check is returned as an error.
async fn validate_data_snapshot(
    storage: &dyn SnapshotStorage,
    chunks: &[ChunkMeta],
    schemas: &[String],
) -> Result<()> {
    validate_chunk_statuses(chunks)?;

    // One listing of the storage serves every chunk/schema lookup below.
    let actual_prefixes = collect_chunk_data_prefixes(storage).await?;

    let exported_chunks = chunks
        .iter()
        .filter(|chunk| chunk.status != ChunkStatus::Skipped);
    for chunk in exported_chunks {
        if chunk.files.is_empty() {
            return EmptyChunkManifestSnafu { chunk_id: chunk.id }.fail();
        }
        schemas.iter().try_for_each(|schema| {
            validate_chunk_schema_files(chunk, schema, &actual_prefixes).map(|_| ())
        })?;
    }
    Ok(())
}
/// Lists everything under `data/` in the snapshot storage and returns the set
/// of `data/<schema>/<chunk_id>/` prefixes found there.
///
/// A leading `/` on a listed path is ignored. Paths with fewer than three
/// `/`-separated segments, or whose first segment is not `data`, are skipped.
async fn collect_chunk_data_prefixes(storage: &dyn SnapshotStorage) -> Result<HashSet<String>> {
    let listed = storage
        .list_files_recursive("data/")
        .await
        .context(SnapshotStorageSnafu)?;

    let mut prefixes = HashSet::new();
    for path in &listed {
        // Only the first three segments matter; the fourth piece is whatever
        // remains of the path.
        let mut segments = path.trim_start_matches('/').splitn(4, '/');
        if let (Some("data"), Some(schema), Some(chunk_id)) =
            (segments.next(), segments.next(), segments.next())
        {
            prefixes.insert(format!("data/{schema}/{chunk_id}/"));
        }
    }
    Ok(prefixes)
}
/// Checks that the data directory of `schema` in `chunk` exists in storage.
///
/// Returns `Ok(false)` when the chunk's file list has no entry for the
/// schema, `Ok(true)` when it has and the directory prefix is present in
/// `actual_prefixes`, and a missing-chunk-data error when the file list names
/// the schema but the prefix is absent.
fn validate_chunk_schema_files(
    chunk: &ChunkMeta,
    schema: &str,
    actual_prefixes: &HashSet<String>,
) -> Result<bool> {
    if !chunk_has_schema_files(chunk, schema) {
        return Ok(false);
    }
    let prefix = data_dir_for_schema_chunk(schema, chunk.id);
    if actual_prefixes.contains(&prefix) {
        return Ok(true);
    }
    MissingChunkDataSnafu {
        chunk_id: chunk.id,
        schema: schema.to_string(),
        path: prefix,
    }
    .fail()
}
// Unit tests for the import command helpers: DDL splitting, schema filter
// canonicalization, statement classification and snapshot data validation.
#[cfg(test)]
mod tests {
use std::collections::{HashMap, HashSet};
use async_trait::async_trait;
use super::*;
use crate::data::export_v2::manifest::{ChunkMeta, ChunkStatus, Manifest, TimeRange};
use crate::data::export_v2::schema::SchemaSnapshot;
use crate::data::snapshot_storage::SnapshotStorage;
// In-memory storage stub. Only `exists`, `read_manifest` and
// `list_files_recursive` are implemented; every other method panics through
// `unimplemented!`.
struct StubStorage {
manifest: Manifest,
// Maps a directory prefix to the files listed under it.
files_by_prefix: HashMap<String, Vec<String>>,
}
#[async_trait]
impl SnapshotStorage for StubStorage {
async fn exists(&self) -> crate::data::export_v2::error::Result<bool> {
Ok(true)
}
async fn read_manifest(&self) -> crate::data::export_v2::error::Result<Manifest> {
Ok(self.manifest.clone())
}
async fn write_manifest(
&self,
_manifest: &Manifest,
) -> crate::data::export_v2::error::Result<()> {
unimplemented!("not needed in import_v2::command tests")
}
async fn read_text(&self, _path: &str) -> crate::data::export_v2::error::Result<String> {
unimplemented!("not needed in import_v2::command tests")
}
async fn write_text(
&self,
_path: &str,
_content: &str,
) -> crate::data::export_v2::error::Result<()> {
unimplemented!("not needed in import_v2::command tests")
}
async fn write_schema(
&self,
_snapshot: &SchemaSnapshot,
) -> crate::data::export_v2::error::Result<()> {
unimplemented!("not needed in import_v2::command tests")
}
async fn create_dir_all(&self, _path: &str) -> crate::data::export_v2::error::Result<()> {
unimplemented!("not needed in import_v2::command tests")
}
// Returns the files of every entry whose key starts with `prefix`.
async fn list_files_recursive(
&self,
prefix: &str,
) -> crate::data::export_v2::error::Result<Vec<String>> {
Ok(self
.files_by_prefix
.iter()
.filter(|(candidate, _)| candidate.starts_with(prefix))
.flat_map(|(_, files)| files.clone())
.collect())
}
async fn delete_snapshot(&self) -> crate::data::export_v2::error::Result<()> {
unimplemented!("not needed in import_v2::command tests")
}
}
// Line comments are dropped and each `;`-terminated statement is returned.
#[test]
fn test_parse_ddl_statements() {
let content = r#"
-- Schema: public
CREATE DATABASE public;
CREATE TABLE t (ts TIMESTAMP TIME INDEX, host STRING, PRIMARY KEY (host)) ENGINE=mito;
-- comment
CREATE VIEW v AS SELECT * FROM t;
"#;
let statements = parse_ddl_statements(content);
assert_eq!(statements.len(), 3);
assert!(statements[0].starts_with("CREATE DATABASE public"));
assert!(statements[1].starts_with("CREATE TABLE t"));
assert!(statements[2].starts_with("CREATE VIEW v"));
}
// A `;` inside a single-quoted literal must not split the statement.
#[test]
fn test_parse_ddl_statements_preserves_semicolons_in_string_literals() {
let content = r#"
CREATE TABLE t (
host STRING DEFAULT 'a;b'
);
CREATE VIEW v AS SELECT ';' AS marker;
"#;
let statements = parse_ddl_statements(content);
assert_eq!(statements.len(), 2);
assert!(statements[0].contains("'a;b'"));
assert!(statements[1].contains("';' AS marker"));
}
// A `;` inside a block comment must not split the statement either.
#[test]
fn test_parse_ddl_statements_handles_comments_without_splitting() {
let content = r#"
-- leading comment
CREATE TABLE t (ts TIMESTAMP TIME INDEX); /* block; comment */
CREATE VIEW v AS SELECT 1;
"#;
let statements = parse_ddl_statements(content);
assert_eq!(statements.len(), 2);
assert!(statements[0].starts_with("CREATE TABLE t"));
assert!(statements[1].starts_with("CREATE VIEW v"));
}
// The filter is matched case-insensitively and the manifest casing is returned.
#[test]
fn test_canonicalize_schema_filter_uses_manifest_casing() {
let filter = vec!["TEST_DB".to_string(), "PUBLIC".to_string()];
let manifest_schemas = vec!["test_db".to_string(), "public".to_string()];
let canonicalized = canonicalize_schema_filter(&filter, &manifest_schemas).unwrap();
assert_eq!(canonicalized, vec!["test_db", "public"]);
}
// Requests that differ only in case collapse into a single entry.
#[test]
fn test_canonicalize_schema_filter_dedupes_case_insensitive_matches() {
let filter = vec![
"TEST_DB".to_string(),
"test_db".to_string(),
"PUBLIC".to_string(),
"public".to_string(),
];
let manifest_schemas = vec!["test_db".to_string(), "public".to_string()];
let canonicalized = canonicalize_schema_filter(&filter, &manifest_schemas).unwrap();
assert_eq!(canonicalized, vec!["test_db", "public"]);
}
// A requested schema that is not in the manifest is an error naming it.
#[test]
fn test_canonicalize_schema_filter_rejects_missing_schema() {
let filter = vec!["missing".to_string()];
let manifest_schemas = vec!["test_db".to_string()];
let error = canonicalize_schema_filter(&filter, &manifest_schemas)
.expect_err("missing schema should fail")
.to_string();
assert!(error.contains("missing"));
}
// CREATE TABLE / VIEW variants are bound to the schema they were read for.
#[test]
fn test_ddl_statement_for_schema_create_table_uses_execution_schema() {
let stmt = ddl_statement_for_schema(
"test_db",
"CREATE TABLE metrics (ts TIMESTAMP TIME INDEX) ENGINE=mito".to_string(),
);
assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
}
#[test]
fn test_ddl_statement_for_schema_create_view_uses_execution_schema() {
let stmt = ddl_statement_for_schema(
"test_db",
"CREATE VIEW metrics_view AS SELECT * FROM metrics".to_string(),
);
assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
}
#[test]
fn test_ddl_statement_for_schema_create_or_replace_view_uses_execution_schema() {
let stmt = ddl_statement_for_schema(
"test_db",
"CREATE OR REPLACE VIEW metrics_view AS SELECT * FROM metrics".to_string(),
);
assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
}
#[test]
fn test_ddl_statement_for_schema_create_external_table_uses_execution_schema() {
let stmt = ddl_statement_for_schema(
"test_db",
"CREATE EXTERNAL TABLE IF NOT EXISTS ext_metrics (ts TIMESTAMP TIME INDEX) ENGINE=file"
.to_string(),
);
assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
}
// CREATE DATABASE is not schema-scoped, so no execution schema is attached.
#[test]
fn test_ddl_statement_for_schema_create_database_uses_public_context() {
let stmt = ddl_statement_for_schema("test_db", "CREATE DATABASE test_db".to_string());
assert_eq!(stmt.execution_schema, None);
}
// A keyword only matches when it is not the prefix of a longer word.
#[test]
fn test_starts_with_keyword_requires_word_boundary() {
assert!(starts_with_keyword("CREATE TABLE t", "CREATE"));
assert!(!starts_with_keyword("CREATED TABLE t", "CREATE"));
assert!(!starts_with_keyword("TABLESPACE foo", "TABLE"));
}
#[test]
fn test_validate_chunk_statuses_rejects_failed_chunk() {
let mut failed = ChunkMeta::new(3, TimeRange::unbounded());
failed.status = ChunkStatus::Failed;
let error = validate_chunk_statuses(&[failed]).expect_err("failed chunk should error");
assert!(error.to_string().contains("Incomplete snapshot"));
}
#[test]
fn test_validate_chunk_statuses_accepts_completed_and_skipped_chunks() {
let mut completed = ChunkMeta::new(1, TimeRange::unbounded());
completed.status = ChunkStatus::Completed;
let skipped = ChunkMeta::skipped(2, TimeRange::unbounded());
assert!(validate_chunk_statuses(&[completed, skipped]).is_ok());
}
// The manifest stores the non-ASCII schema name percent-encoded; the lookup
// is done with the plain name.
#[test]
fn test_chunk_has_schema_files_matches_encoded_schema_prefix() {
let mut chunk = ChunkMeta::new(7, TimeRange::unbounded());
chunk.files = vec![
"data/public/7/a.parquet".to_string(),
"data/%E6%B5%8B%E8%AF%95/7/b.parquet".to_string(),
];
assert!(chunk_has_schema_files(&chunk, "public"));
assert!(chunk_has_schema_files(&chunk, "测试"));
assert!(!chunk_has_schema_files(&chunk, "metrics"));
}
// Only schemas with files in a chunk show up under that chunk in the plan.
#[test]
fn test_format_data_import_plan_includes_matching_schemas_only() {
let mut completed = ChunkMeta::new(1, TimeRange::unbounded());
completed.status = ChunkStatus::Completed;
completed.files = vec![
"data/public/1/a.parquet".to_string(),
"data/%E6%B5%8B%E8%AF%95/1/b.parquet".to_string(),
];
let skipped = ChunkMeta::skipped(2, TimeRange::unbounded());
let lines = format_data_import_plan(
&[completed, skipped],
&[
"public".to_string(),
"测试".to_string(),
"metrics".to_string(),
],
);
assert_eq!(lines[0], "-- Data import plan:");
assert!(lines.contains(&"-- Chunk 1: Completed".to_string()));
assert!(lines.contains(&"-- public -> COPY DATABASE FROM".to_string()));
assert!(lines.contains(&"-- 测试 -> COPY DATABASE FROM".to_string()));
assert!(!lines.contains(&"-- metrics -> COPY DATABASE FROM".to_string()));
assert!(lines.contains(&"-- Chunk 2: Skipped".to_string()));
}
#[tokio::test]
async fn test_collect_chunk_data_prefixes_indexes_present_prefixes() {
let storage = StubStorage {
manifest: Manifest::new_schema_only("greptime".to_string(), vec!["public".to_string()]),
files_by_prefix: HashMap::from([
(
"data/public/7/".to_string(),
vec!["data/public/7/a.parquet".to_string()],
),
(
"data/%E6%B5%8B%E8%AF%95/9/".to_string(),
vec!["data/%E6%B5%8B%E8%AF%95/9/b.parquet".to_string()],
),
]),
};
let prefixes = collect_chunk_data_prefixes(&storage).await.unwrap();
assert!(prefixes.contains("data/public/7/"));
assert!(prefixes.contains("data/%E6%B5%8B%E8%AF%95/9/"));
}
#[test]
fn test_validate_chunk_schema_files_accepts_present_prefix() {
let mut chunk = ChunkMeta::new(7, TimeRange::unbounded());
chunk.files = vec!["data/public/7/a.parquet".to_string()];
let actual_prefixes = HashSet::from(["data/public/7/".to_string()]);
assert!(validate_chunk_schema_files(&chunk, "public", &actual_prefixes).unwrap());
}
// The manifest lists files for the schema but storage has no such directory.
#[test]
fn test_validate_chunk_schema_files_rejects_missing_prefix() {
let mut chunk = ChunkMeta::new(7, TimeRange::unbounded());
chunk.files = vec!["data/public/7/a.parquet".to_string()];
let error = validate_chunk_schema_files(&chunk, "public", &HashSet::new())
.expect_err("missing chunk prefix should fail")
.to_string();
assert!(error.contains("marked completed but no files were found"));
}
// A schema without files in the chunk is reported as `false`, not as an error.
#[test]
fn test_validate_chunk_schema_files_skips_absent_schema() {
let mut chunk = ChunkMeta::new(7, TimeRange::unbounded());
chunk.files = vec!["data/public/7/a.parquet".to_string()];
assert!(!validate_chunk_schema_files(&chunk, "metrics", &HashSet::new()).unwrap());
}
#[tokio::test]
async fn test_validate_data_snapshot_rejects_failed_chunk_before_dry_run() {
let mut failed = ChunkMeta::new(3, TimeRange::unbounded());
failed.status = ChunkStatus::Failed;
let storage = StubStorage {
manifest: Manifest::new_schema_only("greptime".to_string(), vec!["public".to_string()]),
files_by_prefix: HashMap::new(),
};
let error = validate_data_snapshot(&storage, &[failed], &["public".to_string()])
.await
.expect_err("failed chunk should reject dry-run validation")
.to_string();
assert!(error.contains("Incomplete snapshot"));
}
#[tokio::test]
async fn test_validate_data_snapshot_rejects_missing_chunk_prefix_before_dry_run() {
let mut completed = ChunkMeta::new(7, TimeRange::unbounded());
completed.status = ChunkStatus::Completed;
completed.files = vec!["data/public/7/a.parquet".to_string()];
let storage = StubStorage {
manifest: Manifest::new_schema_only("greptime".to_string(), vec!["public".to_string()]),
files_by_prefix: HashMap::new(),
};
let error = validate_data_snapshot(&storage, &[completed], &["public".to_string()])
.await
.expect_err("missing chunk prefix should reject dry-run validation")
.to_string();
assert!(error.contains("marked completed but no files were found"));
}
// A completed chunk whose file list is empty is rejected outright.
#[tokio::test]
async fn test_validate_data_snapshot_rejects_completed_chunk_with_empty_manifest() {
let mut completed = ChunkMeta::new(7, TimeRange::unbounded());
completed.status = ChunkStatus::Completed;
let storage = StubStorage {
manifest: Manifest::new_schema_only("greptime".to_string(), vec!["public".to_string()]),
files_by_prefix: HashMap::new(),
};
let error = validate_data_snapshot(&storage, &[completed], &["public".to_string()])
.await
.expect_err("empty completed chunk should reject validation")
.to_string();
assert!(error.contains("file manifest is empty"));
}
}

View File

@@ -0,0 +1,695 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::BTreeSet;
use std::path::{Path, PathBuf};
use std::time::Instant;
use async_trait::async_trait;
use common_telemetry::{info, warn};
use crate::data::export_v2::manifest::{ChunkMeta, ChunkStatus};
use crate::data::import_v2::error::{
ImportStateDdlIncompleteSnafu, ImportStateMismatchSnafu, Result,
};
use crate::data::import_v2::state::{
ImportState, ImportStateLockGuard, ImportTaskKey, ImportTaskStatus, canonical_schema_selection,
delete_import_state, load_import_state, save_import_state, try_acquire_import_state_lock,
};
use crate::data::path::data_dir_for_schema_chunk;
// Abstraction over "import the data of one task" so the resume loop in
// `import_with_resume_session` does not depend on how data is copied.
#[async_trait]
pub(crate) trait ImportTaskExecutor {
// Imports the data identified by `task` (a chunk id plus a schema name).
// An `Err` makes the resume loop record the task as failed and stop.
async fn import_task(&self, task: &ImportTaskKey) -> Result<()>;
}
// Everything that identifies one import request. A state file found on disk
// is only reused when its snapshot id, target address, catalog, schema
// selection and task set all match these values (see `validate_state_matches`).
pub(crate) struct ImportResumeConfig {
pub(crate) snapshot_id: String,
// Compared literally against the address stored in the state file.
pub(crate) target_addr: String,
pub(crate) catalog: String,
pub(crate) schemas: Vec<String>,
// Location of the state file; the import state lock is acquired from this
// path as well.
pub(crate) state_path: PathBuf,
// Tasks to run, in execution order; task keys must be unique.
pub(crate) tasks: Vec<ImportTaskKey>,
}
// A prepared import: the request, the state that was loaded or freshly
// created for it, and the guard of the state lock. Produced by
// `prepare_import_resume` and consumed by `import_with_resume_session`.
pub(crate) struct ImportResumeSession {
config: ImportResumeConfig,
state: ImportState,
// Guard returned by `try_acquire_import_state_lock`; kept for the lifetime of
// the session and dropped explicitly once the data import has finished.
lock: ImportStateLockGuard,
}
impl ImportResumeSession {
    /// Reports whether the session's state already has DDL recorded as
    /// completed, in which case the caller should not execute DDL again.
    pub(crate) fn should_skip_ddl(&self) -> bool {
        let Self { state, .. } = self;
        state.ddl_completed
    }

    /// Records DDL completion in the state and writes the state file.
    ///
    /// Call this right after DDL succeeded on a session that did not skip
    /// DDL, so that a crash later on resumes with the data import instead of
    /// executing DDL a second time.
    pub(crate) async fn mark_ddl_completed(&mut self) -> Result<()> {
        let Self { config, state, .. } = self;
        state.mark_ddl_completed();
        save_import_state(&config.state_path, state).await
    }
}
/// Tells whether the chunk's file list contains at least one path under the
/// data directory of `schema` for this chunk. A leading `/` on a listed path
/// is ignored.
pub(crate) fn chunk_has_schema_files(chunk: &ChunkMeta, schema: &str) -> bool {
    let prefix = data_dir_for_schema_chunk(schema, chunk.id);
    for path in &chunk.files {
        if path.trim_start_matches('/').starts_with(&prefix) {
            return true;
        }
    }
    false
}
/// Builds the list of import tasks: one per (chunk, schema) pair for which
/// the chunk lists files of that schema.
///
/// `Skipped` chunks produce no tasks. Tasks are ordered by chunk first and by
/// the order of `schemas` within a chunk.
pub(crate) fn build_import_tasks(chunks: &[ChunkMeta], schemas: &[String]) -> Vec<ImportTaskKey> {
    let mut tasks = Vec::new();
    let exported_chunks = chunks
        .iter()
        .filter(|chunk| chunk.status != ChunkStatus::Skipped);
    for chunk in exported_chunks {
        // TODO: build a per-chunk schema index if chunk file manifests become large.
        let schemas_with_files = schemas
            .iter()
            .filter(|schema| chunk_has_schema_files(chunk, schema));
        tasks.extend(schemas_with_files.map(|schema| ImportTaskKey::new(chunk.id, schema.clone())));
    }
    tasks
}
/// Prepares a resumable import for `config`.
///
/// Order of operations: reject duplicate task keys, acquire the state lock,
/// then load the state file. An existing state must match the request and is
/// reused as is; when there is none, a fresh state is created and written
/// immediately.
pub(crate) async fn prepare_import_resume(
    config: ImportResumeConfig,
) -> Result<ImportResumeSession> {
    // Check the request before the state file or the lock is touched.
    // Duplicate task keys would break the resume bookkeeping, because status
    // lookups use a linear `find()` and only ever see the first match.
    validate_config_tasks(&config)?;

    let lock = try_acquire_import_state_lock(&config.state_path)?;

    let existing = load_import_state(&config.state_path).await?;
    let state = if let Some(loaded) = existing {
        validate_state_matches(&loaded, &config)?;
        loaded
    } else {
        // Write the fresh state right away so that a crash from here on can
        // be picked up as a resume. A loaded state with `ddl_completed=false`
        // therefore means an earlier run got this far without confirming DDL
        // completion, and DDL has to be (re-)run before data import.
        let fresh = ImportState::new(
            &config.snapshot_id,
            &config.target_addr,
            &config.catalog,
            &config.schemas,
            config.tasks.clone(),
        );
        save_import_state(&config.state_path, &fresh).await?;
        fresh
    };

    Ok(ImportResumeSession {
        config,
        state,
        lock,
    })
}
/// Runs the data import of a prepared session, task by task.
///
/// Tasks already recorded as `Completed` are skipped. Every other task is
/// marked `InProgress` (and the state saved) before `executor` runs it, and
/// `Completed` or `Failed` afterwards. The state file is deleted once all
/// tasks are done.
///
/// # Errors
///
/// * DDL-incomplete error when the session's state does not have DDL marked
///   completed.
/// * The executor's error for the first failing task; persisting the `Failed`
///   status is best-effort and only logged when it fails.
/// * A state persistence error. When this happens right after a task
///   succeeded, the success is logged first so it is visible that the data
///   was imported even though the state file does not say so.
pub(crate) async fn import_with_resume_session<E>(
    session: ImportResumeSession,
    executor: &E,
) -> Result<()>
where
    E: ImportTaskExecutor + Sync,
{
    let ImportResumeSession {
        config,
        mut state,
        lock,
    } = session;

    // The state machine requires DDL to be explicitly marked completed before
    // data import; otherwise a caller could import data and leave a state that
    // replays DDL on the next resume. Surface the misuse instead of silently
    // importing.
    if !state.ddl_completed {
        return ImportStateDdlIncompleteSnafu {
            path: config.state_path.display().to_string(),
        }
        .fail();
    }

    let completed = state
        .tasks
        .iter()
        .filter(|task| task.status == ImportTaskStatus::Completed)
        .count();
    info!(
        "Import resume state: {} completed, {} pending, path: {}",
        completed,
        state.tasks.len().saturating_sub(completed),
        config.state_path.display()
    );

    let import_start = Instant::now();
    let total = config.tasks.len();
    for (idx, task) in config.tasks.iter().enumerate() {
        if state.task_status(task.chunk_id, &task.schema) == Some(ImportTaskStatus::Completed) {
            info!(
                "[{}/{}] Chunk {} schema {}: already completed, skipped",
                idx + 1,
                total,
                task.chunk_id,
                task.schema
            );
            continue;
        }

        info!(
            "[{}/{}] Chunk {} schema {}: importing...",
            idx + 1,
            total,
            task.chunk_id,
            task.schema
        );
        // Persist InProgress before running the task so an interrupted run
        // leaves a trace of which task was being worked on.
        state.set_task_status(
            task.chunk_id,
            &task.schema,
            ImportTaskStatus::InProgress,
            None,
        )?;
        save_import_state(&config.state_path, &state).await?;

        let task_start = Instant::now();
        let result = executor.import_task(task).await;
        match result {
            Ok(()) => {
                // The task itself succeeded. If the Completed marker cannot
                // be persisted, the next resume will replay the task
                // (potentially duplicating data depending on engine
                // semantics). Log that the import of this task succeeded
                // before returning the persist error, so the operator can
                // tell the two situations apart.
                if let Err(persist_error) = update_status_and_save(
                    &config,
                    &mut state,
                    task,
                    ImportTaskStatus::Completed,
                    None,
                )
                .await
                {
                    warn!(
                        "[{}/{}] Chunk {} schema {}: imported in {:?}, but persisting the Completed status failed; a resume will replay this task: {}",
                        idx + 1,
                        total,
                        task.chunk_id,
                        task.schema,
                        task_start.elapsed(),
                        persist_error
                    );
                    return Err(persist_error);
                }
                info!(
                    "[{}/{}] Chunk {} schema {}: done in {:?}",
                    idx + 1,
                    total,
                    task.chunk_id,
                    task.schema,
                    task_start.elapsed()
                );
            }
            Err(task_error) => {
                // Persist Failed best-effort, but always surface the original
                // task error to the caller. State persistence problems are
                // logged so they are not silently lost.
                if let Err(persist_error) = update_status_and_save(
                    &config,
                    &mut state,
                    task,
                    ImportTaskStatus::Failed,
                    Some(task_error.to_string()),
                )
                .await
                {
                    warn!(
                        "Failed to persist Failed status for chunk {} schema {} after task error ({}); state file may be out of date: {}",
                        task.chunk_id, task.schema, task_error, persist_error
                    );
                }
                return Err(task_error);
            }
        }
    }

    // Everything is imported: the state file is no longer needed.
    delete_import_state(&config.state_path).await?;
    info!("Data import finished in {:?}", import_start.elapsed());
    // Release the state lock only after the state file is gone.
    drop(lock);
    Ok(())
}
/// Sets the status of `task` in `state` and writes the state file.
///
/// The status update itself can fail; judging by the caller, that would mean
/// the task is unknown to the state, which points to a logic bug because the
/// task comes from the same config. The error is passed on to the caller
/// rather than hidden.
async fn update_status_and_save(
    config: &ImportResumeConfig,
    state: &mut ImportState,
    task: &ImportTaskKey,
    status: ImportTaskStatus,
    error_message: Option<String>,
) -> Result<()> {
    let state_path = &config.state_path;
    state.set_task_status(task.chunk_id, &task.schema, status, error_message)?;
    save_import_state(state_path, &*state).await
}
/// Verifies that a state file loaded from disk belongs to the current request.
///
/// Compared in this order: snapshot id, target address, catalog, canonical
/// schema selection, and finally the set of task keys. The first difference
/// is reported as a state-mismatch error. Duplicate task keys on either side
/// are reported as errors too.
fn validate_state_matches(state: &ImportState, config: &ImportResumeConfig) -> Result<()> {
    // Target addresses are compared literally; hostname normalization is left to the caller.
    let identity_mismatch = if state.snapshot_id != config.snapshot_id {
        Some(format!(
            "snapshot_id differs (state: {}, requested: {})",
            state.snapshot_id, config.snapshot_id
        ))
    } else if state.target_addr != config.target_addr {
        Some(format!(
            "target_addr differs (state: {}, requested: {})",
            state.target_addr, config.target_addr
        ))
    } else if state.catalog != config.catalog {
        Some(format!(
            "catalog differs (state: {}, requested: {})",
            state.catalog, config.catalog
        ))
    } else {
        None
    };
    if let Some(reason) = identity_mismatch {
        return state_mismatch(config, reason);
    }

    let requested_schemas = canonical_schema_selection(&config.schemas);
    if state.schemas != requested_schemas {
        let reason = format!(
            "schemas differ (state: {:?}, requested: {:?})",
            state.schemas, requested_schemas
        );
        return state_mismatch(config, reason);
    }

    // Task order does not matter here, only the set of task keys.
    let recorded_tasks = task_set_from_state(state, &config.state_path)?;
    let requested_tasks = task_set_from_config(config)?;
    if recorded_tasks != requested_tasks {
        return state_mismatch(config, "task set differs".to_string());
    }
    Ok(())
}
/// Builds the state-mismatch error for the state file of `config`, carrying
/// `reason`. Always returns `Err`.
fn state_mismatch(config: &ImportResumeConfig, reason: String) -> Result<()> {
    let path = config.state_path.display().to_string();
    ImportStateMismatchSnafu { path, reason }.fail()
}
/// Collects the `(chunk_id, schema)` keys of all tasks recorded in `state`.
///
/// Fails with a state-mismatch error, naming `state_path`, on the first key
/// that occurs twice.
fn task_set_from_state<'a>(
    state: &'a ImportState,
    state_path: &Path,
) -> Result<BTreeSet<(u32, &'a str)>> {
    let mut seen = BTreeSet::new();
    for task in &state.tasks {
        let key = (task.chunk_id, task.schema.as_str());
        if seen.insert(key) {
            continue;
        }
        return ImportStateMismatchSnafu {
            path: state_path.display().to_string(),
            reason: format!(
                "duplicate task key in state (chunk_id: {}, schema: {})",
                task.chunk_id, task.schema
            ),
        }
        .fail();
    }
    Ok(seen)
}
/// Collects the `(chunk_id, schema)` keys of all tasks requested in `config`.
///
/// Fails with a state-mismatch error, naming the config's state path, on the
/// first key that occurs twice.
fn task_set_from_config(config: &ImportResumeConfig) -> Result<BTreeSet<(u32, &str)>> {
    let mut seen = BTreeSet::new();
    for task in &config.tasks {
        let key = (task.chunk_id, task.schema.as_str());
        if seen.insert(key) {
            continue;
        }
        return ImportStateMismatchSnafu {
            path: config.state_path.display().to_string(),
            reason: format!(
                "duplicate task key in request (chunk_id: {}, schema: {})",
                task.chunk_id, task.schema
            ),
        }
        .fail();
    }
    Ok(seen)
}
/// Succeeds when the tasks requested by `config` contain no duplicate
/// `(chunk_id, schema)` key; the collected set itself is discarded.
fn validate_config_tasks(config: &ImportResumeConfig) -> Result<()> {
    let _unique_tasks = task_set_from_config(config)?;
    Ok(())
}
#[cfg(test)]
mod tests {
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex};
use super::*;
use crate::data::export_v2::manifest::{ChunkMeta, TimeRange};
use crate::data::import_v2::error::TestTaskFailedSnafu;
/// How `RecordingExecutor` fails when asked to import its `fail_task`.
#[derive(Debug, Clone, Copy)]
enum FailureMode {
    /// Every attempt fails with a non-retryable error.
    Fatal,
    /// Attempts fail with a retryable error while the executor's attempt
    /// counter is still below `failures`; later attempts succeed.
    RetryableThenSuccess { failures: usize },
}
/// Test double for `ImportTaskExecutor` that records successful imports and
/// can inject a failure for one chosen task.
struct RecordingExecutor {
    /// Keys of the tasks whose import succeeded, in call order.
    imported: Arc<Mutex<Vec<ImportTaskKey>>>,
    /// Task for which failures are injected; `None` disables injection.
    fail_task: Option<ImportTaskKey>,
    /// Kind of failure injected for `fail_task`.
    failure_mode: Option<FailureMode>,
    /// Number of `import_task` calls made so far, across all tasks.
    attempts: Arc<AtomicUsize>,
}
#[async_trait]
impl ImportTaskExecutor for RecordingExecutor {
    /// Counts the call, fails it when a failure is configured for `task`,
    /// and otherwise records `task` as imported.
    async fn import_task(&self, task: &ImportTaskKey) -> Result<()> {
        // `fetch_add` yields the counter value before the increment, so the
        // first call observes attempt 0.
        let attempt = self.attempts.fetch_add(1, Ordering::SeqCst);
        let targets_failing_task = self.fail_task.as_ref() == Some(task);

        let injected = match self.failure_mode {
            Some(FailureMode::Fatal) if targets_failing_task => Some(("fatal failure", false)),
            Some(FailureMode::RetryableThenSuccess { failures })
                if targets_failing_task && attempt < failures =>
            {
                Some(("retryable failure", true))
            }
            _ => None,
        };
        if let Some((message, retryable)) = injected {
            return TestTaskFailedSnafu {
                message: message.to_string(),
                retryable,
            }
            .fail();
        }

        self.imported.lock().unwrap().push(task.clone());
        Ok(())
    }
}
/// Builds an executor that never fails and appends every imported task to
/// `imported`.
fn recording_executor(imported: Arc<Mutex<Vec<ImportTaskKey>>>) -> RecordingExecutor {
    let attempts = Arc::new(AtomicUsize::new(0));
    RecordingExecutor {
        imported,
        fail_task: None,
        failure_mode: None,
        attempts,
    }
}
/// Builds the resume configuration shared by the tests: a fixed snapshot,
/// target address, catalog and schema selection, with the given state file
/// path and task list.
fn config(path: PathBuf, tasks: Vec<ImportTaskKey>) -> ImportResumeConfig {
    let schemas = ["public", "analytics"]
        .iter()
        .map(|schema| schema.to_string())
        .collect();
    ImportResumeConfig {
        snapshot_id: String::from("snapshot-1"),
        target_addr: String::from("127.0.0.1:4000"),
        catalog: String::from("greptime"),
        schemas,
        state_path: path,
        tasks,
    }
}
/// Test driver that prepares a resume session, marks DDL as completed when
/// the session does not already skip it, and then runs the data import.
async fn run_import_with_resume<E>(config: ImportResumeConfig, executor: &E) -> Result<()>
where
    E: ImportTaskExecutor + Sync,
{
    // Mirror the production caller: mark DDL completed for fresh sessions
    // so the data-import guard is satisfied. Tests that want to exercise
    // the unsafe path drive prepare/import directly.
    let mut session = prepare_import_resume(config).await?;
    let ddl_already_done = session.should_skip_ddl();
    if !ddl_already_done {
        session.mark_ddl_completed().await?;
    }
    import_with_resume_session(session, executor).await
}
/// Only the completed chunk that has files under a selected schema yields a
/// task; the skipped chunk and the schema without files yield none.
#[test]
fn test_build_import_tasks_skips_skipped_chunks_and_missing_schema_files() {
    let completed_chunk = {
        let mut chunk = ChunkMeta::new(1, TimeRange::unbounded());
        chunk.status = ChunkStatus::Completed;
        chunk.files = vec!["data/public/1/file.parquet".to_string()];
        chunk
    };
    let skipped_chunk = {
        let mut chunk = ChunkMeta::new(2, TimeRange::unbounded());
        chunk.status = ChunkStatus::Skipped;
        chunk.files = vec!["data/public/2/file.parquet".to_string()];
        chunk
    };
    let selected_schemas = ["public".to_string(), "analytics".to_string()];

    let tasks = build_import_tasks(&[completed_chunk, skipped_chunk], &selected_schemas);

    assert_eq!(tasks, vec![ImportTaskKey::new(1, "public")]);
}
/// A task already marked completed in the persisted state is not imported
/// again, and the state file is gone once the import finishes.
#[tokio::test]
async fn test_import_with_resume_skips_completed_tasks() {
    let dir = tempfile::tempdir().unwrap();
    let state_path = dir.path().join("import_state.json");
    let selected_schemas = ["public".to_string(), "analytics".to_string()];
    let tasks = vec![
        ImportTaskKey::new(1, "public"),
        ImportTaskKey::new(2, "analytics"),
    ];

    // Seed a state file in which DDL and the first task are already done.
    let mut persisted = ImportState::new(
        "snapshot-1",
        "127.0.0.1:4000",
        "greptime",
        &selected_schemas,
        tasks.clone(),
    );
    persisted.mark_ddl_completed();
    persisted
        .set_task_status(1, "public", ImportTaskStatus::Completed, None)
        .unwrap();
    save_import_state(&state_path, &persisted).await.unwrap();

    let imported = Arc::new(Mutex::new(Vec::new()));
    let executor = recording_executor(Arc::clone(&imported));
    let resume_config = config(state_path.clone(), tasks);
    run_import_with_resume(resume_config, &executor)
        .await
        .unwrap();

    let recorded = imported.lock().unwrap().clone();
    assert_eq!(recorded, vec![ImportTaskKey::new(2, "analytics")]);
    let remaining_state = load_import_state(&state_path).await.unwrap();
    assert!(remaining_state.is_none());
}
#[tokio::test]
async fn test_import_with_resume_persists_failed_task() {
let dir = tempfile::tempdir().unwrap();
let path = dir.path().join("import_state.json");
let failed_task = ImportTaskKey::new(1, "public");
let tasks = vec![failed_task.clone()];
let imported = Arc::new(Mutex::new(Vec::new()));
let executor = RecordingExecutor {
imported,
fail_task: Some(failed_task.clone()),
failure_mode: Some(FailureMode::Fatal),
attempts: Arc::new(AtomicUsize::new(0)),
};
let error = run_import_with_resume(config(path.clone(), tasks), &executor)
.await
.unwrap_err();
assert!(matches!(
error,
crate::data::import_v2::error::Error::TestTaskFailed {
retryable: false,
..
}
));
let state = load_import_state(&path).await.unwrap().unwrap();
assert_eq!(
state.task_status(failed_task.chunk_id, &failed_task.schema),
Some(ImportTaskStatus::Failed)
);
}
/// A state file written for a different target address is rejected with
/// `ImportStateMismatch`.
#[tokio::test]
async fn test_import_with_resume_rejects_mismatched_state_identity() {
    let dir = tempfile::tempdir().unwrap();
    let state_path = dir.path().join("import_state.json");
    let selected_schemas = ["public".to_string(), "analytics".to_string()];
    let tasks = vec![ImportTaskKey::new(1, "public")];

    // The persisted target port (4001) differs from the one `config` uses.
    let foreign_state = ImportState::new(
        "snapshot-1",
        "127.0.0.1:4001",
        "greptime",
        &selected_schemas,
        tasks.clone(),
    );
    save_import_state(&state_path, &foreign_state).await.unwrap();

    let executor = recording_executor(Arc::new(Mutex::new(Vec::new())));
    let outcome = run_import_with_resume(config(state_path, tasks), &executor).await;

    let error = outcome.unwrap_err();
    assert!(matches!(
        error,
        crate::data::import_v2::error::Error::ImportStateMismatch { .. }
    ));
}
/// A fresh session reports that DDL must run, while a session resumed from a
/// state with DDL completed reports that DDL can be skipped.
#[tokio::test]
async fn test_prepare_import_resume_reports_existing_state_before_ddl() {
    let dir = tempfile::tempdir().unwrap();
    let tasks = vec![ImportTaskKey::new(1, "public")];

    {
        let fresh_path = dir.path().join("fresh_state.json");
        let fresh_session = prepare_import_resume(config(fresh_path, tasks.clone()))
            .await
            .unwrap();
        assert!(!fresh_session.should_skip_ddl());
        // The fresh session is dropped at the end of this scope.
    }

    let existing_path = dir.path().join("existing_state.json");
    let selected_schemas = ["public".to_string(), "analytics".to_string()];
    let mut existing_state = ImportState::new(
        "snapshot-1",
        "127.0.0.1:4000",
        "greptime",
        &selected_schemas,
        tasks.clone(),
    );
    existing_state.mark_ddl_completed();
    save_import_state(&existing_path, &existing_state)
        .await
        .unwrap();

    let resume_session = prepare_import_resume(config(existing_path, tasks))
        .await
        .unwrap();
    assert!(resume_session.should_skip_ddl());
}
/// A state file that lists the same task key twice is rejected with
/// `ImportStateMismatch`.
#[tokio::test]
async fn test_import_with_resume_rejects_duplicate_state_tasks() {
    let dir = tempfile::tempdir().unwrap();
    let state_path = dir.path().join("import_state.json");
    let selected_schemas = ["public".to_string(), "analytics".to_string()];
    let tasks = vec![ImportTaskKey::new(1, "public")];

    let mut corrupted = ImportState::new(
        "snapshot-1",
        "127.0.0.1:4000",
        "greptime",
        &selected_schemas,
        tasks.clone(),
    );
    // Duplicate the only task entry to corrupt the bookkeeping.
    let duplicate = corrupted.tasks[0].clone();
    corrupted.tasks.push(duplicate);
    save_import_state(&state_path, &corrupted).await.unwrap();

    let executor = recording_executor(Arc::new(Mutex::new(Vec::new())));
    let outcome = run_import_with_resume(config(state_path, tasks), &executor).await;

    let error = outcome.unwrap_err();
    assert!(matches!(
        error,
        crate::data::import_v2::error::Error::ImportStateMismatch { .. }
    ));
}
/// Running the data import on a session whose DDL was never marked
/// completed fails with `ImportStateDdlIncomplete` and imports nothing.
#[tokio::test]
async fn test_import_with_resume_rejects_data_import_when_ddl_incomplete() {
    let dir = tempfile::tempdir().unwrap();
    let state_path = dir.path().join("import_state.json");
    let resume_config = config(state_path, vec![ImportTaskKey::new(1, "public")]);

    // prepare creates fresh state with ddl_completed=false; calling
    // import_with_resume_session directly (without mark_ddl_completed)
    // must be rejected.
    let session = prepare_import_resume(resume_config).await.unwrap();
    let imported = Arc::new(Mutex::new(Vec::new()));
    let executor = recording_executor(Arc::clone(&imported));
    let outcome = import_with_resume_session(session, &executor).await;

    let error = outcome.unwrap_err();
    assert!(matches!(
        error,
        crate::data::import_v2::error::Error::ImportStateDdlIncomplete { .. }
    ));
    assert!(imported.lock().unwrap().is_empty());
}
/// Duplicate task keys in the request are rejected while preparing a fresh
/// session, and no state file is left behind.
#[tokio::test]
async fn test_prepare_import_resume_rejects_duplicate_request_tasks_on_fresh_state() {
    let dir = tempfile::tempdir().unwrap();
    let state_path = dir.path().join("import_state.json");
    let task = ImportTaskKey::new(1, "public");
    let duplicated_tasks = vec![task.clone(), task];

    // No state file yet - duplicate detection must run before the fresh
    // state is persisted, otherwise corrupted bookkeeping would be
    // written to disk and observed only on a later resume.
    let outcome = prepare_import_resume(config(state_path.clone(), duplicated_tasks)).await;
    let Err(error) = outcome else {
        panic!("duplicate request tasks should be rejected");
    };

    assert!(matches!(
        error,
        crate::data::import_v2::error::Error::ImportStateMismatch { .. }
    ));
    let persisted = load_import_state(&state_path).await.unwrap();
    assert!(persisted.is_none());
}
#[tokio::test]
async fn test_import_with_resume_does_not_retry_retryable_task_error() {
let dir = tempfile::tempdir().unwrap();
let path = dir.path().join("import_state.json");
let failed_task = ImportTaskKey::new(1, "public");
let tasks = vec![failed_task.clone()];
let imported = Arc::new(Mutex::new(Vec::new()));
let attempts = Arc::new(AtomicUsize::new(0));
let executor = RecordingExecutor {
imported: imported.clone(),
fail_task: Some(failed_task.clone()),
// If task import were retried, the second attempt would succeed.
// COPY DATABASE FROM failures are ambiguous, so retryable errors
// must still stop immediately to avoid duplicate rows.
failure_mode: Some(FailureMode::RetryableThenSuccess { failures: 1 }),
attempts: attempts.clone(),
};
let error = run_import_with_resume(config(path.clone(), tasks), &executor)
.await
.unwrap_err();
assert!(matches!(
error,
crate::data::import_v2::error::Error::TestTaskFailed {
retryable: true,
..
}
));
assert_eq!(attempts.load(Ordering::SeqCst), 1);
assert!(imported.lock().unwrap().is_empty());
let state = load_import_state(&path).await.unwrap().unwrap();
assert_eq!(
state.task_status(failed_task.chunk_id, &failed_task.schema),
Some(ImportTaskStatus::Failed)
);
}
}

View File

@@ -0,0 +1,222 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use snafu::{Location, Snafu};
use crate::data::export_v2::manifest::ChunkStatus;
// Errors produced by the import module. Every variant records the source
// location where it was created through snafu's implicit `location` field.
#[derive(Snafu)]
#[snafu(visibility(pub))]
#[stack_trace_debug]
pub enum Error {
    // --- Snapshot content problems ---
    #[snafu(display("Snapshot not found at '{}'", uri))]
    SnapshotNotFound {
        uri: String,
        #[snafu(implicit)]
        location: Location,
    },
    #[snafu(display("Manifest version mismatch: expected {}, found {}", expected, found))]
    ManifestVersionMismatch {
        expected: u32,
        found: u32,
        #[snafu(implicit)]
        location: Location,
    },
    #[snafu(display("Schema '{}' not found in snapshot", schema))]
    SchemaNotInSnapshot {
        schema: String,
        #[snafu(implicit)]
        location: Location,
    },
    #[snafu(display("Incomplete snapshot: chunk {} has status {:?}", chunk_id, status))]
    IncompleteSnapshot {
        chunk_id: u32,
        status: ChunkStatus,
        #[snafu(implicit)]
        location: Location,
    },
    #[snafu(display(
        "Snapshot is inconsistent: chunk {} is marked completed but its file manifest is empty",
        chunk_id
    ))]
    EmptyChunkManifest {
        chunk_id: u32,
        #[snafu(implicit)]
        location: Location,
    },
    #[snafu(display(
        "Snapshot is inconsistent: chunk {} for schema '{}' is marked completed but no files were found under '{}'",
        chunk_id,
        schema,
        path
    ))]
    MissingChunkData {
        chunk_id: u32,
        schema: String,
        path: String,
        #[snafu(implicit)]
        location: Location,
    },
    // --- Errors wrapping a failure from another module ---
    #[snafu(display("Chunk {} import failed for schema '{}'", chunk_id, schema))]
    ChunkImportFailed {
        chunk_id: u32,
        schema: String,
        #[snafu(source)]
        error: crate::data::export_v2::error::Error,
        #[snafu(implicit)]
        location: Location,
    },
    #[snafu(display("Snapshot storage error"))]
    SnapshotStorage {
        #[snafu(source)]
        error: crate::data::export_v2::error::Error,
        #[snafu(implicit)]
        location: Location,
    },
    #[snafu(display("Database error"))]
    Database {
        #[snafu(source)]
        error: crate::error::Error,
        #[snafu(implicit)]
        location: Location,
    },
    // --- Import state file handling ---
    #[snafu(display("Failed to parse import state file"))]
    ImportStateParse {
        #[snafu(source)]
        error: serde_json::Error,
        #[snafu(implicit)]
        location: Location,
    },
    #[snafu(display("Import state I/O failed at '{}': {}", path, error))]
    ImportStateIo {
        path: String,
        #[snafu(source)]
        error: std::io::Error,
        #[snafu(implicit)]
        location: Location,
    },
    #[snafu(display("Import state is already locked at '{}'", path))]
    ImportStateLocked {
        path: String,
        #[snafu(implicit)]
        location: Location,
    },
    #[snafu(display(
        "Failed to determine import state path for snapshot '{}'. Set HOME, USERPROFILE, or run from a valid current directory.",
        snapshot_id
    ))]
    ImportStatePathUnavailable {
        snapshot_id: String,
        #[snafu(implicit)]
        location: Location,
    },
    #[snafu(display(
        "Import state at '{}' does not match current import: {}. Either rerun with matching import arguments, or delete the state file to start over (DDL will be re-executed).",
        path,
        reason
    ))]
    ImportStateMismatch {
        path: String,
        reason: String,
        #[snafu(implicit)]
        location: Location,
    },
    // Only compiled for tests: a synthetic task failure whose `retryable`
    // flag selects the status code reported by `status_code`.
    #[cfg(test)]
    #[snafu(display("Test task failed: {}", message))]
    TestTaskFailed {
        message: String,
        retryable: bool,
        #[snafu(implicit)]
        location: Location,
    },
    #[snafu(display(
        "Import state references unknown task: chunk {}, schema '{}'",
        chunk_id,
        schema
    ))]
    ImportStateUnknownTask {
        chunk_id: u32,
        schema: String,
        #[snafu(implicit)]
        location: Location,
    },
    #[snafu(display(
        "Import state at '{}' is not ready for data import: DDL has not been marked completed",
        path
    ))]
    ImportStateDdlIncomplete {
        path: String,
        #[snafu(implicit)]
        location: Location,
    },
}
/// Result alias whose error type is this module's `Error`.
pub type Result<T> = std::result::Result<T, Error>;
impl ErrorExt for Error {
fn status_code(&self) -> StatusCode {
match self {
Error::SnapshotNotFound { .. }
| Error::SchemaNotInSnapshot { .. }
| Error::ManifestVersionMismatch { .. }
| Error::IncompleteSnapshot { .. }
| Error::EmptyChunkManifest { .. }
| Error::MissingChunkData { .. } => StatusCode::InvalidArguments,
Error::ImportStatePathUnavailable { .. }
| Error::ImportStateUnknownTask { .. }
| Error::ImportStateDdlIncomplete { .. } => StatusCode::Unexpected,
Error::ImportStateMismatch { .. } => StatusCode::InvalidArguments,
#[cfg(test)]
Error::TestTaskFailed { retryable, .. } => {
if *retryable {
StatusCode::StorageUnavailable
} else {
StatusCode::InvalidArguments
}
}
Error::Database { error, .. } => error.status_code(),
Error::SnapshotStorage { error, .. } | Error::ChunkImportFailed { error, .. } => {
error.status_code()
}
Error::ImportStateParse { .. } => StatusCode::Internal,
Error::ImportStateIo { .. } => StatusCode::StorageUnavailable,
Error::ImportStateLocked { .. } => StatusCode::IllegalState,
}
}
fn as_any(&self) -> &dyn Any {
self
}
}

View File

@@ -0,0 +1,122 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! DDL execution for import.
use common_telemetry::info;
use snafu::ResultExt;
use crate::data::import_v2::error::{DatabaseSnafu, Result};
use crate::database::DatabaseClient;
/// A DDL statement with an explicit execution schema context.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DdlStatement {
    /// SQL text of the statement.
    pub sql: String,
    /// Schema the statement is executed in; `None` means the executor runs
    /// it through `DatabaseClient::sql_in_public`.
    pub execution_schema: Option<String>,
}
impl DdlStatement {
pub fn new(sql: String) -> Self {
Self {
sql,
execution_schema: None,
}
}
pub fn with_execution_schema(sql: String, schema: String) -> Self {
Self {
sql,
execution_schema: Some(schema),
}
}
}
/// Executes DDL statements against the database.
pub struct DdlExecutor<'a> {
    /// Borrowed client used to send every statement.
    client: &'a DatabaseClient,
}
impl<'a> DdlExecutor<'a> {
/// Creates a new DDL executor.
pub fn new(client: &'a DatabaseClient) -> Self {
Self { client }
}
/// Executes a list of DDL statements, stopping on first error.
pub async fn execute_strict(&self, statements: &[DdlStatement]) -> Result<()> {
let total = statements.len();
for (i, stmt) in statements.iter().enumerate() {
let preview = preview_sql(&stmt.sql);
info!("Executing DDL ({}/{}): {}", i + 1, total, preview);
if let Some(schema) = stmt.execution_schema.as_deref() {
self.client
.sql(&stmt.sql, schema)
.await
.context(DatabaseSnafu)?;
} else {
self.client
.sql_in_public(&stmt.sql)
.await
.context(DatabaseSnafu)?;
}
}
Ok(())
}
}
/// Returns `sql` unchanged when it has at most 80 characters; otherwise
/// returns its first 80 characters followed by `...`.
///
/// The cut is made on a character boundary, so multi-byte text is never
/// split in the middle of a code point.
fn preview_sql(sql: &str) -> String {
    // The 81st character (index 80), if present, starts at the cut offset.
    match sql.char_indices().nth(80) {
        Some((cut, _)) => {
            let preview = &sql[..cut];
            format!("{preview}...")
        }
        None => sql.to_owned(),
    }
}
#[cfg(test)]
mod tests {
use super::*;
/// `DdlStatement::new` leaves the execution schema unset.
#[test]
fn test_statement_without_execution_schema_uses_public() {
    let sql = "CREATE DATABASE IF NOT EXISTS test_db".to_string();
    let statement = DdlStatement::new(sql);
    assert_eq!(statement.execution_schema, None);
}
/// `with_execution_schema` stores the schema name verbatim, including an
/// embedded double quote.
#[test]
fn test_statement_with_execution_schema_preserves_context() {
    let sql = r#"CREATE TABLE IF NOT EXISTS "my""schema"."metrics" (ts TIMESTAMP TIME INDEX)"#;
    let schema = r#"my"schema"#;

    let statement = DdlStatement::with_execution_schema(sql.to_string(), schema.to_string());

    assert_eq!(statement.execution_schema.as_deref(), Some(schema));
}
/// A statement longer than 80 characters that contains multi-byte text is
/// truncated on a character boundary and suffixed with `...`.
#[test]
fn test_preview_sql_truncates_at_char_boundary() {
    // The table name must be non-empty and multi-byte: 100 two-byte
    // characters push the statement past the 80-character limit and put
    // byte offset 80 in the middle of a character, so a byte-based cut
    // would panic. The character is written as an escape so it cannot be
    // lost to an encoding problem.
    let sql = format!(
        "CREATE TABLE {} (ts TIMESTAMP TIME INDEX)",
        "\u{00e9}".repeat(100)
    );
    assert!(!sql.is_char_boundary(80));
    let preview = preview_sql(&sql);
    assert!(preview.ends_with("..."));
    // 80 preview characters plus the three-character ellipsis.
    assert_eq!(preview.chars().count(), 83);
    assert!(preview.is_char_boundary(preview.len()));
}
}

View File

@@ -0,0 +1,804 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicU64, Ordering};
use chrono::{DateTime, Utc};
use fs2::FileExt;
use serde::{Deserialize, Serialize};
use snafu::{IntoError, OptionExt, ResultExt};
use tokio::io::AsyncWriteExt;
use crate::data::import_v2::error::{
ImportStateIoSnafu, ImportStateLockedSnafu, ImportStateParseSnafu, ImportStateUnknownTaskSnafu,
Result,
};
use crate::data::path::encode_path_segment;
/// Directory created under the home directory for default state files.
const IMPORT_STATE_ROOT: &str = ".greptime";
/// Sub-directory of `IMPORT_STATE_ROOT` that holds import state files.
const IMPORT_STATE_DIR: &str = "import_state";
/// Process-wide sequence number used to build unique temporary file names.
static IMPORT_STATE_TMP_ID: AtomicU64 = AtomicU64::new(0);
/// Progress of a single import task; serialized in snake_case.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub(crate) enum ImportTaskStatus {
    /// Initial status of every task in a new `ImportState`.
    Pending,
    /// Reset to `Pending` when a state file is loaded for resume.
    InProgress,
    Completed,
    Failed,
}
/// Identifies one import task by chunk id and schema name.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub(crate) struct ImportTaskKey {
    pub(crate) chunk_id: u32,
    pub(crate) schema: String,
}
impl ImportTaskKey {
pub(crate) fn new(chunk_id: u32, schema: impl Into<String>) -> Self {
Self {
chunk_id,
schema: schema.into(),
}
}
}
/// Persisted record of one task: its key, its status, and an optional error
/// message.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub(crate) struct ImportTaskState {
    pub(crate) chunk_id: u32,
    pub(crate) schema: String,
    pub(crate) status: ImportTaskStatus,
    // Left out of the serialized form when there is no error.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub(crate) error: Option<String>,
}
/// Persisted bookkeeping for one import: the identity of the import
/// (snapshot, target, catalog, schemas), whether DDL has completed, and the
/// status of every task.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub(crate) struct ImportState {
    pub(crate) snapshot_id: String,
    pub(crate) target_addr: String,
    pub(crate) catalog: String,
    // Stored in canonical form (see `canonical_schema_selection`).
    pub(crate) schemas: Vec<String>,
    // Falls back to `false` when the field is missing from the file.
    #[serde(default)]
    pub(crate) ddl_completed: bool,
    // Refreshed whenever DDL completion or a task status is recorded.
    pub(crate) updated_at: DateTime<Utc>,
    // Tasks are (chunk-schema) tuples and can reach the tens of thousands;
    // linear scans here are accepted because per-task work is dominated by
    // network I/O and an fsync, but if the bound grows further this should be
    // backed by a HashMap<(chunk_id, schema), index> rebuilt after load.
    pub(crate) tasks: Vec<ImportTaskState>,
}
impl ImportState {
pub(crate) fn new<I>(
snapshot_id: impl Into<String>,
target_addr: impl Into<String>,
catalog: impl Into<String>,
schemas: &[String],
tasks: I,
) -> Self
where
I: IntoIterator<Item = ImportTaskKey>,
{
Self {
snapshot_id: snapshot_id.into(),
target_addr: target_addr.into(),
catalog: catalog.into(),
schemas: canonical_schema_selection(schemas),
ddl_completed: false,
updated_at: Utc::now(),
tasks: tasks
.into_iter()
.map(|task| ImportTaskState {
chunk_id: task.chunk_id,
schema: task.schema,
status: ImportTaskStatus::Pending,
error: None,
})
.collect(),
}
}
pub(crate) fn mark_ddl_completed(&mut self) {
self.ddl_completed = true;
self.updated_at = Utc::now();
}
pub(crate) fn task_status(&self, chunk_id: u32, schema: &str) -> Option<ImportTaskStatus> {
self.tasks
.iter()
.find(|task| task.chunk_id == chunk_id && task.schema == schema)
.map(|task| task.status)
}
pub(crate) fn set_task_status(
&mut self,
chunk_id: u32,
schema: &str,
status: ImportTaskStatus,
error: Option<String>,
) -> Result<()> {
let task = self
.tasks
.iter_mut()
.find(|task| task.chunk_id == chunk_id && task.schema == schema)
.context(ImportStateUnknownTaskSnafu {
chunk_id,
schema: schema.to_string(),
})?;
task.status = status;
task.error = error;
self.updated_at = Utc::now();
Ok(())
}
}
/// Holds the open lock file for an import state; dropping the guard
/// releases the lock.
#[derive(Debug)]
pub(crate) struct ImportStateLockGuard {
    file: std::fs::File,
}
impl Drop for ImportStateLockGuard {
    /// Releases the file lock; an unlock failure is deliberately ignored
    /// because `drop` cannot report it.
    fn drop(&mut self) {
        let _ = self.file.unlock();
    }
}
/// Resolves the default state file path from the process environment.
///
/// The home directory is looked up through environment variables and the
/// current directory serves as fallback; `None` is returned when neither is
/// available.
pub(crate) fn default_state_path(
    snapshot_id: &str,
    target_addr: &str,
    catalog: &str,
    schemas: &[String],
) -> Option<PathBuf> {
    let home_dir = default_home_dir_with(|name| std::env::var_os(name));
    let current_dir = match std::env::current_dir() {
        Ok(dir) => Some(dir),
        Err(_) => None,
    };
    default_state_path_with(
        home_dir.as_deref(),
        current_dir.as_deref(),
        snapshot_id,
        target_addr,
        catalog,
        schemas,
    )
}
/// Determines the home directory using `get` as the environment lookup.
///
/// Lookup order: `HOME`, then `USERPROFILE`, then `HOMEDRIVE` joined with
/// `HOMEPATH` (both must be set). `HOMEPATH` is only queried when
/// `HOMEDRIVE` is present.
fn default_home_dir_with<F>(get: F) -> Option<PathBuf>
where
    F: Fn(&str) -> Option<std::ffi::OsString>,
{
    if let Some(home) = get("HOME") {
        return Some(PathBuf::from(home));
    }
    if let Some(profile) = get("USERPROFILE") {
        return Some(PathBuf::from(profile));
    }
    let drive = get("HOMEDRIVE")?;
    let relative = get("HOMEPATH")?;
    Some(PathBuf::from(drive).join(relative))
}
/// Builds the default state file path from explicit directories.
///
/// With a home directory the file lives under
/// `<home>/.greptime/import_state/`; without one it is placed directly in
/// `cwd`; with neither, `None` is returned.
fn default_state_path_with(
    home: Option<&Path>,
    cwd: Option<&Path>,
    snapshot_id: &str,
    target_addr: &str,
    catalog: &str,
    schemas: &[String],
) -> Option<PathBuf> {
    let file_name = import_state_file_name(snapshot_id, target_addr, catalog, schemas);
    if let Some(home_dir) = home {
        let state_dir = home_dir.join(IMPORT_STATE_ROOT).join(IMPORT_STATE_DIR);
        return Some(state_dir.join(file_name));
    }
    cwd.map(|current_dir| current_dir.join(file_name))
}
/// Builds the state file name from the encoded snapshot id, the encoded
/// target address, and a hash of the catalog and schema selection.
fn import_state_file_name(
    snapshot_id: &str,
    target_addr: &str,
    catalog: &str,
    schemas: &[String],
) -> String {
    let snapshot_segment = encode_path_segment(snapshot_id);
    let target_segment = encode_path_segment(target_addr);
    let identity = import_identity_hash(catalog, schemas);
    format!(
        ".import_state_{}_{}_{}.json",
        snapshot_segment, target_segment, identity
    )
}
/// Returns the schema names ASCII-lowercased, sorted, and with duplicates
/// removed.
pub(crate) fn canonical_schema_selection(schemas: &[String]) -> Vec<String> {
    let mut lowered: Vec<String> = Vec::with_capacity(schemas.len());
    for schema in schemas {
        lowered.push(schema.to_ascii_lowercase());
    }
    // Sorting first makes equal names adjacent so `dedup` removes them all.
    lowered.sort_unstable();
    lowered.dedup();
    lowered
}
/// FNV-1a over `(catalog, schemas)`, rendered as 16 lowercase hex digits.
///
/// The output is part of the persisted state filename, so
/// `std::collections::hash_map::DefaultHasher` cannot be used - Rust does not
/// guarantee its algorithm across releases, which would make a state file
/// written by one toolchain undiscoverable by another.
fn import_identity_hash(catalog: &str, schemas: &[String]) -> String {
    const FNV_OFFSET: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;
    // 0xff cannot appear in valid UTF-8, so it works as an unambiguous
    // field separator between adjacent identifiers.
    const FIELD_SEPARATOR: u8 = 0xff;

    /// One FNV-1a step: xor the byte in, then multiply by the prime.
    fn absorb(state: u64, byte: u8) -> u64 {
        (state ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    }

    let canonical = canonical_schema_selection(schemas);
    let fields = std::iter::once(catalog).chain(canonical.iter().map(String::as_str));

    let mut hash = FNV_OFFSET;
    for field in fields {
        hash = field.bytes().fold(hash, absorb);
        hash = absorb(hash, FIELD_SEPARATOR);
    }
    format!("{hash:016x}")
}
pub(crate) async fn load_import_state(path: &Path) -> Result<Option<ImportState>> {
match tokio::fs::read(path).await {
Ok(bytes) => {
let mut state: ImportState =
serde_json::from_slice(&bytes).context(ImportStateParseSnafu)?;
normalize_import_state_for_resume(&mut state);
Ok(Some(state))
}
Err(error) if error.kind() == std::io::ErrorKind::NotFound => Ok(None),
Err(source) => Err(source).context(ImportStateIoSnafu {
path: path.display().to_string(),
}),
}
}
/// Writes `state` to `path` by creating a temporary sibling file and
/// renaming it over the destination.
///
/// Caller must hold the lock acquired via `try_acquire_import_state_lock`.
///
/// Any I/O failure is reported as `ImportStateIo` carrying the path of the
/// file or directory that was being operated on.
pub(crate) async fn save_import_state(path: &Path, state: &ImportState) -> Result<()> {
    // Make sure the destination directory exists before creating files in it.
    if let Some(parent) = path.parent() {
        tokio::fs::create_dir_all(parent)
            .await
            .context(ImportStateIoSnafu {
                path: parent.display().to_string(),
            })?;
    }
    let bytes =
        serde_json::to_vec_pretty(state).expect("ImportState should always be serializable");
    // The temporary file shares the destination's directory.
    let tmp_path = unique_tmp_path(path);
    let mut file = tokio::fs::File::create(&tmp_path)
        .await
        .context(ImportStateIoSnafu {
            path: tmp_path.display().to_string(),
        })?;
    file.write_all(&bytes).await.context(ImportStateIoSnafu {
        path: tmp_path.display().to_string(),
    })?;
    // Flush the file contents to disk before the rename publishes them.
    file.sync_all().await.context(ImportStateIoSnafu {
        path: tmp_path.display().to_string(),
    })?;
    // Close before rename; Windows forbids renaming an open file.
    drop(file);
    tokio::fs::rename(&tmp_path, path)
        .await
        .context(ImportStateIoSnafu {
            path: path.display().to_string(),
        })?;
    // Sync the containing directory after the rename (no-op off unix).
    sync_parent_dir(path).await?;
    Ok(())
}
pub(crate) fn try_acquire_import_state_lock(path: &Path) -> Result<ImportStateLockGuard> {
if let Some(parent) = path.parent() {
std::fs::create_dir_all(parent).context(ImportStateIoSnafu {
path: parent.display().to_string(),
})?;
}
let lock_path = import_state_lock_path(path);
let file = std::fs::OpenOptions::new()
.create(true)
.read(true)
.write(true)
.truncate(false)
.open(&lock_path)
.context(ImportStateIoSnafu {
path: lock_path.display().to_string(),
})?;
file.try_lock_exclusive().map_err(|error| {
if is_lock_contention(&error) {
ImportStateLockedSnafu {
path: lock_path.display().to_string(),
}
.build()
} else {
ImportStateIoSnafu {
path: lock_path.display().to_string(),
}
.into_error(error)
}
})?;
Ok(ImportStateLockGuard { file })
}
/// Reports whether `error` means the lock is held elsewhere: either its kind
/// is `WouldBlock` or its raw OS code equals that of
/// `fs2::lock_contended_error()`.
fn is_lock_contention(error: &std::io::Error) -> bool {
    if error.kind() == std::io::ErrorKind::WouldBlock {
        return true;
    }
    let contended = fs2::lock_contended_error();
    error.raw_os_error() == contended.raw_os_error()
}
/// Returns a sibling of `path` named `<file>.<pid>.<seq>.tmp`, where `seq`
/// comes from a process-wide counter so successive calls differ.
fn unique_tmp_path(path: &Path) -> PathBuf {
    let seq = IMPORT_STATE_TMP_ID.fetch_add(1, Ordering::Relaxed);
    let pid = std::process::id();
    let file_name = path
        .file_name()
        .map(|name| name.to_string_lossy())
        .unwrap_or_default();
    let tmp_name = format!("{file_name}.{pid}.{seq}.tmp");
    path.with_file_name(tmp_name)
}
/// Returns the sibling of `path` whose file name has `.lock` appended.
fn import_state_lock_path(path: &Path) -> PathBuf {
    let file_name = path
        .file_name()
        .map(|name| name.to_string_lossy())
        .unwrap_or_default();
    let lock_name = format!("{file_name}.lock");
    path.with_file_name(lock_name)
}
/// Resets every `InProgress` task to `Pending` and clears its error; tasks
/// in any other status are left untouched.
fn normalize_import_state_for_resume(state: &mut ImportState) {
    state
        .tasks
        .iter_mut()
        .filter(|task| task.status == ImportTaskStatus::InProgress)
        .for_each(|interrupted| {
            interrupted.status = ImportTaskStatus::Pending;
            interrupted.error = None;
        });
}
pub(crate) async fn delete_import_state(path: &Path) -> Result<()> {
match tokio::fs::remove_file(path).await {
Ok(()) => {
sync_parent_dir(path).await?;
Ok(())
}
Err(error) if error.kind() == std::io::ErrorKind::NotFound => Ok(()),
Err(source) => Err(source).context(ImportStateIoSnafu {
path: path.display().to_string(),
}),
}
}
/// Opens the directory containing `path` and syncs it to disk.
///
/// Returns `Ok(())` without doing anything when `path` has no parent.
/// Failures to open or sync the directory are reported as `ImportStateIo`
/// carrying the directory path.
#[cfg(unix)]
async fn sync_parent_dir(path: &Path) -> Result<()> {
    let Some(parent) = path.parent() else {
        return Ok(());
    };
    // `Path::parent` yields an empty path for a bare relative file name such
    // as `state.json`. Opening "" fails with NotFound, so the current
    // directory is used explicitly in that case.
    let parent = if parent.as_os_str().is_empty() {
        Path::new(".")
    } else {
        parent
    };
    let dir = tokio::fs::File::open(parent)
        .await
        .context(ImportStateIoSnafu {
            path: parent.display().to_string(),
        })?;
    dir.sync_all().await.context(ImportStateIoSnafu {
        path: parent.display().to_string(),
    })?;
    Ok(())
}
/// Non-unix variant of `sync_parent_dir`: performs no work and always
/// succeeds.
#[cfg(not(unix))]
async fn sync_parent_dir(_path: &Path) -> Result<()> {
    Ok(())
}
#[cfg(test)]
mod tests {
use std::process::Command;
use chrono::Utc;
use tempfile::tempdir;
use super::*;
const CHILD_LOCK_PATH_ENV: &str = "GREPTIME_IMPORT_STATE_LOCK_PATH";
const CHILD_LOCK_TEST: &str =
"data::import_v2::state::tests::test_try_acquire_import_state_lock_child_process";
/// Schema selection shared by the tests, deliberately not in sorted order.
fn schemas() -> Vec<String> {
    ["public", "analytics"]
        .iter()
        .map(|name| name.to_string())
        .collect()
}
/// Task keys shared by the tests: one task per schema.
fn tasks() -> Vec<ImportTaskKey> {
    [(1, "public"), (2, "analytics")]
        .iter()
        .map(|&(chunk_id, schema)| ImportTaskKey::new(chunk_id, schema))
        .collect()
}
/// A new state copies the identity fields, canonicalizes the schemas, and
/// starts every task as `Pending`.
#[test]
fn test_import_state_new_initializes_pending_tasks() {
    let selected_schemas = schemas();
    let state = ImportState::new(
        "snapshot-1",
        "127.0.0.1:4000",
        "greptime",
        &selected_schemas,
        tasks(),
    );

    assert_eq!(state.snapshot_id, "snapshot-1");
    assert_eq!(state.target_addr, "127.0.0.1:4000");
    assert_eq!(state.catalog, "greptime");
    assert_eq!(state.schemas, vec!["analytics", "public"]);
    assert_eq!(state.tasks.len(), 2);
    for task in &state.tasks {
        assert_eq!(task.status, ImportTaskStatus::Pending);
    }
}
/// `set_task_status` stores the new status and error and moves
/// `updated_at` forward.
#[test]
fn test_set_task_status_updates_timestamp_and_error() {
    let mut state = ImportState::new(
        "snapshot-1",
        "127.0.0.1:4000",
        "greptime",
        &schemas(),
        tasks(),
    );
    // Rewind the timestamp first and capture the rewound value. Capturing
    // `before` ahead of the rewind would compare against the creation time,
    // which can equal the refreshed timestamp on a coarse clock and make the
    // strict `>` below flaky.
    state.updated_at = Utc::now() - chrono::Duration::seconds(10);
    let before = state.updated_at;
    state
        .set_task_status(
            1,
            "public",
            ImportTaskStatus::Failed,
            Some("timeout".to_string()),
        )
        .unwrap();
    assert_eq!(
        state.task_status(1, "public"),
        Some(ImportTaskStatus::Failed)
    );
    assert_eq!(state.tasks[0].error.as_deref(), Some("timeout"));
    assert!(state.updated_at > before);
}
/// Updating a task that is not part of the state fails with
/// `ImportStateUnknownTask` carrying the requested key.
#[test]
fn test_set_task_status_rejects_unknown_task() {
    let selected_schemas = schemas();
    let mut state = ImportState::new(
        "snapshot-1",
        "127.0.0.1:4000",
        "greptime",
        &selected_schemas,
        tasks(),
    );

    let outcome = state.set_task_status(99, "public", ImportTaskStatus::Completed, None);

    let error = outcome.unwrap_err();
    assert!(matches!(
        error,
        crate::data::import_v2::error::Error::ImportStateUnknownTask { chunk_id, schema, .. }
            if chunk_id == 99 && schema == "public"
    ));
}
/// A saved state is loaded back with the same identity fields and tasks.
#[tokio::test]
async fn test_save_and_load_import_state_round_trip() {
    let dir = tempdir().unwrap();
    let state_path = dir.path().join("import_state.json");
    let selected_schemas = schemas();
    let mut original = ImportState::new(
        "snapshot-1",
        "127.0.0.1:4000",
        "greptime",
        &selected_schemas,
        tasks(),
    );
    original
        .set_task_status(2, "analytics", ImportTaskStatus::Completed, None)
        .unwrap();

    save_import_state(&state_path, &original).await.unwrap();
    let reloaded = load_import_state(&state_path).await.unwrap().unwrap();

    assert_eq!(reloaded.snapshot_id, original.snapshot_id);
    assert_eq!(reloaded.target_addr, original.target_addr);
    assert_eq!(reloaded.catalog, original.catalog);
    assert_eq!(reloaded.schemas, original.schemas);
    assert_eq!(reloaded.tasks, original.tasks);
}
/// Saving twice to the same path replaces the earlier contents.
#[tokio::test]
async fn test_save_import_state_overwrites_existing_file() {
    let dir = tempdir().unwrap();
    let state_path = dir.path().join("import_state.json");
    let selected_schemas = schemas();
    let mut state = ImportState::new(
        "snapshot-1",
        "127.0.0.1:4000",
        "greptime",
        &selected_schemas,
        tasks(),
    );

    // First write: every task is still pending.
    save_import_state(&state_path, &state).await.unwrap();
    // Second write: one task completed.
    state
        .set_task_status(1, "public", ImportTaskStatus::Completed, None)
        .unwrap();
    save_import_state(&state_path, &state).await.unwrap();

    let reloaded = load_import_state(&state_path).await.unwrap().unwrap();
    let status = reloaded.task_status(1, "public");
    assert_eq!(status, Some(ImportTaskStatus::Completed));
}
/// Normalizing for resume turns an `InProgress` task back into `Pending`
/// and drops its error, leaving other tasks as they were.
#[test]
fn test_load_import_state_resets_in_progress_to_pending() {
    let selected_schemas = schemas();
    let mut state = ImportState::new(
        "snapshot-1",
        "127.0.0.1:4000",
        "greptime",
        &selected_schemas,
        tasks(),
    );
    let stale_error = Some("running".to_string());
    state
        .set_task_status(2, "analytics", ImportTaskStatus::InProgress, stale_error)
        .unwrap();

    normalize_import_state_for_resume(&mut state);

    let untouched = state.task_status(1, "public");
    assert_eq!(untouched, Some(ImportTaskStatus::Pending));
    let reset = state.task_status(2, "analytics");
    assert_eq!(reset, Some(ImportTaskStatus::Pending));
    assert_eq!(state.tasks[1].error, None);
}
/// Two consecutive `unique_tmp_path` calls for the same target must yield
/// different paths, both inside the target's directory and ending in `.tmp`.
#[test]
fn test_unique_tmp_path_generates_distinct_paths() {
    let workdir = tempdir().unwrap();
    let target = workdir.path().join("import_state.json");

    let first = unique_tmp_path(&target);
    let second = unique_tmp_path(&target);
    assert_ne!(first, second);

    for candidate in [&first, &second] {
        assert!(candidate.starts_with(workdir.path()));
    }
    for candidate in [&first, &second] {
        let file_name = candidate.file_name().unwrap().to_string_lossy();
        assert!(file_name.ends_with(".tmp"));
    }
}
/// The error that `fs2` reports for a contended lock on this platform must be
/// classified as lock contention by `is_lock_contention`.
#[test]
fn test_lock_contention_detection_accepts_platform_error() {
    let contended = fs2::lock_contended_error();
    let detected = is_lock_contention(&contended);
    assert!(detected);
}
/// Holds the import-state lock in this process and re-runs the test binary to
/// execute the ignored child test, which must fail to take the same lock.
#[test]
fn test_try_acquire_import_state_lock_rejects_second_holder() {
    let workdir = tempdir().unwrap();
    let state_file = workdir.path().join("import_state.json");

    // The guard must stay alive until the child process has finished.
    let _held_lock = try_acquire_import_state_lock(&state_file).unwrap();

    // Import state locking guards concurrent CLI processes, so validate cross-process exclusion.
    let mut child = Command::new(std::env::current_exe().unwrap());
    child
        .arg(CHILD_LOCK_TEST)
        .arg("--ignored")
        .arg("--exact")
        .env(CHILD_LOCK_PATH_ENV, &state_file);
    let output = child.output().unwrap();

    assert!(
        output.status.success(),
        "child lock test failed\nstdout:\n{}\nstderr:\n{}",
        String::from_utf8_lossy(&output.stdout),
        String::from_utf8_lossy(&output.stderr)
    );

    // A successful exit alone is not enough: make sure exactly one test ran,
    // otherwise a name mismatch would pass silently with zero tests executed.
    let stdout = String::from_utf8_lossy(&output.stdout);
    assert!(
        stdout.contains("1 passed"),
        "child lock test did not run the expected ignored test\nstdout:\n{stdout}"
    );
}
/// Child half of the cross-process lock test: reads the lock path from the
/// environment and expects the acquisition to fail with `ImportStateLocked`.
#[test]
#[ignore = "spawned by test_try_acquire_import_state_lock_rejects_second_holder"]
fn test_try_acquire_import_state_lock_child_process() {
    let raw_path = std::env::var_os(CHILD_LOCK_PATH_ENV)
        .expect("child lock path must be set by the parent test");
    let state_file = PathBuf::from(raw_path);

    let attempt = try_acquire_import_state_lock(&state_file);
    let error = attempt.unwrap_err();
    assert!(matches!(
        error,
        crate::data::import_v2::error::Error::ImportStateLocked { .. }
    ));
}
/// Deleting a state file that was never created must succeed.
#[tokio::test]
async fn test_delete_import_state_ignores_missing_file() {
    let workdir = tempdir().unwrap();
    let absent_file = workdir.path().join("missing.json");

    let outcome = delete_import_state(&absent_file).await;
    outcome.unwrap();
}
/// With both a home and a working directory available, the state file must be
/// placed under the home directory, and the snapshot id and target address
/// must appear percent-encoded in the file name.
#[test]
fn test_default_state_path_prefers_home_and_encodes_snapshot_id() {
    let home_dir = tempdir().unwrap();
    let working_dir = tempdir().unwrap();

    let state_path = default_state_path_with(
        Some(home_dir.path()),
        Some(working_dir.path()),
        "../snapshot",
        "127.0.0.1:4000",
        "greptime",
        &schemas(),
    )
    .unwrap();

    let expected_parent = home_dir
        .path()
        .join(IMPORT_STATE_ROOT)
        .join(IMPORT_STATE_DIR);
    assert_eq!(state_path.parent().unwrap(), expected_parent);

    let file_name = state_path.file_name().unwrap().to_string_lossy();
    assert!(file_name.starts_with(".import_state_%2E%2E%2Fsnapshot_127%2E0%2E0%2E1%3A4000_"));
    assert!(file_name.ends_with(".json"));
}
/// Without a home directory the state file must be placed directly in the
/// working directory.
#[test]
fn test_default_state_path_falls_back_to_cwd_when_home_missing() {
    let working_dir = tempdir().unwrap();

    let state_path = default_state_path_with(
        None,
        Some(working_dir.path()),
        "snapshot-1",
        "target-a",
        "greptime",
        &schemas(),
    )
    .unwrap();

    assert_eq!(state_path.parent().unwrap(), working_dir.path());

    let file_name = state_path.file_name().unwrap().to_string_lossy();
    assert!(file_name.starts_with(".import_state_snapshot-1_target-a_"));
    assert!(file_name.ends_with(".json"));
}
/// Two imports of the same snapshot into different target addresses must not
/// share a state file.
#[test]
fn test_default_state_path_isolated_by_target_addr() {
    let working_dir = tempdir().unwrap();

    // Everything except the target address is held constant.
    let state_path_for = |target_addr: &str| {
        default_state_path_with(
            None,
            Some(working_dir.path()),
            "snapshot-1",
            target_addr,
            "greptime",
            &schemas(),
        )
        .unwrap()
    };

    let first = state_path_for("127.0.0.1:4000");
    let second = state_path_for("127.0.0.1:4001");
    assert_ne!(first, second);
}
/// Changing only the catalog, or only the schema list, must produce a state
/// path different from the baseline.
#[test]
fn test_default_state_path_isolated_by_catalog_and_schemas() {
    let working_dir = tempdir().unwrap();
    let public_only = vec!["public".to_string()];
    let analytics_only = vec!["analytics".to_string()];

    // Snapshot id and target address are held constant across all three paths.
    let state_path_for = |catalog: &str, schema_list: &Vec<String>| {
        default_state_path_with(
            None,
            Some(working_dir.path()),
            "snapshot-1",
            "127.0.0.1:4000",
            catalog,
            schema_list,
        )
        .unwrap()
    };

    let baseline = state_path_for("greptime", &public_only);
    let other_catalog = state_path_for("other", &public_only);
    let other_schemas = state_path_for("greptime", &analytics_only);

    assert_ne!(baseline, other_catalog);
    assert_ne!(baseline, other_schemas);
}
/// When both `HOME` and `USERPROFILE` are set, `HOME` must win.
#[test]
fn test_default_home_dir_prefers_home() {
    let lookup = |key| {
        let value = match key {
            "HOME" => Some("/tmp/home"),
            "USERPROFILE" => Some("/tmp/userprofile"),
            _ => None,
        };
        value.map(std::ffi::OsString::from)
    };

    let detected = default_home_dir_with(lookup);
    assert_eq!(detected, Some(PathBuf::from("/tmp/home")));
}
/// With `HOME` unset, `USERPROFILE` must be used as the home directory.
#[test]
fn test_default_home_dir_falls_back_to_userprofile() {
    let lookup = |key| {
        let value = match key {
            "USERPROFILE" => Some("/tmp/userprofile"),
            _ => None,
        };
        value.map(std::ffi::OsString::from)
    };

    let detected = default_home_dir_with(lookup);
    assert_eq!(detected, Some(PathBuf::from("/tmp/userprofile")));
}
/// With neither `HOME` nor `USERPROFILE` set, the home directory must be
/// `HOMEDRIVE` joined with `HOMEPATH`.
#[test]
fn test_default_home_dir_falls_back_to_home_drive_and_path() {
    let lookup = |key| {
        let value = match key {
            "HOMEDRIVE" => Some("/tmp"),
            "HOMEPATH" => Some("windows-home"),
            _ => None,
        };
        value.map(std::ffi::OsString::from)
    };

    let detected = default_home_dir_with(lookup);
    let expected = PathBuf::from("/tmp").join("windows-home");
    assert_eq!(detected, Some(expected));
}
}

89
src/cli/src/data/path.rs Normal file
View File

@@ -0,0 +1,89 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Shared path helpers for export/import data files.
use crate::data::export_v2::schema::{DDL_DIR, SCHEMA_DIR};
/// Returns the snapshot-relative path of the DDL file for `schema`:
/// `<SCHEMA_DIR>/<DDL_DIR>/<encoded schema>.sql`.
///
/// The schema name is passed through [`encode_path_segment`] so it always
/// forms a single path segment.
pub(crate) fn ddl_path_for_schema(schema: &str) -> String {
    let file_stem = encode_path_segment(schema);
    format!("{SCHEMA_DIR}/{DDL_DIR}/{file_stem}.sql")
}
/// Returns the snapshot-relative directory holding the data of one chunk of
/// `schema`: `data/<encoded schema>/<chunk_id>/` (with a trailing slash).
pub(crate) fn data_dir_for_schema_chunk(schema: &str, chunk_id: u32) -> String {
    let schema_segment = encode_path_segment(schema);
    format!("data/{schema_segment}/{chunk_id}/")
}
/// Percent-encodes `value` so it can be used as a single path segment.
///
/// ASCII letters, ASCII digits, `-` and `_` are copied unchanged. Every other
/// byte of the UTF-8 encoding (including `.`, `/` and `\`) is written as `%`
/// followed by two uppercase hexadecimal digits.
pub(crate) fn encode_path_segment(value: &str) -> String {
    value
        .bytes()
        .fold(String::with_capacity(value.len()), |mut encoded, byte| {
            let is_safe = byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'_');
            if is_safe {
                encoded.push(char::from(byte));
            } else {
                // High nibble first, then low nibble.
                encoded.push('%');
                encoded.push(hex_char(byte >> 4));
                encoded.push(hex_char(byte & 0x0F));
            }
            encoded
        })
}
/// Maps a 4-bit value (`0..=15`) to its uppercase hexadecimal digit.
///
/// Panics if `nibble` is greater than 15.
fn hex_char(nibble: u8) -> char {
    const UPPER_HEX_DIGITS: &[u8; 16] = b"0123456789ABCDEF";

    match UPPER_HEX_DIGITS.get(usize::from(nibble)) {
        Some(&digit) => char::from(digit),
        None => unreachable!("nibble must be in 0..=15"),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Names made only of safe characters are returned unchanged.
    #[test]
    fn test_encode_path_segment_preserves_safe_ascii() {
        let encoded = encode_path_segment("test_db");
        assert_eq!(encoded, "test_db");
    }

    /// Dots and both kinds of path separators must be percent-encoded.
    #[test]
    fn test_encode_path_segment_escapes_path_traversal_chars() {
        let cases = [
            ("../evil", "%2E%2E%2Fevil"),
            (r"..\\evil", "%2E%2E%5C%5Cevil"),
        ];
        for (input, expected) in cases {
            assert_eq!(encode_path_segment(input), expected);
        }
    }

    /// The schema part of a DDL path is encoded; the fixed prefix and suffix are not.
    #[test]
    fn test_ddl_path_for_schema_encodes_schema_segment() {
        let cases = [
            ("public", "schema/ddl/public.sql"),
            ("../evil", "schema/ddl/%2E%2E%2Fevil.sql"),
        ];
        for (schema, expected) in cases {
            assert_eq!(ddl_path_for_schema(schema), expected);
        }
    }

    /// The schema part of a data directory is encoded; the chunk id is written as-is.
    #[test]
    fn test_data_dir_for_schema_chunk_encodes_schema_segment() {
        let cases = [
            ("public", 1, "data/public/1/"),
            ("../evil", 7, "data/%2E%2E%2Fevil/7/"),
        ];
        for (schema, chunk_id, expected) in cases {
            assert_eq!(data_dir_for_schema_chunk(schema, chunk_id), expected);
        }
    }
}

View File

@@ -0,0 +1,863 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Storage abstraction for Export/Import V2.
//!
//! This module provides a unified interface for reading and writing snapshot data
//! to various storage backends (S3, OSS, GCS, Azure Blob, local filesystem).
use std::collections::BTreeSet;
use async_trait::async_trait;
use futures::TryStreamExt;
use object_store::services::{Azblob, Fs, Gcs, Oss, S3};
use object_store::util::{with_instrument_layers, with_retry_layers};
use object_store::{
AzblobConnection, ErrorKind, GcsConnection, ObjectStore, OssConnection, S3Connection,
};
use snafu::ResultExt;
use url::Url;
use crate::common::ObjectStoreConfig;
use crate::data::export_v2::error::{
BuildObjectStoreSnafu, InvalidUriSnafu, ManifestParseSnafu, ManifestSerializeSnafu, Result,
SnapshotNotFoundSnafu, StorageOperationSnafu, TextDecodeSnafu, UnsupportedSchemeSnafu,
UrlParseSnafu,
};
use crate::data::export_v2::manifest::{MANIFEST_FILE, Manifest};
#[cfg(test)]
use crate::data::export_v2::schema::SchemaDefinition;
use crate::data::export_v2::schema::{SCHEMA_DIR, SCHEMAS_FILE, SchemaSnapshot};
/// Bucket/container name and root path parsed from a remote snapshot URI.
struct RemoteLocation {
    /// Host component of the URI; the backends use it as bucket or container name.
    bucket_or_container: String,
    /// Path component of the URI with leading `/` characters removed.
    root: String,
}
/// URI schemes supported for snapshot storage.
///
/// The scheme is what selects the storage backend when a storage is built
/// from a URI.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StorageScheme {
    /// Amazon S3 (`s3://`).
    S3,
    /// Alibaba Cloud OSS (`oss://`).
    Oss,
    /// Google Cloud Storage (`gs://` or `gcs://`).
    Gcs,
    /// Azure Blob Storage (`azblob://`).
    Azblob,
    /// Local filesystem (file://).
    File,
}
impl StorageScheme {
    /// Determines the storage scheme of `uri`.
    ///
    /// Fails if `uri` cannot be parsed as a URL, or if its scheme is not one
    /// of `s3`, `oss`, `gs`, `gcs`, `azblob` or `file`.
    pub fn from_uri(uri: &str) -> Result<Self> {
        let parsed = Url::parse(uri).context(UrlParseSnafu)?;
        let detected = match parsed.scheme() {
            "s3" => Self::S3,
            "oss" => Self::Oss,
            "gs" | "gcs" => Self::Gcs,
            "azblob" => Self::Azblob,
            "file" => Self::File,
            scheme => return UnsupportedSchemeSnafu { scheme }.fail(),
        };
        Ok(detected)
    }
}
/// Splits a remote URI into its bucket/container (the URI host) and its root
/// path (the URI path without leading slashes).
///
/// A missing or empty host is always an error. An empty root is an error
/// unless `allow_empty_root` is `true`.
fn extract_remote_location_with_root_policy(
    uri: &str,
    allow_empty_root: bool,
) -> Result<RemoteLocation> {
    let parsed = Url::parse(uri).context(UrlParseSnafu)?;

    let bucket_or_container = match parsed.host_str() {
        Some(host) if !host.is_empty() => host.to_string(),
        _ => {
            return InvalidUriSnafu {
                uri,
                reason: "URI must include bucket/container in host",
            }
            .fail();
        }
    };

    let root = parsed.path().trim_start_matches('/').to_string();
    let root_is_acceptable = allow_empty_root || !root.is_empty();
    if !root_is_acceptable {
        return InvalidUriSnafu {
            uri,
            reason: "snapshot URI must include a non-empty path after the bucket/container",
        }
        .fail();
    }

    Ok(RemoteLocation {
        bucket_or_container,
        root,
    })
}
/// Checks that `uri` carries an explicit, supported scheme and returns it.
///
/// Bare paths (e.g., `/tmp/backup`, `./backup`) are rejected because:
/// - Schema export (CLI) and data export (server) run in different processes
/// - Using bare paths would split the snapshot across machines
///
/// Supported URI schemes:
/// - `s3://bucket/path` - Amazon S3
/// - `oss://bucket/path` - Alibaba Cloud OSS
/// - `gs://bucket/path` - Google Cloud Storage
/// - `azblob://container/path` - Azure Blob Storage
/// - `file:///absolute/path` - Local filesystem
pub fn validate_uri(uri: &str) -> Result<StorageScheme> {
    // Anything without a `scheme://` separator is treated as a bare path.
    let has_scheme_separator = uri.contains("://");
    if has_scheme_separator {
        StorageScheme::from_uri(uri)
    } else {
        InvalidUriSnafu {
            uri,
            reason: "URI must have a scheme (e.g., s3://, file://). Bare paths are not supported.",
        }
        .fail()
    }
}
/// Returns the snapshot-relative path of the schema index file,
/// `<SCHEMA_DIR>/<SCHEMAS_FILE>`.
fn schema_index_path() -> String {
    format!("{SCHEMA_DIR}/{SCHEMAS_FILE}")
}
/// Converts a `file://` URI into the filesystem path it refers to.
///
/// A URI whose host is neither empty nor `localhost` is rejected, as is any
/// URI that `Url::to_file_path` cannot convert. Both cases report the same
/// reason.
fn extract_file_path_from_uri(uri: &str) -> Result<String> {
    const ABSOLUTE_PATH_REQUIRED: &str =
        "file:// URI must use an absolute path like file:///tmp/backup";

    let parsed = Url::parse(uri).context(UrlParseSnafu)?;

    // `file://tmp/backup` parses with `tmp` as the host, which is almost
    // certainly a mistyped `file:///tmp/backup`.
    let names_foreign_host = matches!(
        parsed.host_str(),
        Some(host) if !host.is_empty() && host != "localhost"
    );
    if names_foreign_host {
        return InvalidUriSnafu {
            uri,
            reason: ABSOLUTE_PATH_REQUIRED,
        }
        .fail();
    }

    let path = parsed.to_file_path().map_err(|_| {
        InvalidUriSnafu {
            uri,
            reason: ABSOLUTE_PATH_REQUIRED,
        }
        .build()
    })?;
    Ok(path.to_string_lossy().into_owned())
}
/// Returns `Ok(())` when `storage` reports an existing snapshot, and a
/// `SnapshotNotFound` error naming the storage's target URI otherwise.
async fn ensure_snapshot_exists(storage: &OpenDalStorage) -> Result<()> {
    let snapshot_present = storage.exists().await?;
    if !snapshot_present {
        return SnapshotNotFoundSnafu {
            uri: storage.target_uri.as_str(),
        }
        .fail();
    }
    Ok(())
}
/// Snapshot storage abstraction.
///
/// Provides operations for reading and writing snapshot data to various storage backends.
/// All paths taken by the methods below are relative to the snapshot root.
#[async_trait]
pub trait SnapshotStorage: Send + Sync {
    /// Checks if a snapshot exists at this location (manifest.json exists).
    async fn exists(&self) -> Result<bool>;
    /// Reads the manifest file.
    async fn read_manifest(&self) -> Result<Manifest>;
    /// Writes the manifest file.
    async fn write_manifest(&self, manifest: &Manifest) -> Result<()>;
    /// Writes the schema index to schema/schemas.json.
    async fn write_schema(&self, schema: &SchemaSnapshot) -> Result<()>;
    /// Writes a text file to a relative path under the snapshot root.
    async fn write_text(&self, path: &str, content: &str) -> Result<()>;
    /// Reads a text file from a relative path under the snapshot root.
    async fn read_text(&self, path: &str) -> Result<String>;
    /// Creates a directory-like prefix under the snapshot root when needed by the backend.
    async fn create_dir_all(&self, path: &str) -> Result<()>;
    /// Lists files recursively under a relative prefix.
    ///
    /// Returns the paths of the files found.
    async fn list_files_recursive(&self, prefix: &str) -> Result<Vec<String>>;
    /// Deletes the entire snapshot (for --force).
    async fn delete_snapshot(&self) -> Result<()>;
}
/// OpenDAL-based implementation of SnapshotStorage.
pub struct OpenDalStorage {
    /// Object store that all reads, writes, listings and deletes go through.
    object_store: ObjectStore,
    /// The URI this storage was created from; used when reporting a missing snapshot.
    target_uri: String,
}
impl OpenDalStorage {
    /// Pairs an object store with the URI it was built from.
    ///
    /// The URI is only stored, not parsed or validated here.
    fn new_operator_rooted(object_store: ObjectStore, target_uri: &str) -> Self {
        Self {
            object_store,
            target_uri: target_uri.to_string(),
        }
    }

    /// Finishes a local (filesystem) store by applying the instrument layers.
    /// No retry layers are added.
    fn finish_local_store(object_store: ObjectStore) -> ObjectStore {
        with_instrument_layers(object_store, false)
    }

    /// Finishes a remote store by applying the retry layers first and the
    /// instrument layers on top of them.
    fn finish_remote_store(object_store: ObjectStore) -> ObjectStore {
        with_instrument_layers(with_retry_layers(object_store), false)
    }

    /// Returns `Ok(())` when `enabled` is true; otherwise fails with an
    /// `InvalidUri` error for `uri` carrying `reason`.
    fn ensure_backend_enabled(uri: &str, enabled: bool, reason: &'static str) -> Result<()> {
        if enabled {
            Ok(())
        } else {
            InvalidUriSnafu { uri, reason }.fail()
        }
    }

    /// Converts a failed backend config validation into an `InvalidUri` error
    /// whose reason is `invalid <backend> config: <error>`.
    fn validate_remote_config<E: std::fmt::Display>(
        uri: &str,
        backend: &str,
        result: std::result::Result<(), E>,
    ) -> Result<()> {
        result.map_err(|error| {
            InvalidUriSnafu {
                uri,
                reason: format!("invalid {} config: {}", backend, error),
            }
            .build()
        })
    }

    /// Creates a new storage from a file:// URI.
    ///
    /// The filesystem path extracted from the URI becomes the root of the store.
    pub fn from_file_uri(uri: &str) -> Result<Self> {
        let path = extract_file_path_from_uri(uri)?;
        let builder = Fs::default().root(&path);
        let object_store = ObjectStore::new(builder)
            .context(BuildObjectStoreSnafu)?
            .finish();
        Ok(Self::new_operator_rooted(
            Self::finish_local_store(object_store),
            uri,
        ))
    }

    /// Like [`Self::from_file_uri`], but first rejects the URI if any remote
    /// backend flag (S3, OSS, GCS, Azblob) is enabled in `storage`.
    fn from_file_uri_with_config(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
        if storage.enable_s3 || storage.enable_oss || storage.enable_gcs || storage.enable_azblob {
            return InvalidUriSnafu {
                uri,
                reason: "file:// cannot be used with remote storage flags",
            }
            .fail();
        }
        Self::from_file_uri(uri)
    }

    /// Creates an S3-backed storage for a snapshot URI; an empty root is rejected.
    fn from_s3_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
        Self::from_s3_uri_with_root_policy(uri, storage, false)
    }

    /// Creates an S3-backed storage.
    ///
    /// Requires the S3 backend to be enabled in `storage`. Bucket and root are
    /// taken from `uri` and overwrite the corresponding fields of a copy of the
    /// S3 config, which is then validated. `allow_empty_root` controls whether
    /// a URI without a path after the bucket is accepted.
    fn from_s3_uri_with_root_policy(
        uri: &str,
        storage: &ObjectStoreConfig,
        allow_empty_root: bool,
    ) -> Result<Self> {
        Self::ensure_backend_enabled(
            uri,
            storage.enable_s3,
            "s3:// requires --s3 and related options",
        )?;
        let location = extract_remote_location_with_root_policy(uri, allow_empty_root)?;
        let mut config = storage.s3.clone();
        config.s3_bucket = location.bucket_or_container;
        config.s3_root = location.root;
        Self::validate_remote_config(uri, "s3", config.validate())?;
        let conn: S3Connection = config.into();
        let object_store = ObjectStore::new(S3::from(&conn))
            .context(BuildObjectStoreSnafu)?
            .finish();
        Ok(Self::new_operator_rooted(
            Self::finish_remote_store(object_store),
            uri,
        ))
    }

    /// Creates an OSS-backed storage for a snapshot URI; an empty root is rejected.
    fn from_oss_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
        Self::from_oss_uri_with_root_policy(uri, storage, false)
    }

    /// Creates an OSS-backed storage.
    ///
    /// Requires the OSS backend to be enabled in `storage`. Bucket and root are
    /// taken from `uri` and overwrite the corresponding fields of a copy of the
    /// OSS config, which is then validated. `allow_empty_root` controls whether
    /// a URI without a path after the bucket is accepted.
    fn from_oss_uri_with_root_policy(
        uri: &str,
        storage: &ObjectStoreConfig,
        allow_empty_root: bool,
    ) -> Result<Self> {
        Self::ensure_backend_enabled(
            uri,
            storage.enable_oss,
            "oss:// requires --oss and related options",
        )?;
        let location = extract_remote_location_with_root_policy(uri, allow_empty_root)?;
        let mut config = storage.oss.clone();
        config.oss_bucket = location.bucket_or_container;
        config.oss_root = location.root;
        Self::validate_remote_config(uri, "oss", config.validate())?;
        let conn: OssConnection = config.into();
        let object_store = ObjectStore::new(Oss::from(&conn))
            .context(BuildObjectStoreSnafu)?
            .finish();
        Ok(Self::new_operator_rooted(
            Self::finish_remote_store(object_store),
            uri,
        ))
    }

    /// Creates a GCS-backed storage for a snapshot URI; an empty root is rejected.
    fn from_gcs_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
        Self::from_gcs_uri_with_root_policy(uri, storage, false)
    }

    /// Creates a GCS-backed storage.
    ///
    /// Requires the GCS backend to be enabled in `storage`. Bucket and root are
    /// taken from `uri` and overwrite the corresponding fields of a copy of the
    /// GCS config. When an empty root is both allowed and present, the reduced
    /// check in [`Self::validate_gcs_parent_config`] replaces the regular
    /// config validation.
    fn from_gcs_uri_with_root_policy(
        uri: &str,
        storage: &ObjectStoreConfig,
        allow_empty_root: bool,
    ) -> Result<Self> {
        Self::ensure_backend_enabled(
            uri,
            storage.enable_gcs,
            "gs:// or gcs:// requires --gcs and related options",
        )?;
        let location = extract_remote_location_with_root_policy(uri, allow_empty_root)?;
        let mut config = storage.gcs.clone();
        config.gcs_bucket = location.bucket_or_container;
        config.gcs_root = location.root;
        // GCS validate() rejects empty root, unlike S3/OSS/Azblob.
        if allow_empty_root && config.gcs_root.is_empty() {
            Self::validate_gcs_parent_config(uri, &config)?;
        } else {
            Self::validate_remote_config(uri, "gcs", config.validate())?;
        }
        let conn: GcsConnection = config.into();
        let object_store = ObjectStore::new(Gcs::from(&conn))
            .context(BuildObjectStoreSnafu)?
            .finish();
        Ok(Self::new_operator_rooted(
            Self::finish_remote_store(object_store),
            uri,
        ))
    }

    /// Validation used for GCS when the root is allowed to be empty: only
    /// checks that bucket and scope are non-empty.
    fn validate_gcs_parent_config(
        uri: &str,
        config: &crate::common::PrefixedGcsConnection,
    ) -> Result<()> {
        if config.gcs_bucket.is_empty() {
            return InvalidUriSnafu {
                uri,
                reason: "invalid gcs config: GCS bucket must be set when --gcs is enabled.",
            }
            .fail();
        }
        if config.gcs_scope.is_empty() {
            return InvalidUriSnafu {
                uri,
                reason: "invalid gcs config: GCS scope must be set when --gcs is enabled.",
            }
            .fail();
        }
        Ok(())
    }

    /// Creates an Azure Blob-backed storage for a snapshot URI; an empty root is rejected.
    fn from_azblob_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
        Self::from_azblob_uri_with_root_policy(uri, storage, false)
    }

    /// Creates an Azure Blob-backed storage.
    ///
    /// Requires the Azblob backend to be enabled in `storage`. Container and
    /// root are taken from `uri` and overwrite the corresponding fields of a
    /// copy of the Azblob config, which is then validated. `allow_empty_root`
    /// controls whether a URI without a path after the container is accepted.
    fn from_azblob_uri_with_root_policy(
        uri: &str,
        storage: &ObjectStoreConfig,
        allow_empty_root: bool,
    ) -> Result<Self> {
        Self::ensure_backend_enabled(
            uri,
            storage.enable_azblob,
            "azblob:// requires --azblob and related options",
        )?;
        let location = extract_remote_location_with_root_policy(uri, allow_empty_root)?;
        let mut config = storage.azblob.clone();
        config.azblob_container = location.bucket_or_container;
        config.azblob_root = location.root;
        Self::validate_remote_config(uri, "azblob", config.validate())?;
        let conn: AzblobConnection = config.into();
        let object_store = ObjectStore::new(Azblob::from(&conn))
            .context(BuildObjectStoreSnafu)?
            .finish();
        Ok(Self::new_operator_rooted(
            Self::finish_remote_store(object_store),
            uri,
        ))
    }

    /// Creates a new storage from a URI and object store config.
    ///
    /// The backend is chosen from the URI scheme. Remote URIs must have a
    /// non-empty path after the bucket/container.
    pub fn from_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
        match StorageScheme::from_uri(uri)? {
            StorageScheme::File => Self::from_file_uri_with_config(uri, storage),
            StorageScheme::S3 => Self::from_s3_uri(uri, storage),
            StorageScheme::Oss => Self::from_oss_uri(uri, storage),
            StorageScheme::Gcs => Self::from_gcs_uri(uri, storage),
            StorageScheme::Azblob => Self::from_azblob_uri(uri, storage),
        }
    }

    /// Creates storage rooted at a snapshot parent URI.
    ///
    /// Parent-oriented commands such as `export-v2 list` may scan bucket/container
    /// roots. Snapshot-oriented commands must keep using `from_uri`, which rejects
    /// empty remote roots to avoid unsafe snapshot operations at bucket scope.
    pub fn from_parent_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
        match StorageScheme::from_uri(uri)? {
            StorageScheme::File => Self::from_file_uri_with_config(uri, storage),
            StorageScheme::S3 => Self::from_s3_uri_with_root_policy(uri, storage, true),
            StorageScheme::Oss => Self::from_oss_uri_with_root_policy(uri, storage, true),
            StorageScheme::Gcs => Self::from_gcs_uri_with_root_policy(uri, storage, true),
            StorageScheme::Azblob => Self::from_azblob_uri_with_root_policy(uri, storage, true),
        }
    }

    /// Reads a file as bytes.
    ///
    /// Any read failure, including a missing file, is returned as a storage
    /// operation error labelled `read <path>`.
    async fn read_file(&self, path: &str) -> Result<Vec<u8>> {
        let data = self
            .object_store
            .read(path)
            .await
            .context(StorageOperationSnafu {
                operation: format!("read {}", path),
            })?;
        Ok(data.to_vec())
    }

    /// Reads a file as bytes if it exists.
    ///
    /// Returns `Ok(None)` when the store reports `NotFound`; every other
    /// failure is returned as an error.
    pub(crate) async fn read_file_if_exists(&self, path: &str) -> Result<Option<Vec<u8>>> {
        match self.object_store.read(path).await {
            Ok(data) => Ok(Some(data.to_vec())),
            Err(error) if error.kind() == ErrorKind::NotFound => Ok(None),
            Err(error) => Err(error).context(StorageOperationSnafu {
                operation: format!("read {}", path),
            }),
        }
    }

    /// Writes bytes to a file.
    ///
    /// The value returned by the underlying write is discarded.
    async fn write_file(&self, path: &str, data: Vec<u8>) -> Result<()> {
        self.object_store
            .write(path, data)
            .await
            .map(|_| ())
            .context(StorageOperationSnafu {
                operation: format!("write {}", path),
            })
    }

    /// Checks if a file exists using stat.
    ///
    /// `NotFound` maps to `Ok(false)`; every other stat failure is an error.
    async fn file_exists(&self, path: &str) -> Result<bool> {
        match self.object_store.stat(path).await {
            Ok(_) => Ok(true),
            Err(e) if e.kind() == object_store::ErrorKind::NotFound => Ok(false),
            Err(e) => Err(e).context(StorageOperationSnafu {
                operation: format!("check exists {}", path),
            }),
        }
    }

    /// Lists direct child directory names under the storage root.
    ///
    /// Names are returned sorted and without duplicates. A root that does not
    /// exist yields an empty list.
    pub(crate) async fn list_direct_child_dirs(&self) -> Result<Vec<String>> {
        let mut lister = match self.object_store.lister_with("/").recursive(false).await {
            Ok(lister) => lister,
            Err(error) if error.kind() == ErrorKind::NotFound => return Ok(Vec::new()),
            Err(error) => {
                return Err(error).context(StorageOperationSnafu {
                    operation: "list /",
                });
            }
        };
        // A BTreeSet both de-duplicates the names and keeps them ordered.
        let mut dirs = BTreeSet::new();
        while let Some(entry) = lister.try_next().await.context(StorageOperationSnafu {
            operation: "list /",
        })? {
            let path = entry.path().trim_matches('/');
            // Skip entries whose path is empty once slashes are trimmed (e.g. `/`).
            if path.is_empty() {
                continue;
            }
            if entry.metadata().is_dir()
                && let Some(name) = path.split('/').next()
            {
                dirs.insert(name.to_string());
            }
        }
        Ok(dirs.into_iter().collect())
    }

    /// Test-only: reads the schema index written by `write_schema`.
    ///
    /// A missing index file yields a snapshot with no schemas.
    #[cfg(test)]
    pub async fn read_schema(&self) -> Result<SchemaSnapshot> {
        let schemas_path = schema_index_path();
        let schemas: Vec<SchemaDefinition> = if self.file_exists(&schemas_path).await? {
            let data = self.read_file(&schemas_path).await?;
            serde_json::from_slice(&data).context(ManifestParseSnafu)?
        } else {
            vec![]
        };
        Ok(SchemaSnapshot { schemas })
    }
}
#[async_trait]
impl SnapshotStorage for OpenDalStorage {
    /// A snapshot exists when its manifest file is present in the store.
    async fn exists(&self) -> Result<bool> {
        self.file_exists(MANIFEST_FILE).await
    }

    /// Reads and parses the manifest.
    ///
    /// Fails with a snapshot-not-found error when the manifest is absent, and
    /// with a parse error when its content is not a valid manifest.
    async fn read_manifest(&self) -> Result<Manifest> {
        ensure_snapshot_exists(self).await?;
        let data = self.read_file(MANIFEST_FILE).await?;
        serde_json::from_slice(&data).context(ManifestParseSnafu)
    }

    /// Serializes `manifest` as pretty-printed JSON and writes it to the manifest file.
    async fn write_manifest(&self, manifest: &Manifest) -> Result<()> {
        let data = serde_json::to_vec_pretty(manifest).context(ManifestSerializeSnafu)?;
        self.write_file(MANIFEST_FILE, data).await
    }

    /// Writes the list of schema definitions as pretty-printed JSON to the schema index file.
    async fn write_schema(&self, schema: &SchemaSnapshot) -> Result<()> {
        let schemas_path = schema_index_path();
        let schemas_data =
            serde_json::to_vec_pretty(&schema.schemas).context(ManifestSerializeSnafu)?;
        self.write_file(&schemas_path, schemas_data).await
    }

    /// Writes `content` as UTF-8 bytes to `path`.
    async fn write_text(&self, path: &str, content: &str) -> Result<()> {
        self.write_file(path, content.as_bytes().to_vec()).await
    }

    /// Reads `path` and decodes it as UTF-8; invalid UTF-8 is an error.
    async fn read_text(&self, path: &str) -> Result<String> {
        let data = self.read_file(path).await?;
        String::from_utf8(data).context(TextDecodeSnafu)
    }

    /// Creates the directory `path` under the snapshot root.
    ///
    /// `path` may be given with or without a trailing `/`.
    async fn create_dir_all(&self, path: &str) -> Result<()> {
        // OpenDAL only accepts directory paths that end with `/`; add the
        // slash here so callers do not have to know about that convention.
        let dir_path = if path.ends_with('/') {
            path.to_string()
        } else {
            format!("{path}/")
        };
        self.object_store
            .create_dir(&dir_path)
            .await
            .context(StorageOperationSnafu {
                operation: format!("create dir {}", path),
            })
    }

    /// Lists every non-directory entry below `prefix`.
    ///
    /// A prefix that does not exist yields an empty list.
    async fn list_files_recursive(&self, prefix: &str) -> Result<Vec<String>> {
        let mut lister = match self.object_store.lister_with(prefix).recursive(true).await {
            Ok(lister) => lister,
            Err(error) if error.kind() == ErrorKind::NotFound => return Ok(Vec::new()),
            Err(error) => {
                return Err(error).context(StorageOperationSnafu {
                    operation: format!("list {}", prefix),
                });
            }
        };
        let mut files = Vec::new();
        while let Some(entry) = lister.try_next().await.context(StorageOperationSnafu {
            operation: format!("list {}", prefix),
        })? {
            // Directory entries are not files; only report leaf objects.
            if entry.metadata().is_dir() {
                continue;
            }
            files.push(entry.path().to_string());
        }
        Ok(files)
    }

    /// Recursively deletes everything under the storage root.
    async fn delete_snapshot(&self) -> Result<()> {
        self.object_store
            .delete_with("/")
            .recursive(true)
            .await
            .context(StorageOperationSnafu {
                operation: "delete snapshot",
            })
    }
}
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::path::Path;

    use object_store::ObjectStore;
    use object_store::services::Fs;
    use tempfile::tempdir;
    use url::Url;

    use super::*;
    use crate::data::export_v2::manifest::{DataFormat, TimeRange};
    use crate::data::export_v2::schema::SchemaDefinition;

    /// Builds an `OpenDalStorage` over a filesystem store rooted at `root`.
    fn make_storage_with_rooted_fs(root: &std::path::Path) -> OpenDalStorage {
        let builder = Fs::default().root(root.to_str().unwrap());
        let local_store = ObjectStore::new(builder).unwrap().finish();
        let root_uri = Url::from_directory_path(root).unwrap();
        OpenDalStorage::new_operator_rooted(
            OpenDalStorage::finish_local_store(local_store),
            root_uri.as_ref(),
        )
    }

    /// Every supported scheme maps to its `StorageScheme` variant.
    #[test]
    fn test_validate_uri_valid() {
        let cases = [
            ("s3://bucket/path", StorageScheme::S3),
            ("oss://bucket/path", StorageScheme::Oss),
            ("gs://bucket/path", StorageScheme::Gcs),
            ("gcs://bucket/path", StorageScheme::Gcs),
            ("azblob://container/path", StorageScheme::Azblob),
            ("file:///tmp/backup", StorageScheme::File),
        ];
        for (uri, expected) in cases {
            assert_eq!(validate_uri(uri).unwrap(), expected);
        }
    }

    #[test]
    fn test_validate_uri_invalid() {
        // Bare paths should be rejected
        for bare_path in ["/tmp/backup", "./backup", "backup"] {
            assert!(validate_uri(bare_path).is_err());
        }
        // Unknown schemes
        assert!(validate_uri("ftp://server/path").is_err());
    }

    /// Without the empty-root allowance, a URI naming only a bucket is rejected.
    #[test]
    fn test_extract_remote_location_requires_non_empty_root() {
        let bucket_only_uris = [
            "s3://bucket",
            "s3://bucket/",
            "oss://bucket",
            "gs://bucket",
            "azblob://container",
        ];
        for uri in bucket_only_uris {
            assert!(extract_remote_location_with_root_policy(uri, false).is_err());
        }
    }

    /// With the empty-root allowance, a bucket-only URI yields an empty root.
    #[test]
    fn test_extract_remote_location_allows_empty_root_when_permitted() {
        let s3_location = extract_remote_location_with_root_policy("s3://bucket", true).unwrap();
        assert_eq!(s3_location.bucket_or_container, "bucket");
        assert_eq!(s3_location.root, "");

        let azblob_location =
            extract_remote_location_with_root_policy("azblob://container/", true).unwrap();
        assert_eq!(azblob_location.bucket_or_container, "container");
        assert_eq!(azblob_location.root, "");
    }

    /// `from_uri` rejects a bucket root while `from_parent_uri` accepts it.
    #[test]
    fn test_parent_storage_allows_s3_bucket_root() {
        let mut config = ObjectStoreConfig {
            enable_s3: true,
            ..Default::default()
        };
        config.s3.s3_region = Some("us-east-1".to_string());

        let as_snapshot = OpenDalStorage::from_uri("s3://bucket", &config);
        assert!(as_snapshot.is_err());
        let as_parent = OpenDalStorage::from_parent_uri("s3://bucket", &config);
        assert!(as_parent.is_ok());
    }

    /// Unix paths: `localhost` is accepted as host and `%20` is decoded.
    #[cfg(not(windows))]
    #[test]
    fn test_extract_path_from_uri_unix_examples() {
        let cases = [
            ("file:///tmp/backup", "/tmp/backup"),
            ("file://localhost/tmp/backup", "/tmp/backup"),
            ("file:///tmp/my%20backup", "/tmp/my backup"),
            ("file://localhost/tmp/my%20backup", "/tmp/my backup"),
        ];
        for (uri, expected_path) in cases {
            assert_eq!(extract_file_path_from_uri(uri).unwrap(), expected_path);
        }
    }

    /// `file://tmp/backup` has `tmp` as its host and must be rejected.
    #[test]
    fn test_extract_file_path_from_uri_rejects_file_host() {
        let outcome = extract_file_path_from_uri("file://tmp/backup");
        assert!(outcome.is_err());
    }

    /// A URL built from a directory path converts back to the same path.
    #[test]
    fn test_extract_file_path_from_uri_round_trips_directory_url() {
        let workdir = tempdir().unwrap();
        let uri = Url::from_directory_path(workdir.path())
            .unwrap()
            .to_string();

        let extracted = extract_file_path_from_uri(&uri).unwrap();
        assert_eq!(Path::new(&extracted), workdir.path());
    }

    /// The error for a missing manifest must mention the URI the storage was opened with.
    #[tokio::test]
    async fn test_read_manifest_reports_requested_uri() {
        let workdir = tempdir().unwrap();
        let uri = Url::from_directory_path(workdir.path())
            .unwrap()
            .to_string();
        let storage = OpenDalStorage::from_file_uri(&uri).unwrap();

        let message = storage.read_manifest().await.unwrap_err().to_string();
        assert!(message.contains(uri.as_str()));
    }

    /// A written manifest reads back with the same compared fields.
    #[tokio::test]
    async fn test_manifest_round_trip() {
        let workdir = tempdir().unwrap();
        let storage = make_storage_with_rooted_fs(workdir.path());
        let written = Manifest::new_full(
            "greptime".to_string(),
            vec!["public".to_string()],
            TimeRange::unbounded(),
            DataFormat::Parquet,
        );

        storage.write_manifest(&written).await.unwrap();
        let read_back = storage.read_manifest().await.unwrap();

        assert_eq!(read_back.catalog, written.catalog);
        assert_eq!(read_back.schemas, written.schemas);
        assert_eq!(read_back.schema_only, written.schema_only);
        assert_eq!(read_back.format, written.format);
        assert_eq!(read_back.snapshot_id, written.snapshot_id);
    }

    /// A written schema snapshot reads back equal to the original.
    #[tokio::test]
    async fn test_schema_round_trip() {
        let workdir = tempdir().unwrap();
        let storage = make_storage_with_rooted_fs(workdir.path());

        let definition = SchemaDefinition {
            catalog: "greptime".to_string(),
            name: "test_db".to_string(),
            options: HashMap::from([("ttl".to_string(), "7d".to_string())]),
        };
        let mut written = SchemaSnapshot::new();
        written.add_schema(definition);

        storage.write_schema(&written).await.unwrap();
        let read_back = storage.read_schema().await.unwrap();
        assert_eq!(read_back, written);
    }

    /// Text written to a nested path reads back unchanged.
    #[tokio::test]
    async fn test_text_round_trip() {
        let workdir = tempdir().unwrap();
        let storage = make_storage_with_rooted_fs(workdir.path());
        let ddl_path = "schema/ddl/public.sql";
        let ddl = "CREATE TABLE metrics (ts TIMESTAMP TIME INDEX);";

        storage.write_text(ddl_path, ddl).await.unwrap();
        let read_back = storage.read_text(ddl_path).await.unwrap();
        assert_eq!(read_back, ddl);
    }

    /// Bytes that are not valid UTF-8 make `read_text` fail with a UTF-8 error.
    #[tokio::test]
    async fn test_read_text_rejects_invalid_utf8() {
        let workdir = tempdir().unwrap();
        let storage = make_storage_with_rooted_fs(workdir.path());
        let ddl_path = "schema/ddl/public.sql";

        storage
            .write_file(ddl_path, vec![0xff, 0xfe, 0xfd])
            .await
            .unwrap();

        let error = storage.read_text(ddl_path).await.unwrap_err();
        assert!(error.to_string().contains("UTF-8"));
    }

    /// `exists` is false before the manifest is written and true afterwards.
    #[tokio::test]
    async fn test_exists_follows_manifest_presence() {
        let workdir = tempdir().unwrap();
        let storage = make_storage_with_rooted_fs(workdir.path());
        assert!(!storage.exists().await.unwrap());

        let manifest =
            Manifest::new_schema_only("greptime".to_string(), vec!["public".to_string()]);
        storage.write_manifest(&manifest).await.unwrap();
        assert!(storage.exists().await.unwrap());
    }

    /// Deleting a snapshot must not touch a sibling directory of its root.
    #[tokio::test]
    async fn test_delete_snapshot_only_removes_rooted_contents() {
        let parent = tempdir().unwrap();
        let snapshot_root = parent.path().join("snapshot");
        let sibling_dir = parent.path().join("sibling");
        std::fs::create_dir_all(&snapshot_root).unwrap();
        std::fs::create_dir_all(&sibling_dir).unwrap();

        let manifest_file = snapshot_root.join("manifest.json");
        let sibling_file = sibling_dir.join("keep.txt");
        std::fs::write(&manifest_file, b"{}").unwrap();
        std::fs::write(&sibling_file, b"keep").unwrap();

        let storage = make_storage_with_rooted_fs(&snapshot_root);
        storage.delete_snapshot().await.unwrap();

        assert!(!manifest_file.exists());
        assert!(sibling_file.exists());
    }
}

40
src/cli/src/data/sql.rs Normal file
View File

@@ -0,0 +1,40 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Shared SQL escaping helpers for CLI-generated statements.
/// Escapes `value` for use inside a single-quoted SQL string literal by
/// doubling every single quote. The surrounding quotes are not added.
pub(crate) fn escape_sql_literal(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        if ch == '\'' {
            // Emit the extra quote that escapes the one pushed below.
            escaped.push('\'');
        }
        escaped.push(ch);
    }
    escaped
}
/// Escapes `value` for use inside a double-quoted SQL identifier by doubling
/// every double quote. The surrounding quotes are not added.
pub(crate) fn escape_sql_identifier(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        if ch == '"' {
            // Emit the extra quote that escapes the one pushed below.
            escaped.push('"');
        }
        escaped.push(ch);
    }
    escaped
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Single quotes are doubled; input without quotes is returned unchanged.
    #[test]
    fn test_escape_sql_literal_escapes_single_quotes() {
        let cases = [("test_db", "test_db"), ("te'st", "te''st")];
        for (input, expected) in cases {
            assert_eq!(escape_sql_literal(input), expected);
        }
    }

    /// Double quotes are doubled; input without quotes is returned unchanged.
    #[test]
    fn test_escape_sql_identifier_escapes_double_quotes() {
        let cases = [("test_db", "test_db"), (r#"te"st"#, r#"te""st"#)];
        for (input, expected) in cases {
            assert_eq!(escape_sql_identifier(input), expected);
        }
    }
}

View File

@@ -36,6 +36,7 @@ pub struct DatabaseClient {
auth_header: Option<String>,
timeout: Duration,
proxy: Option<reqwest::Proxy>,
no_proxy: bool,
}
pub fn parse_proxy_opts(
@@ -61,6 +62,7 @@ impl DatabaseClient {
auth_basic: Option<String>,
timeout: Duration,
proxy: Option<reqwest::Proxy>,
no_proxy: bool,
) -> Self {
let auth_header = if let Some(basic) = auth_basic {
let encoded = general_purpose::STANDARD.encode(basic);
@@ -69,7 +71,9 @@ impl DatabaseClient {
None
};
if let Some(ref proxy) = proxy {
if no_proxy {
common_telemetry::info!("Proxy disabled");
} else if let Some(ref proxy) = proxy {
common_telemetry::info!("Using proxy: {:?}", proxy);
} else {
common_telemetry::info!("Using system proxy(if any)");
@@ -81,9 +85,14 @@ impl DatabaseClient {
auth_header,
timeout,
proxy,
no_proxy,
}
}
/// Returns this client's `addr` field as a string slice.
pub fn addr(&self) -> &str {
    &self.addr
}
/// Runs `sql` by delegating to `Self::sql` with `DEFAULT_SCHEMA_NAME` as the
/// schema, and returns that call's result unchanged.
pub async fn sql_in_public(&self, sql: &str) -> Result<Option<Vec<Vec<Value>>>> {
    self.sql(sql, DEFAULT_SCHEMA_NAME).await
}
@@ -95,12 +104,14 @@ impl DatabaseClient {
("db", format!("{}-{}", self.catalog, schema)),
("sql", sql.to_string()),
];
let client = self
.proxy
.clone()
.map(|proxy| reqwest::Client::builder().proxy(proxy).build())
.unwrap_or_else(|| Ok(reqwest::Client::new()))
.context(BuildClientSnafu)?;
let mut builder = reqwest::Client::builder();
if let Some(proxy) = self.proxy.clone() {
builder = builder.proxy(proxy);
}
if self.no_proxy {
builder = builder.no_proxy();
}
let client = builder.build().context(BuildClientSnafu)?;
let mut request = client
.post(&url)
.form(&params)

View File

@@ -29,7 +29,7 @@ pub use database::DatabaseClient;
use error::Result;
pub use crate::bench::BenchTableMetadataCommand;
pub use crate::data::DataCommand;
pub use crate::data::{DataCommand, export_v2, import_v2};
pub use crate::metadata::MetadataCommand;
#[async_trait]

View File

@@ -21,7 +21,7 @@ use clap::Subcommand;
use common_error::ext::BoxedError;
use crate::Tool;
use crate::metadata::control::{DelCommand, GetCommand};
use crate::metadata::control::{DelCommand, GetCommand, PutCommand};
use crate::metadata::repair::RepairCommand;
use crate::metadata::snapshot::SnapshotCommand;
@@ -37,6 +37,8 @@ pub enum MetadataCommand {
#[clap(subcommand)]
Del(DelCommand),
#[clap(subcommand)]
Put(PutCommand),
#[clap(subcommand)]
Repair(RepairCommand),
}
@@ -47,6 +49,7 @@ impl MetadataCommand {
MetadataCommand::Repair(cmd) => cmd.build().await,
MetadataCommand::Get(cmd) => cmd.build().await,
MetadataCommand::Del(cmd) => cmd.build().await,
MetadataCommand::Put(cmd) => cmd.build().await,
}
}
}

View File

@@ -14,9 +14,12 @@
mod del;
mod get;
mod put;
mod selector;
#[cfg(test)]
mod test_utils;
mod utils;
pub(crate) use del::DelCommand;
pub(crate) use get::GetCommand;
pub(crate) use put::PutCommand;

View File

@@ -14,111 +14,59 @@
use async_trait::async_trait;
use clap::Parser;
use client::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_catalog::format_full_table_name;
use common_error::ext::BoxedError;
use common_meta::ddl::utils::get_region_wal_options;
use common_meta::key::TableMetadataManager;
use common_meta::key::table_name::TableNameManager;
use common_meta::kv_backend::KvBackendRef;
use store_api::storage::TableId;
use crate::Tool;
use crate::common::StoreConfig;
use crate::error::{InvalidArgumentsSnafu, TableNotFoundSnafu};
use crate::error::TableNotFoundSnafu;
use crate::metadata::control::del::CLI_TOMBSTONE_PREFIX;
use crate::metadata::control::utils::get_table_id_by_name;
use crate::metadata::control::selector::TableSelector;
/// Delete table metadata logically from the metadata store.
#[derive(Debug, Default, Parser)]
pub struct DelTableCommand {
/// The table id to delete from the metadata store.
#[clap(long)]
table_id: Option<u32>,
/// The table name to delete from the metadata store.
#[clap(long)]
table_name: Option<String>,
/// The schema name of the table.
#[clap(long, default_value = DEFAULT_SCHEMA_NAME)]
schema_name: String,
/// The catalog name of the table.
#[clap(long, default_value = DEFAULT_CATALOG_NAME)]
catalog_name: String,
#[clap(flatten)]
selector: TableSelector,
/// The store config.
#[clap(flatten)]
store: StoreConfig,
}
impl DelTableCommand {
fn validate(&self) -> Result<(), BoxedError> {
if matches!(
(&self.table_id, &self.table_name),
(Some(_), Some(_)) | (None, None)
) {
return Err(BoxedError::new(
InvalidArgumentsSnafu {
msg: "You must specify either --table-id or --table-name.",
}
.build(),
));
}
Ok(())
}
}
impl DelTableCommand {
pub async fn build(&self) -> Result<Box<dyn Tool>, BoxedError> {
self.validate()?;
self.selector.validate()?;
let kv_backend = self.store.build().await?;
Ok(Box::new(DelTableTool {
table_id: self.table_id,
table_name: self.table_name.clone(),
schema_name: self.schema_name.clone(),
catalog_name: self.catalog_name.clone(),
table_name_manager: TableNameManager::new(kv_backend.clone()),
selector: self.selector.clone(),
table_metadata_deleter: TableMetadataDeleter::new(kv_backend),
}))
}
}
struct DelTableTool {
table_id: Option<u32>,
table_name: Option<String>,
schema_name: String,
catalog_name: String,
table_name_manager: TableNameManager,
selector: TableSelector,
table_metadata_deleter: TableMetadataDeleter,
}
#[async_trait]
impl Tool for DelTableTool {
async fn do_work(&self) -> Result<(), BoxedError> {
let table_id = if let Some(table_name) = &self.table_name {
let catalog_name = &self.catalog_name;
let schema_name = &self.schema_name;
let Some(table_id) = get_table_id_by_name(
&self.table_name_manager,
catalog_name,
schema_name,
table_name,
let Some(table_id) = self
.selector
.resolve_table_id(
self.table_metadata_deleter
.table_metadata_manager
.table_name_manager(),
)
.await?
else {
println!(
"Table({}) not found",
format_full_table_name(catalog_name, schema_name, table_name)
);
return Ok(());
};
table_id
} else {
// Safety: we have validated that table_id or table_name is not None
self.table_id.unwrap()
else {
println!("Table({}) not found", self.selector.formatted_table_name());
return Ok(());
};
self.table_metadata_deleter.delete(table_id).await?;
println!("Table({}) deleted", table_id);
@@ -182,6 +130,7 @@ mod tests {
use std::collections::HashMap;
use std::sync::Arc;
use clap::Parser;
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_meta::key::TableMetadataManager;
@@ -192,9 +141,83 @@ mod tests {
use common_meta::rpc::store::RangeRequest;
use crate::metadata::control::del::CLI_TOMBSTONE_PREFIX;
use crate::metadata::control::del::table::TableMetadataDeleter;
use crate::metadata::control::del::table::{DelTableCommand, TableMetadataDeleter};
use crate::metadata::control::test_utils::prepare_physical_table_metadata;
#[tokio::test]
async fn test_del_table_selector_requires_single_target() {
    // Neither --table-id nor --table-name is supplied, so `build` must fail.
    let args = [
        "table",
        "--backend",
        "memory-store",
        "--store-addrs",
        "memory://",
    ];
    let command = DelTableCommand::parse_from(args);

    let Err(err) = command.build().await else {
        panic!("expected validation failure");
    };
    let expected = "You must specify either --table-id or --table-name.";
    assert!(err.output_msg().contains(expected));
}
#[tokio::test]
async fn test_del_table_selector_rejects_both_targets() {
    // Both --table-id and --table-name are supplied, so `build` must fail.
    let args = [
        "table",
        "--table-id",
        "1024",
        "--table-name",
        "my_table",
        "--backend",
        "memory-store",
        "--store-addrs",
        "memory://",
    ];
    let command = DelTableCommand::parse_from(args);

    let Err(err) = command.build().await else {
        panic!("expected validation failure");
    };
    let expected = "You must specify either --table-id or --table-name.";
    assert!(err.output_msg().contains(expected));
}
#[tokio::test]
async fn test_del_table_command_builds_tool_with_table_id() {
    // Selecting by id alone is a valid selector, so `build` must succeed.
    let args = [
        "table",
        "--table-id",
        "1024",
        "--backend",
        "memory-store",
        "--store-addrs",
        "memory://",
    ];
    let command = DelTableCommand::parse_from(args);
    let built = command.build().await;
    let _tool = built.unwrap();
}
#[tokio::test]
async fn test_del_table_command_builds_tool_with_table_name() {
    // Selecting by name alone is a valid selector, so `build` must succeed.
    let args = [
        "table",
        "--table-name",
        "my_table",
        "--backend",
        "memory-store",
        "--store-addrs",
        "memory://",
    ];
    let command = DelTableCommand::parse_from(args);
    let built = command.build().await;
    let _tool = built.unwrap();
}
#[tokio::test]
async fn test_delete_table_not_found() {
let kv_backend = Arc::new(MemoryKvBackend::new()) as KvBackendRef;

View File

@@ -16,8 +16,6 @@ use std::cmp::min;
use async_trait::async_trait;
use clap::{Parser, Subcommand};
use client::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_catalog::format_full_table_name;
use common_error::ext::BoxedError;
use common_meta::key::TableMetadataManager;
use common_meta::key::table_info::TableInfoKey;
@@ -29,8 +27,8 @@ use futures::TryStreamExt;
use crate::Tool;
use crate::common::StoreConfig;
use crate::error::InvalidArgumentsSnafu;
use crate::metadata::control::utils::{decode_key_value, get_table_id_by_name, json_formatter};
use crate::metadata::control::selector::TableSelector;
use crate::metadata::control::utils::{decode_key_value, json_formatter};
/// Getting metadata from metadata store.
#[derive(Subcommand)]
@@ -120,21 +118,8 @@ impl Tool for GetKeyTool {
/// Get table metadata from the metadata store via table id.
#[derive(Debug, Default, Parser)]
pub struct GetTableCommand {
/// Get table metadata by table id.
#[clap(long)]
table_id: Option<u32>,
/// Get table metadata by table name.
#[clap(long)]
table_name: Option<String>,
/// The schema name of the table.
#[clap(long, default_value = DEFAULT_SCHEMA_NAME)]
schema_name: String,
/// The catalog name of the table.
#[clap(long, default_value = DEFAULT_CATALOG_NAME)]
catalog_name: String,
#[clap(flatten)]
selector: TableSelector,
/// Pretty print the output.
#[clap(long, default_value = "false")]
@@ -144,29 +129,9 @@ pub struct GetTableCommand {
store: StoreConfig,
}
impl GetTableCommand {
pub fn validate(&self) -> Result<(), BoxedError> {
if matches!(
(&self.table_id, &self.table_name),
(Some(_), Some(_)) | (None, None)
) {
return Err(BoxedError::new(
InvalidArgumentsSnafu {
msg: "You must specify either --table-id or --table-name.",
}
.build(),
));
}
Ok(())
}
}
struct GetTableTool {
kvbackend: KvBackendRef,
table_id: Option<u32>,
table_name: Option<String>,
schema_name: String,
catalog_name: String,
selector: TableSelector,
pretty: bool,
}
@@ -178,24 +143,9 @@ impl Tool for GetTableTool {
let table_info_manager = table_metadata_manager.table_info_manager();
let table_route_manager = table_metadata_manager.table_route_manager();
let table_id = if let Some(table_name) = &self.table_name {
let catalog_name = &self.catalog_name;
let schema_name = &self.schema_name;
let Some(table_id) =
get_table_id_by_name(table_name_manager, catalog_name, schema_name, table_name)
.await?
else {
println!(
"Table({}) not found",
format_full_table_name(catalog_name, schema_name, table_name)
);
return Ok(());
};
table_id
} else {
// Safety: we have validated that table_id or table_name is not None
self.table_id.unwrap()
let Some(table_id) = self.selector.resolve_table_id(table_name_manager).await? else {
println!("Table({}) not found", self.selector.formatted_table_name());
return Ok(());
};
let table_info = table_info_manager
@@ -233,15 +183,94 @@ impl Tool for GetTableTool {
impl GetTableCommand {
pub async fn build(&self) -> Result<Box<dyn Tool>, BoxedError> {
self.validate()?;
self.selector.validate()?;
let kvbackend = self.store.build().await?;
Ok(Box::new(GetTableTool {
kvbackend,
table_id: self.table_id,
table_name: self.table_name.clone(),
schema_name: self.schema_name.clone(),
catalog_name: self.catalog_name.clone(),
selector: self.selector.clone(),
pretty: self.pretty,
}))
}
}
#[cfg(test)]
mod tests {
    use clap::Parser;
    use common_error::ext::ErrorExt;

    use super::GetTableCommand;

    /// Parses a `table` invocation: `selector_args` followed by the
    /// in-memory store flags shared by every test in this module.
    fn parse_command(selector_args: &[&str]) -> GetTableCommand {
        let mut argv = vec!["table"];
        argv.extend_from_slice(selector_args);
        argv.extend_from_slice(&["--backend", "memory-store", "--store-addrs", "memory://"]);
        GetTableCommand::parse_from(argv)
    }

    /// Builds `command` and asserts that it fails with the selector
    /// validation message.
    async fn assert_selector_rejected(command: &GetTableCommand) {
        let Err(err) = command.build().await else {
            panic!("expected validation failure");
        };
        assert!(
            err.output_msg()
                .contains("You must specify either --table-id or --table-name.")
        );
    }

    #[tokio::test]
    async fn test_get_table_selector_requires_single_target() {
        // No selector flag at all.
        let command = parse_command(&[]);
        assert_selector_rejected(&command).await;
    }

    #[tokio::test]
    async fn test_get_table_selector_rejects_both_targets() {
        // Both selector flags at once.
        let command = parse_command(&["--table-id", "1024", "--table-name", "my_table"]);
        assert_selector_rejected(&command).await;
    }

    #[tokio::test]
    async fn test_get_table_command_builds_tool_with_table_id() {
        let command = parse_command(&["--table-id", "1024"]);
        let _tool = command.build().await.unwrap();
    }

    #[tokio::test]
    async fn test_get_table_command_builds_tool_with_table_name() {
        let command = parse_command(&["--table-name", "my_table"]);
        let _tool = command.build().await.unwrap();
    }
}

View File

@@ -0,0 +1,56 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
mod key;
mod table;
use clap::Subcommand;
use common_error::ext::BoxedError;
use snafu::ResultExt;
use tokio::io::{AsyncRead, AsyncReadExt};
use crate::Tool;
use crate::error::FileIoSnafu;
use crate::metadata::control::put::key::PutKeyCommand;
use crate::metadata::control::put::table::PutTableCommand;
/// Reads `reader` to its end and returns every byte that was read.
///
/// A read failure is wrapped with `FileIoSnafu` and returned as a
/// [`BoxedError`].
pub(crate) async fn read_value<R>(mut reader: R) -> Result<Vec<u8>, BoxedError>
where
    R: AsyncRead + Unpin,
{
    let mut buffer = Vec::new();
    let outcome = reader.read_to_end(&mut buffer).await.context(FileIoSnafu);
    match outcome {
        // The byte count is not needed; the buffer itself is the result.
        Ok(_) => Ok(buffer),
        Err(e) => Err(BoxedError::new(e)),
    }
}
/// Subcommand for putting metadata into the metadata store.
#[derive(Subcommand)]
pub enum PutCommand {
    // Arguments for this variant are described by `PutKeyCommand`.
    Key(PutKeyCommand),
    // `PutTableCommand` is itself a subcommand enum, so this variant nests
    // a further level of subcommands.
    #[clap(subcommand)]
    Table(PutTableCommand),
}
impl PutCommand {
    /// Builds the [`Tool`] for the selected subcommand by delegating to that
    /// subcommand's own `build`.
    pub async fn build(&self) -> Result<Box<dyn Tool>, BoxedError> {
        let tool = match self {
            Self::Key(key_cmd) => key_cmd.build().await?,
            Self::Table(table_cmd) => table_cmd.build().await?,
        };
        Ok(tool)
    }
}

View File

@@ -0,0 +1,444 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use async_trait::async_trait;
use clap::Parser;
use common_error::ext::BoxedError;
use common_meta::key::catalog_name::{CatalogNameKey, CatalogNameValue};
use common_meta::key::flow::flow_state::FlowStateValue;
use common_meta::key::flow::{
flow_info_key_prefix, flow_name_key_prefix, flow_route_key_prefix, flow_state_full_key,
flownode_flow_key_prefix, table_flow_key_prefix,
};
use common_meta::key::node_address::{NodeAddressKey, NodeAddressValue};
use common_meta::key::schema_name::{SchemaNameKey, SchemaNameValue};
use common_meta::key::table_repart::{TableRepartKey, TableRepartValue};
use common_meta::key::topic_name::{TopicNameKey, TopicNameValue};
use common_meta::key::topic_region::{TopicRegionKey, TopicRegionValue};
use common_meta::key::view_info::{ViewInfoKey, ViewInfoValue};
use common_meta::key::{
CATALOG_NAME_KEY_PREFIX, DATANODE_TABLE_KEY_PREFIX, KAFKA_TOPIC_KEY_PREFIX, MetadataKey,
MetadataValue, NODE_ADDRESS_PREFIX, SCHEMA_NAME_KEY_PREFIX, TABLE_INFO_KEY_PREFIX,
TABLE_NAME_KEY_PREFIX, TABLE_REPART_PREFIX, TABLE_ROUTE_PREFIX, TOPIC_REGION_PREFIX,
VIEW_INFO_KEY_PREFIX,
};
use common_meta::kv_backend::KvBackendRef;
use common_meta::rpc::store::PutRequest;
use crate::Tool;
use crate::common::StoreConfig;
use crate::error::InvalidArgumentsSnafu;
use crate::metadata::control::put::read_value;
/// Put a key-value pair into the metadata store.
#[derive(Debug, Default, Parser)]
pub struct PutKeyCommand {
    /// The key to put into the metadata store.
    key: String,

    /// Read the value to put into the metadata store from standard input.
    // `required = true`: parsing fails when this flag is omitted.
    #[clap(long, required = true)]
    value_stdin: bool,

    /// Skip metadata validation before writing.
    #[clap(long)]
    no_validate: bool,

    // Store options, flattened into this command's own arguments; used by
    // `build` to create the KV backend.
    #[clap(flatten)]
    store: StoreConfig,
}
impl PutKeyCommand {
    /// Builds the KV backend from the store options, then builds the tool
    /// with the value read from standard input.
    pub async fn build(&self) -> Result<Box<dyn Tool>, BoxedError> {
        let backend = self.store.build().await?;
        let stdin = tokio::io::stdin();
        self.build_tool(stdin, backend).await
    }

    /// Reads the whole of `reader` as the value and packages it, together
    /// with the key and the validation switch, into a [`PutKeyTool`].
    async fn build_tool<R>(
        &self,
        reader: R,
        kv_backend: KvBackendRef,
    ) -> Result<Box<dyn Tool>, BoxedError>
    where
        R: tokio::io::AsyncRead + Unpin,
    {
        let value = read_value(reader).await?;
        let tool = PutKeyTool {
            kv_backend,
            key: self.key.clone(),
            value,
            no_validate: self.no_validate,
        };
        Ok(Box::new(tool))
    }
}
/// Tool that writes one raw key-value pair into the KV backend.
struct PutKeyTool {
    /// Backend that receives the put request.
    kv_backend: KvBackendRef,
    /// Key to write; sent to the backend as its UTF-8 bytes.
    key: String,
    /// Raw value bytes to store under `key`.
    value: Vec<u8>,
    /// When true, `do_work` skips `validate_metadata_value` before writing.
    no_validate: bool,
}
#[async_trait]
impl Tool for PutKeyTool {
    /// Validates the pair (unless validation is disabled), writes it to the
    /// backend, and prints a confirmation line on success.
    async fn do_work(&self) -> Result<(), BoxedError> {
        let should_validate = !self.no_validate;
        if should_validate {
            validate_metadata_value(&self.key, &self.value)?;
        }

        let put = PutRequest::new()
            .with_key(self.key.as_bytes())
            .with_value(self.value.clone());
        match self.kv_backend.put(put).await {
            Ok(_) => {
                println!("Key({}) updated", self.key);
                Ok(())
            }
            Err(e) => Err(BoxedError::new(e)),
        }
    }
}
fn validate_metadata_value(key: &str, value: &[u8]) -> Result<(), BoxedError> {
if let Some(reason) = unsupported_direct_put_reason(key) {
return Err(BoxedError::new(
InvalidArgumentsSnafu {
msg: format!("{reason}, use --no-validate to bypass"),
}
.build(),
));
}
if key == flow_state_full_key() {
validate_value(key, value, FlowStateValue::try_from_raw_value)?;
return Ok(());
} else if matches_key_prefix(key, VIEW_INFO_KEY_PREFIX) {
validate_key(ViewInfoKey::from_bytes(key.as_bytes()), key)?;
validate_value(key, value, ViewInfoValue::try_from_raw_value)?;
return Ok(());
} else if matches_key_prefix(key, CATALOG_NAME_KEY_PREFIX) {
validate_key(CatalogNameKey::from_bytes(key.as_bytes()), key)?;
validate_value(key, value, CatalogNameValue::try_from_raw_value)?;
return Ok(());
} else if matches_key_prefix(key, SCHEMA_NAME_KEY_PREFIX) {
validate_key(SchemaNameKey::from_bytes(key.as_bytes()), key)?;
validate_value(key, value, SchemaNameValue::try_from_raw_value)?;
return Ok(());
} else if matches_key_prefix(key, TABLE_REPART_PREFIX) {
validate_key(TableRepartKey::from_bytes(key.as_bytes()), key)?;
validate_value(key, value, TableRepartValue::try_from_raw_value)?;
return Ok(());
} else if matches_key_prefix(key, NODE_ADDRESS_PREFIX) {
validate_key(NodeAddressKey::from_bytes(key.as_bytes()), key)?;
validate_value(key, value, NodeAddressValue::try_from_raw_value)?;
return Ok(());
} else if matches_key_prefix(key, KAFKA_TOPIC_KEY_PREFIX) {
validate_key(TopicNameKey::from_bytes(key.as_bytes()), key)?;
validate_value(key, value, TopicNameValue::try_from_raw_value)?;
return Ok(());
} else if matches_key_prefix(key, TOPIC_REGION_PREFIX) {
validate_key(TopicRegionKey::from_bytes(key.as_bytes()), key)?;
validate_value(key, value, TopicRegionValue::try_from_raw_value)?;
return Ok(());
}
Err(BoxedError::new(
InvalidArgumentsSnafu {
msg: format!(
"Unsupported metadata key for validation: {key}, use --no-validate to bypass"
),
}
.build(),
))
}
/// Returns the rejection reason for keys that should not be updated by `put key`.
///
/// These keys may be decodable, but they are not safe to update via raw KV writes.
/// `__table_route/*` is the canonical example.
///
/// Returns `None` when `key` matches none of the rejected prefixes.
fn unsupported_direct_put_reason(key: &str) -> Option<String> {
    // Flow prefixes come from function calls, so bind them before borrowing.
    let flow_info_prefix = flow_info_key_prefix();
    let flow_name_prefix = flow_name_key_prefix();
    let flow_route_prefix = flow_route_key_prefix();
    let table_flow_prefix = table_flow_key_prefix();
    let flownode_flow_prefix = flownode_flow_key_prefix();

    // (prefix, human-readable target) pairs, checked in this order.
    let rejected: [(&str, &str); 9] = [
        (TABLE_ROUTE_PREFIX, "table route metadata"),
        (TABLE_INFO_KEY_PREFIX, "table info metadata"),
        (TABLE_NAME_KEY_PREFIX, "table name metadata"),
        (DATANODE_TABLE_KEY_PREFIX, "datanode table metadata"),
        (&flow_info_prefix, "flow info metadata"),
        (&flow_name_prefix, "flow name metadata"),
        (&flow_route_prefix, "flow route metadata"),
        (&table_flow_prefix, "flow source table metadata"),
        (&flownode_flow_prefix, "flownode flow metadata"),
    ];

    for (prefix, target) in rejected {
        if matches_key_prefix(key, prefix) {
            return Some(format!(
                "Direct put is not supported for {target} ({prefix}*); use a dedicated metadata update interface instead"
            ));
        }
    }
    None
}
/// Returns true when `key` is exactly `prefix`, or is `prefix` immediately
/// followed by a `/`-separated remainder.
///
/// A key that merely starts with the same characters (for example
/// `__table_routex` against `__table_route`) does not match.
fn matches_key_prefix(key: &str, prefix: &str) -> bool {
    match key.strip_prefix(prefix) {
        // An empty remainder means `key == prefix`.
        Some(rest) => rest.is_empty() || rest.starts_with('/'),
        None => false,
    }
}
/// Runs `parser` on `value` and discards the decoded result.
///
/// A parser failure is reported as an invalid-arguments error whose message
/// names `key` and includes the parser's error.
fn validate_value<T, F>(key: &str, value: &[u8], parser: F) -> Result<(), BoxedError>
where
    F: FnOnce(&[u8]) -> common_meta::error::Result<T>,
{
    match parser(value) {
        Ok(_) => Ok(()),
        Err(e) => {
            let msg = format!("Invalid metadata value for key: {key}: {e}");
            Err(BoxedError::new(InvalidArgumentsSnafu { msg }.build()))
        }
    }
}
/// Turns the outcome of decoding a metadata key into a validation result.
///
/// `result` is the already-computed decode outcome; on failure the error is
/// reported as an invalid-arguments error whose message names `key`.
fn validate_key<T>(result: common_meta::error::Result<T>, key: &str) -> Result<(), BoxedError> {
    match result {
        Ok(_) => Ok(()),
        Err(e) => {
            let msg = format!("Invalid metadata key: {key}: {e}");
            Err(BoxedError::new(InvalidArgumentsSnafu { msg }.build()))
        }
    }
}
// Unit tests for `put key`: validation rules, prefix matching, CLI parsing,
// and writes against an in-memory KV backend.
#[cfg(test)]
mod tests {
    use std::collections::BTreeMap;
    use std::sync::Arc;

    use clap::Parser;
    use common_error::ext::{BoxedError, ErrorExt};
    use common_meta::key::flow::flow_state::FlowStateValue;
    use common_meta::key::flow::flow_state_full_key;
    use common_meta::key::schema_name::SchemaNameValue;
    use common_meta::key::topic_name::TopicNameValue;
    use common_meta::key::{KAFKA_TOPIC_KEY_PREFIX, MetadataValue, SCHEMA_NAME_KEY_PREFIX};
    use common_meta::kv_backend::KvBackendRef;
    use common_meta::kv_backend::memory::MemoryKvBackend;
    use tokio::io::BufReader;

    use super::{
        PutKeyCommand, PutKeyTool, TABLE_ROUTE_PREFIX, matches_key_prefix,
        unsupported_direct_put_reason, validate_metadata_value,
    };
    use crate::Tool;

    impl PutKeyCommand {
        // Test-only entry point: forwards to `build_tool` so a test can supply
        // its own reader and backend instead of stdin and the configured store.
        async fn build_for_test<R>(
            &self,
            reader: R,
            kv_backend: KvBackendRef,
        ) -> Result<Box<dyn Tool>, BoxedError>
        where
            R: tokio::io::AsyncRead + Unpin,
        {
            self.build_tool(reader, kv_backend).await
        }
    }

    // A well-formed schema-name key with a serialized default value passes.
    #[test]
    fn test_validate_supported_key_success() {
        let value = SchemaNameValue::default().try_as_raw_value().unwrap();
        validate_metadata_value(&format!("{SCHEMA_NAME_KEY_PREFIX}/greptime/public"), &value)
            .unwrap();
    }

    // A supported key whose value bytes cannot be decoded is rejected.
    #[test]
    fn test_validate_supported_key_invalid_value() {
        let err = validate_metadata_value(
            &format!("{KAFKA_TOPIC_KEY_PREFIX}/test-topic"),
            b"not-a-valid-json-value",
        )
        .unwrap_err();
        assert!(err.output_msg().contains("Invalid metadata value for key"));
    }

    // Table route keys are rejected for direct put regardless of the value.
    #[test]
    fn test_validate_complex_key_fails() {
        let value = serde_json::to_vec(&BTreeMap::<u32, u32>::new()).unwrap();
        let err =
            validate_metadata_value(&format!("{TABLE_ROUTE_PREFIX}/1024"), &value).unwrap_err();
        assert!(
            err.output_msg()
                .contains("Direct put is not supported for table route metadata")
        );
    }

    // A key that matches no known prefix is reported as unsupported.
    #[test]
    fn test_validate_unknown_key_fails() {
        let err = validate_metadata_value("__unknown/foo", b"{}").unwrap_err();
        assert!(
            err.output_msg()
                .contains("Unsupported metadata key for validation")
        );
    }

    // A key with a supported prefix but a shape the key decoder rejects fails
    // key validation even though the value is valid.
    #[test]
    fn test_validate_invalid_supported_key_fails() {
        let value = SchemaNameValue::default().try_as_raw_value().unwrap();
        let err = validate_metadata_value("__schema_name/greptime", &value).unwrap_err();
        assert!(
            err.output_msg()
                .contains("Invalid metadata key: __schema_name/greptime")
        );
    }

    // Each listed key must produce a rejection reason.
    #[test]
    fn test_unsupported_direct_put_reason_covers_complex_keys() {
        let cases = [
            "__table_route/1024",
            "__table_info/1024",
            "__table_name/greptime/public/demo",
            "__dn_table/1/1024",
            "__flow/route/1/1",
        ];
        for key in cases {
            assert!(unsupported_direct_put_reason(key).is_some(), "key: {key}");
        }
    }

    // Matching requires an exact key or a `/` directly after the prefix.
    #[test]
    fn test_matches_key_prefix() {
        assert!(matches_key_prefix("__table_route", "__table_route"));
        assert!(matches_key_prefix("__table_route/1024", "__table_route"));
        assert!(!matches_key_prefix(
            "__table_route_extra/1024",
            "__table_route"
        ));
        assert!(!matches_key_prefix("__table_routex", "__table_route"));
        assert!(!matches_key_prefix(
            "__topic_name/kafka_backup/foo",
            "__topic_name/kafka"
        ));
    }

    // The full flow-state key with a serialized empty state passes validation.
    #[test]
    fn test_validate_exact_flow_state_key() {
        let value = FlowStateValue::new(BTreeMap::new(), BTreeMap::new())
            .try_as_raw_value()
            .unwrap();
        validate_metadata_value(&flow_state_full_key(), &value).unwrap();
    }

    // With validation enabled, a valid topic-name entry is stored verbatim.
    #[tokio::test]
    async fn test_put_key_tool_writes_supported_key() {
        let kv_backend = Arc::new(MemoryKvBackend::new()) as KvBackendRef;
        let value = TopicNameValue::new(42).try_as_raw_value().unwrap();
        let key = format!("{KAFKA_TOPIC_KEY_PREFIX}/test-topic");
        let tool = PutKeyTool {
            kv_backend: kv_backend.clone(),
            key: key.clone(),
            value: value.clone(),
            no_validate: false,
        };
        tool.do_work().await.unwrap();
        let stored = kv_backend.get(key.as_bytes()).await.unwrap().unwrap();
        assert_eq!(stored.value, value);
    }

    // With `no_validate` set, even a rejected key with a non-JSON value is stored.
    #[tokio::test]
    async fn test_put_key_tool_bypasses_validation() {
        let kv_backend = Arc::new(MemoryKvBackend::new()) as KvBackendRef;
        let key = format!("{TABLE_ROUTE_PREFIX}/1024");
        let value = b"not-json".to_vec();
        let tool = PutKeyTool {
            kv_backend: kv_backend.clone(),
            key: key.clone(),
            value: value.clone(),
            no_validate: true,
        };
        tool.do_work().await.unwrap();
        let stored = kv_backend.get(key.as_bytes()).await.unwrap().unwrap();
        assert_eq!(stored.value, value);
    }

    // Omitting `--value-stdin` is a parse error.
    #[test]
    fn test_put_key_command_requires_value_stdin() {
        let err = PutKeyCommand::try_parse_from([
            "key",
            "__topic_name/kafka/test-cli-topic",
            "--backend",
            "memory-store",
            "--store-addrs",
            "memory://",
        ])
        .unwrap_err();
        assert_eq!(err.kind(), clap::error::ErrorKind::MissingRequiredArgument);
    }

    // The command reads the value from the supplied reader (standing in for
    // stdin) and the resulting tool runs successfully.
    #[tokio::test]
    async fn test_put_key_command_builds_tool_with_stdin() {
        let value = TopicNameValue::new(7).try_as_raw_value().unwrap();
        let command = PutKeyCommand::parse_from([
            "key",
            "__topic_name/kafka/test-cli-topic",
            "--value-stdin",
            "--backend",
            "memory-store",
            "--store-addrs",
            "memory://",
        ]);
        let tool = command
            .build_for_test(
                BufReader::new(value.as_slice()),
                Arc::new(MemoryKvBackend::new()) as KvBackendRef,
            )
            .await
            .unwrap();
        tool.do_work().await.unwrap();
    }

    // With validation enabled, the tool refuses to write a table route key.
    #[tokio::test]
    async fn test_put_key_command_validate_failure() {
        let tool = PutKeyTool {
            kv_backend: Arc::new(MemoryKvBackend::new()) as KvBackendRef,
            key: "__table_route/1024".to_string(),
            value: b"{}".to_vec(),
            no_validate: false,
        };
        let err = tool.do_work().await.unwrap_err();
        assert!(
            err.output_msg()
                .contains("Direct put is not supported for table route metadata")
        );
    }
}

View File

@@ -0,0 +1,687 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashSet;
use async_trait::async_trait;
use clap::{Parser, Subcommand};
use common_error::ext::BoxedError;
use common_meta::key::datanode_table::{DatanodeTableKey, RegionInfo};
use common_meta::key::table_info::TableInfoValue;
use common_meta::key::table_route::TableRouteValue;
use common_meta::key::{
DeserializedValueWithBytes, MetadataValue, RegionDistribution, TableMetadataManager,
};
use common_meta::kv_backend::KvBackendRef;
use common_meta::rpc::router::{RegionRoute, region_distribution};
use snafu::{OptionExt, ensure};
use store_api::storage::TableId;
use table::metadata::TableInfo;
use crate::Tool;
use crate::common::StoreConfig;
use crate::error::{Error, InvalidArgumentsSnafu, TableNotFoundSnafu, UnexpectedSnafu};
use crate::metadata::control::put::read_value;
use crate::metadata::control::selector::TableSelector;
/// Put table metadata into the metadata store.
#[derive(Subcommand)]
pub enum PutTableCommand {
    // Arguments for this variant are described by `PutTableInfoCommand`.
    Info(PutTableInfoCommand),
    // Arguments for this variant are described by `PutTableRouteCommand`.
    Route(PutTableRouteCommand),
}
impl PutTableCommand {
    /// Builds the [`Tool`] for the selected subcommand by delegating to that
    /// subcommand's own `build`.
    pub async fn build(&self) -> Result<Box<dyn Tool>, BoxedError> {
        let tool = match self {
            Self::Info(info_cmd) => info_cmd.build().await?,
            Self::Route(route_cmd) => route_cmd.build().await?,
        };
        Ok(tool)
    }
}
/// Put table info into the metadata store.
#[derive(Debug, Parser)]
pub struct PutTableInfoCommand {
    // Options that select the target table; checked with `validate` when the
    // tool is built.
    #[clap(flatten)]
    selector: TableSelector,

    /// Read the JSON-encoded [`TableInfoValue`] from standard input.
    // `required = true`: the flag must be passed explicitly.
    #[clap(long, required = true)]
    value_stdin: bool,

    // Store options used by `build` to create the KV backend.
    #[clap(flatten)]
    store: StoreConfig,
}
impl PutTableInfoCommand {
    /// Builds the tool that updates table info, reading the new value from
    /// standard input.
    ///
    /// The selector is validated before the store is built, so an invalid
    /// selector is rejected without building the KV backend first.
    pub async fn build(&self) -> Result<Box<dyn Tool>, BoxedError> {
        self.selector.validate()?;
        let kv_backend = self.store.build().await?;
        self.build_tool(tokio::io::stdin(), kv_backend).await
    }

    /// Validates the selector, reads the whole of `reader` as the raw value,
    /// and packages both with `kv_backend` into a [`PutTableInfoTool`].
    async fn build_tool<R>(
        &self,
        reader: R,
        kv_backend: KvBackendRef,
    ) -> Result<Box<dyn Tool>, BoxedError>
    where
        R: tokio::io::AsyncRead + Unpin,
    {
        // Validation stays here as well so that callers reaching this method
        // without going through `build` still get the selector check, and so
        // that nothing is read from `reader` for an invalid selector.
        self.selector.validate()?;
        Ok(Box::new(PutTableInfoTool {
            kv_backend,
            selector: self.selector.clone(),
            value: read_value(reader).await?,
        }))
    }
}
/// Tool that replaces a table's info with a value supplied as raw bytes.
struct PutTableInfoTool {
    /// Backend used to construct the `TableMetadataManager` in `do_work`.
    kv_backend: KvBackendRef,
    /// Selects the target table; resolved to a table id in `do_work`.
    selector: TableSelector,
    /// Raw bytes decoded with `TableInfoValue::try_from_raw_value` in `do_work`.
    value: Vec<u8>,
}
#[async_trait]
impl Tool for PutTableInfoTool {
    /// Resolves the target table, decodes and validates the new table info
    /// against the current one, and writes it when it differs.
    ///
    /// Fails when the table cannot be resolved, the value does not decode,
    /// validation rejects it, or the update itself fails. When the new info
    /// equals the current info nothing is written or printed.
    async fn do_work(&self) -> Result<(), BoxedError> {
        let manager = TableMetadataManager::new(self.kv_backend.clone());

        let resolved = self
            .selector
            .resolve_table_id(manager.table_name_manager())
            .await?;
        let table_id = match resolved {
            Some(id) => id,
            None => {
                let msg = format!("Table({}) not found", self.selector.formatted_table_name());
                return Err(BoxedError::new(UnexpectedSnafu { msg }.build()));
            }
        };

        let (current_table_info, current_table_route) =
            load_table_metadata(&manager, table_id).await?;

        let decoded = TableInfoValue::try_from_raw_value(&self.value).map_err(|e| {
            let msg = format!("Invalid table info JSON: {e}");
            BoxedError::new(InvalidArgumentsSnafu { msg }.build())
        })?;
        let new_table_info = decoded.table_info;

        validate_table_info(table_id, &current_table_info.table_info, &new_table_info)
            .map_err(BoxedError::new)?;

        // Computed before the equality check, so a failure here is reported
        // even when the info turns out to be unchanged.
        let region_distribution =
            physical_region_distribution(current_table_route.get_inner_ref())?;

        if current_table_info.table_info == new_table_info {
            return Ok(());
        }

        manager
            .update_table_info(&current_table_info, region_distribution, new_table_info)
            .await
            .map_err(BoxedError::new)?;
        println!("Table({table_id}) info updated");
        Ok(())
    }
}
/// Put table route into the metadata store.
#[derive(Debug, Parser)]
pub struct PutTableRouteCommand {
    // Options that select the target table; checked with `validate` when the
    // tool is built.
    #[clap(flatten)]
    selector: TableSelector,

    /// Read the JSON-encoded [`TableRouteValue`] from standard input.
    // `required = true`: the flag must be passed explicitly.
    #[clap(long, required = true)]
    value_stdin: bool,

    // Store options used by `build` to create the KV backend.
    #[clap(flatten)]
    store: StoreConfig,
}
impl PutTableRouteCommand {
    /// Builds the tool for this command, reading the new route from standard input.
    ///
    /// The metadata kv backend is constructed from `self.store` first; the rest of
    /// the work is delegated to [`Self::build_tool`].
    pub async fn build(&self) -> Result<Box<dyn Tool>, BoxedError> {
        let backend = self.store.build().await?;
        let stdin = tokio::io::stdin();
        self.build_tool(stdin, backend).await
    }

    /// Validates the selector, then reads the whole value from `reader` and
    /// packages everything into a [`PutTableRouteTool`].
    ///
    /// Taking the reader and backend as parameters lets tests inject their own.
    async fn build_tool<R>(
        &self,
        reader: R,
        kv_backend: KvBackendRef,
    ) -> Result<Box<dyn Tool>, BoxedError>
    where
        R: tokio::io::AsyncRead + Unpin,
    {
        // Reject a bad selector before consuming any input.
        self.selector.validate()?;

        let selector = self.selector.clone();
        let value = read_value(reader).await?;
        let tool: Box<dyn Tool> = Box::new(PutTableRouteTool {
            kv_backend,
            selector,
            value,
        });
        Ok(tool)
    }
}
/// Tool produced by `PutTableRouteCommand::build_tool`; its `do_work` applies the
/// supplied table route to the metadata store.
struct PutTableRouteTool {
/// Backend used to construct the `TableMetadataManager`.
kv_backend: KvBackendRef,
/// Identifies the target table (by id or by name).
selector: TableSelector,
/// Raw input bytes; decoded with `TableRouteValue::try_from_raw_value` in `do_work`.
value: Vec<u8>,
}
#[async_trait]
impl Tool for PutTableRouteTool {
    /// Replaces the region routes of the selected table with the supplied ones.
    ///
    /// Steps: resolve the table id, load the current info and route, decode and
    /// validate the new route against the current one, load the region info from
    /// the datanode table, then write the new routes only if the route value
    /// differs. Both the current and the new route must expose region routes
    /// (`region_routes()` errors are propagated).
    async fn do_work(&self) -> Result<(), BoxedError> {
        let manager = TableMetadataManager::new(self.kv_backend.clone());

        let resolved = self
            .selector
            .resolve_table_id(manager.table_name_manager())
            .await?;
        let table_id = match resolved {
            Some(id) => id,
            None => {
                let msg = format!("Table({}) not found", self.selector.formatted_table_name());
                return Err(BoxedError::new(UnexpectedSnafu { msg }.build()));
            }
        };

        let (current_info, current_route) = load_table_metadata(&manager, table_id).await?;
        let current_region_routes = current_route.region_routes().map_err(BoxedError::new)?;

        let new_route = TableRouteValue::try_from_raw_value(&self.value).map_err(|e| {
            let msg = format!("Invalid table route JSON: {e}");
            BoxedError::new(InvalidArgumentsSnafu { msg }.build())
        })?;
        let new_region_routes = new_route.region_routes().map_err(BoxedError::new)?;

        validate_table_route(table_id, new_region_routes, current_region_routes)
            .map_err(BoxedError::new)?;

        // Region options come from the current table info; WAL options are carried
        // over from the existing datanode table entry.
        let region_info = load_region_info(&manager, table_id, current_region_routes).await?;
        let region_options = current_info.table_info.to_region_options();
        let wal_options = region_info.region_wal_options.clone();

        // Nothing to write (and nothing to print) when the route is unchanged.
        if current_route.get_inner_ref() == &new_route {
            return Ok(());
        }

        manager
            .update_table_route(
                table_id,
                region_info,
                &current_route,
                new_region_routes.clone(),
                &region_options,
                &wal_options,
            )
            .await
            .map_err(BoxedError::new)?;
        println!("Table({table_id}) route updated");
        Ok(())
    }
}
/// Checks that every route in `new_region_routes` belongs to `table_id` and
/// refers to a region id that is present in `current_region_route`.
///
/// Returns an `InvalidArguments` error for the first offending route. Regions
/// that exist only in the current routes are not reported.
fn validate_table_route(
    table_id: TableId,
    new_region_routes: &[RegionRoute],
    current_region_route: &[RegionRoute],
) -> Result<(), Error> {
    let known_region_ids: HashSet<_> = current_region_route
        .iter()
        .map(|route| route.region.id)
        .collect();

    for candidate in new_region_routes {
        let region_id = candidate.region.id;
        let route_table_id = region_id.table_id();

        ensure!(
            route_table_id == table_id,
            InvalidArgumentsSnafu {
                msg: format!(
                    "Invalid table route: all region routes must have table id {table_id}, but got {}",
                    route_table_id
                ),
            }
        );

        // The new route may only reference regions that already exist.
        known_region_ids
            .get(&region_id)
            .context(InvalidArgumentsSnafu {
                msg: format!(
                    "Invalid table route: region {} does not exist in current routes",
                    region_id
                ),
            })?;
    }

    Ok(())
}
/// Checks that `new_table_info` still describes the same table as
/// `current_table_info`.
///
/// The table id must equal `table_id`, and the catalog name, schema name and
/// table name must be unchanged. Checks run in that order and the first
/// mismatch is returned as an `InvalidArguments` error.
fn validate_table_info(
    table_id: TableId,
    current_table_info: &TableInfo,
    new_table_info: &TableInfo,
) -> Result<(), Error> {
    let new_table_id = new_table_info.ident.table_id;
    ensure!(
        new_table_id == table_id,
        InvalidArgumentsSnafu {
            msg: format!(
                "Invalid table info: expected table id {table_id}, got {}",
                new_table_id
            ),
        }
    );

    let (expected, got) = (
        &current_table_info.catalog_name,
        &new_table_info.catalog_name,
    );
    ensure!(
        expected == got,
        InvalidArgumentsSnafu {
            msg: format!(
                "Invalid table info: catalog name is immutable, expected {}, got {}",
                expected, got
            ),
        }
    );

    let (expected, got) = (
        &current_table_info.schema_name,
        &new_table_info.schema_name,
    );
    ensure!(
        expected == got,
        InvalidArgumentsSnafu {
            msg: format!(
                "Invalid table info: schema name is immutable, expected {}, got {}",
                expected, got
            ),
        }
    );

    let (expected, got) = (&current_table_info.name, &new_table_info.name);
    ensure!(
        expected == got,
        InvalidArgumentsSnafu {
            msg: format!(
                "Invalid table info: table name is immutable, expected {}, got {}",
                expected, got
            ),
        }
    );

    Ok(())
}
/// Loads the `RegionInfo` stored in the datanode table entry of `table_id`.
///
/// The entry is looked up on the first datanode yielded by
/// `region_distribution(region_routes)`. Fails with an `Unexpected` error when
/// the distribution is empty or the datanode table entry is missing.
async fn load_region_info(
    table_metadata_manager: &TableMetadataManager,
    table_id: TableId,
    region_routes: &[RegionRoute],
) -> Result<RegionInfo, BoxedError> {
    let Some(datanode_id) = region_distribution(region_routes).into_keys().next() else {
        let msg = format!("Missing datanode assignment for physical table route: {table_id}");
        return Err(BoxedError::new(UnexpectedSnafu { msg }.build()));
    };

    let key = DatanodeTableKey::new(datanode_id, table_id);
    let entry = table_metadata_manager
        .datanode_table_manager()
        .get(&key)
        .await
        .map_err(BoxedError::new)?;

    match entry {
        Some(value) => Ok(value.region_info),
        None => {
            let msg =
                format!("Missing datanode table metadata for physical table route: {table_id}");
            Err(BoxedError::new(UnexpectedSnafu { msg }.build()))
        }
    }
}
/// Fetches the table info and table route of `table_id` in one call.
///
/// Returns a `TableNotFound` error if either of the two values is absent.
async fn load_table_metadata(
    table_metadata_manager: &TableMetadataManager,
    table_id: TableId,
) -> Result<
    (
        DeserializedValueWithBytes<TableInfoValue>,
        DeserializedValueWithBytes<TableRouteValue>,
    ),
    BoxedError,
> {
    let (maybe_info, maybe_route) = table_metadata_manager
        .get_full_table_info(table_id)
        .await
        .map_err(BoxedError::new)?;

    match (maybe_info, maybe_route) {
        (Some(table_info), Some(table_route)) => Ok((table_info, table_route)),
        // Either half missing is reported the same way.
        _ => Err(BoxedError::new(TableNotFoundSnafu { table_id }.build())),
    }
}
/// Computes the region distribution of a physical table route.
///
/// Returns `Ok(None)` when `table_route.is_physical()` is false; errors from
/// `region_routes()` are boxed and propagated.
fn physical_region_distribution(
    table_route: &TableRouteValue,
) -> Result<Option<RegionDistribution>, BoxedError> {
    if table_route.is_physical() {
        let routes = table_route.region_routes().map_err(BoxedError::new)?;
        Ok(Some(region_distribution(routes)))
    } else {
        Ok(None)
    }
}
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use std::sync::Arc;
use clap::Parser;
use common_error::ext::{BoxedError, ErrorExt};
use common_meta::key::TableMetadataManager;
use common_meta::key::datanode_table::{DatanodeTableKey, DatanodeTableManager};
use common_meta::key::table_info::TableInfoValue;
use common_meta::key::table_route::TableRouteValue;
use common_meta::kv_backend::KvBackendRef;
use common_meta::kv_backend::memory::MemoryKvBackend;
use common_meta::peer::Peer;
use common_meta::rpc::router::RegionRoute;
use store_api::storage::RegionId;
use tokio::io::BufReader;
use super::{
PutTableInfoCommand, PutTableInfoTool, PutTableRouteCommand, PutTableRouteTool,
validate_table_route,
};
use crate::Tool;
use crate::metadata::control::selector::TableSelector;
use crate::metadata::control::test_utils::prepare_physical_table_metadata;
// Test-only entry point: forwards to the private `build_tool` so tests can
// supply their own reader and kv backend.
impl PutTableInfoCommand {
async fn build_for_test<R>(
&self,
reader: R,
kv_backend: KvBackendRef,
) -> Result<Box<dyn Tool>, BoxedError>
where
R: tokio::io::AsyncRead + Unpin,
{
self.build_tool(reader, kv_backend).await
}
}
// Test-only entry point: forwards to the private `build_tool` so tests can
// supply their own reader and kv backend.
impl PutTableRouteCommand {
async fn build_for_test<R>(
&self,
reader: R,
kv_backend: KvBackendRef,
) -> Result<Box<dyn Tool>, BoxedError>
where
R: tokio::io::AsyncRead + Unpin,
{
self.build_tool(reader, kv_backend).await
}
}
/// Building the command without `--table-id` or `--table-name` must fail with
/// the selector validation message.
#[tokio::test]
async fn test_put_table_selector_validation() {
    let cli_args = [
        "info",
        "--value-stdin",
        "--backend",
        "memory-store",
        "--store-addrs",
        "memory://",
    ];
    let command = PutTableInfoCommand::parse_from(cli_args);

    let Err(err) = command.build().await else {
        panic!("expected validation failure");
    };

    let message = err.output_msg();
    assert!(message.contains("You must specify either --table-id or --table-name."));
}
/// A command that selects the table by name builds successfully when given an
/// injected reader and an in-memory backend.
#[tokio::test]
async fn test_put_table_command_builds_tool_with_table_name() {
    let cli_args = [
        "info",
        "--table-name",
        "my_table",
        "--value-stdin",
        "--backend",
        "memory-store",
        "--store-addrs",
        "memory://",
    ];
    let command = PutTableInfoCommand::parse_from(cli_args);

    let reader = BufReader::new(&b"{}"[..]);
    let kv_backend = Arc::new(MemoryKvBackend::new()) as KvBackendRef;

    let _tool = command.build_for_test(reader, kv_backend).await.unwrap();
}
/// Submitting table info whose `name` differs from the stored one is rejected.
#[tokio::test]
async fn test_put_table_info_rejects_table_name_change() {
    let table_id = 1024;
    let kv_backend = Arc::new(MemoryKvBackend::new()) as KvBackendRef;
    let manager = TableMetadataManager::new(kv_backend.clone());

    // Seed the store with a physical table named "old_table".
    let (stored_info, physical_route) =
        prepare_physical_table_metadata("old_table", table_id).await;
    manager
        .create_table_metadata(
            stored_info.clone(),
            TableRouteValue::Physical(physical_route),
            HashMap::new(),
        )
        .await
        .unwrap();

    // Same info, but renamed.
    let mut renamed_info = stored_info;
    renamed_info.name = "new_table".to_string();
    let payload = serde_json::to_vec(&TableInfoValue::new(renamed_info)).unwrap();

    let tool = PutTableInfoTool {
        kv_backend: kv_backend.clone(),
        selector: TableSelector::with_table_id(table_id),
        value: payload,
    };

    let err = tool.do_work().await.unwrap_err();
    let message = err.output_msg();
    assert!(message.contains("Invalid table info: table name is immutable"));
}
/// Submitting table info whose `schema_name` differs from the stored one is
/// rejected.
#[tokio::test]
async fn test_put_table_info_rejects_schema_change() {
    let table_id = 1024;
    let kv_backend = Arc::new(MemoryKvBackend::new()) as KvBackendRef;
    let manager = TableMetadataManager::new(kv_backend.clone());

    // Seed the store with a physical table.
    let (stored_info, physical_route) =
        prepare_physical_table_metadata("old_table", table_id).await;
    manager
        .create_table_metadata(
            stored_info.clone(),
            TableRouteValue::Physical(physical_route),
            HashMap::new(),
        )
        .await
        .unwrap();

    // Same info, but moved to another schema.
    let mut moved_info = stored_info;
    moved_info.schema_name = "another_schema".to_string();
    let payload = serde_json::to_vec(&TableInfoValue::new(moved_info)).unwrap();

    let tool = PutTableInfoTool {
        kv_backend,
        selector: TableSelector::with_table_id(table_id),
        value: payload,
    };

    let err = tool.do_work().await.unwrap_err();
    let message = err.output_msg();
    assert!(message.contains("Invalid table info: schema name is immutable"));
}
/// Changing the leader peer of the first region updates the stored route and
/// leaves a datanode table entry for the new peer (datanode 2).
#[tokio::test]
async fn test_put_table_route_updates_route_and_datanode_table() {
    let table_id = 1024;
    let kv_backend = Arc::new(MemoryKvBackend::new()) as KvBackendRef;
    let manager = TableMetadataManager::new(kv_backend.clone());

    let (table_info, physical_route) =
        prepare_physical_table_metadata("my_table", table_id).await;
    manager
        .create_table_metadata(
            table_info,
            TableRouteValue::Physical(physical_route.clone()),
            HashMap::new(),
        )
        .await
        .unwrap();

    // Re-home the first region onto datanode 2.
    let mut moved_routes = physical_route.region_routes.clone();
    moved_routes[0].leader_peer = Some(Peer::empty(2));
    let expected_route = TableRouteValue::physical(moved_routes);

    let tool = PutTableRouteTool {
        kv_backend: kv_backend.clone(),
        selector: TableSelector::with_table_id(table_id),
        value: serde_json::to_vec(&expected_route).unwrap(),
    };
    tool.do_work().await.unwrap();

    // The stored route now matches what was submitted.
    let (_, stored_route) = manager.get_full_table_info(table_id).await.unwrap();
    let stored_route = stored_route.unwrap().into_inner();
    assert_eq!(
        stored_route.region_routes().unwrap(),
        expected_route.region_routes().unwrap()
    );

    // And datanode 2 has a datanode table entry for the table.
    let datanode_table_manager = DatanodeTableManager::new(kv_backend);
    let datanode_entry = datanode_table_manager
        .get(&DatanodeTableKey::new(2, table_id))
        .await
        .unwrap();
    assert!(datanode_entry.is_some());
}
/// Submitting a logical route for a table is rejected with the
/// "non-physical TableRouteValue." error.
#[tokio::test]
async fn test_put_table_route_rejects_logical_route() {
    let table_id = 1024;
    let kv_backend = Arc::new(MemoryKvBackend::new()) as KvBackendRef;
    let manager = TableMetadataManager::new(kv_backend.clone());

    let (table_info, physical_route) =
        prepare_physical_table_metadata("my_table", table_id).await;
    manager
        .create_table_metadata(
            table_info,
            TableRouteValue::Physical(physical_route),
            HashMap::new(),
        )
        .await
        .unwrap();

    let logical_route = TableRouteValue::logical(table_id + 1);
    let tool = PutTableRouteTool {
        kv_backend,
        selector: TableSelector::with_table_id(table_id),
        value: serde_json::to_vec(&logical_route).unwrap(),
    };

    let err = tool.do_work().await.unwrap_err();
    let message = err.output_msg();
    assert!(message.contains("non-physical TableRouteValue."));
}
/// `validate_table_route` must reject a new route that references a region
/// absent from the current routes.
///
/// The arguments are passed in the order the function declares them
/// (`table_id`, new routes, current routes). The extra subset check below
/// would fail if the two slices were swapped, so the direction of the
/// comparison is pinned as well.
#[test]
fn test_validate_table_route_rejects_new_region_not_in_current_route() {
    let table_id = 1024;
    let route_for = |region_number: u32| RegionRoute {
        region: common_meta::rpc::router::Region {
            id: RegionId::new(table_id, region_number),
            ..Default::default()
        },
        ..Default::default()
    };

    let current_region_routes = vec![route_for(1), route_for(2)];
    // Region 3 is not part of the current routes.
    let new_region_routes = vec![route_for(1), route_for(3)];

    let err =
        validate_table_route(table_id, &new_region_routes, &current_region_routes).unwrap_err();
    assert!(err.to_string().contains("does not exist in current routes"));

    // A new route made only of existing regions is accepted.
    let subset_of_current = vec![route_for(1)];
    validate_table_route(table_id, &subset_of_current, &current_region_routes).unwrap();
}
/// A route command that selects the table by id builds successfully when given
/// an injected reader and an in-memory backend.
#[tokio::test]
async fn test_put_table_command_builds_tool() {
    let payload = serde_json::to_vec(&TableRouteValue::logical(1025)).unwrap();
    let cli_args = [
        "route",
        "--table-id",
        "1024",
        "--value-stdin",
        "--backend",
        "memory-store",
        "--store-addrs",
        "memory://",
    ];
    let command = PutTableRouteCommand::parse_from(cli_args);

    let reader = BufReader::new(payload.as_slice());
    let kv_backend = Arc::new(MemoryKvBackend::new()) as KvBackendRef;

    let _tool = command.build_for_test(reader, kv_backend).await.unwrap();
}
}

View File

@@ -0,0 +1,100 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use clap::Args;
use client::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_catalog::format_full_table_name;
use common_error::ext::BoxedError;
use common_meta::key::table_name::TableNameManager;
use store_api::storage::TableId;
use crate::error::InvalidArgumentsSnafu;
use crate::metadata::control::utils::get_table_id_by_name;
/// Selects a table by id or by fully qualified name.
// NOTE: plain `//` comments are used below on purpose; with clap's derive,
// `///` doc comments become CLI help text.
// `validate()` accepts the selector only when exactly one of `table_id` /
// `table_name` is set.
#[derive(Debug, Clone, Default, Args)]
pub(crate) struct TableSelector {
/// The table id to select from the metadata store.
// When set, `resolve_table_id` returns it directly without a lookup.
#[clap(long)]
table_id: Option<u32>,
/// The table name to select from the metadata store.
#[clap(long)]
table_name: Option<String>,
/// The schema name of the table.
// Used for the by-name lookup and by `formatted_table_name`.
#[clap(long, default_value = DEFAULT_SCHEMA_NAME)]
schema_name: String,
/// The catalog name of the table.
// Used for the by-name lookup and by `formatted_table_name`.
#[clap(long, default_value = DEFAULT_CATALOG_NAME)]
catalog_name: String,
}
impl TableSelector {
    /// Ensures exactly one of `--table-id` / `--table-name` was given.
    ///
    /// Returns an `InvalidArguments` error when both or neither are set.
    pub(crate) fn validate(&self) -> Result<(), BoxedError> {
        let by_id = self.table_id.is_some();
        let by_name = self.table_name.is_some();
        if by_id != by_name {
            return Ok(());
        }

        Err(BoxedError::new(
            InvalidArgumentsSnafu {
                msg: "You must specify either --table-id or --table-name.",
            }
            .build(),
        ))
    }

    /// Resolves the selector to a table id.
    ///
    /// An explicit table id is returned as-is; otherwise the id is looked up by
    /// catalog, schema and table name.
    ///
    /// # Panics
    ///
    /// Panics if neither a table id nor a table name is set, i.e. when the
    /// selector was not checked with [`Self::validate`] first.
    pub(crate) async fn resolve_table_id(
        &self,
        table_name_manager: &TableNameManager,
    ) -> Result<Option<TableId>, BoxedError> {
        match self.table_id {
            Some(table_id) => Ok(Some(table_id)),
            None => {
                let table_name = self
                    .table_name
                    .as_deref()
                    .expect("validated table selector");
                get_table_id_by_name(
                    table_name_manager,
                    &self.catalog_name,
                    &self.schema_name,
                    table_name,
                )
                .await
            }
        }
    }

    /// Formats the catalog, schema and table name as a full table name.
    ///
    /// The table name part is empty when the table was selected by id.
    pub(crate) fn formatted_table_name(&self) -> String {
        let table_name = self.table_name.as_deref().unwrap_or_default();
        format_full_table_name(&self.catalog_name, &self.schema_name, table_name)
    }
}
#[cfg(test)]
impl TableSelector {
/// Test helper: builds a selector that targets `table_id`, with no table name
/// and the default schema and catalog names.
pub(crate) fn with_table_id(table_id: u32) -> Self {
Self {
table_id: Some(table_id),
table_name: None,
schema_name: DEFAULT_SCHEMA_NAME.to_string(),
catalog_name: DEFAULT_CATALOG_NAME.to_string(),
}
}
}

View File

@@ -16,7 +16,7 @@ use async_trait::async_trait;
use clap::{Parser, Subcommand};
use common_error::ext::BoxedError;
use common_meta::snapshot::MetadataSnapshotManager;
use object_store::{ObjectStore, Scheme};
use object_store::{ObjectStore, services};
use crate::Tool;
use crate::common::{ObjectStoreConfig, StoreConfig, new_fs_object_store};
@@ -276,7 +276,7 @@ fn build_object_store_and_resolve_file_path(
None => new_fs_object_store(fs_root)?,
};
let file_path = if matches!(object_store.info().scheme(), Scheme::Fs) {
let file_path = if object_store.info().scheme() == services::FS_SCHEME {
resolve_relative_path_with_current_dir(file_path).map_err(BoxedError::new)?
} else {
file_path.to_string()

View File

@@ -14,7 +14,9 @@
use std::pin::Pin;
use std::str::FromStr;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};
use std::task::{Context, Poll};
use api::v1::auth_header::AuthScheme;
use api::v1::ddl_request::Expr as DdlExpr;
@@ -25,6 +27,7 @@ use api::v1::{
AlterTableExpr, AuthHeader, Basic, CreateTableExpr, DdlRequest, GreptimeRequest,
InsertRequests, QueryRequest, RequestHeader, RowInsertRequests,
};
use arc_swap::ArcSwapOption;
use arrow_flight::{FlightData, Ticket};
use async_stream::stream;
use base64::Engine;
@@ -33,17 +36,18 @@ use common_catalog::build_db_string;
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_error::ext::BoxedError;
use common_grpc::flight::do_put::DoPutResponse;
use common_grpc::flight::{FlightDecoder, FlightMessage};
use common_grpc::flight::{FLOW_EXTENSIONS_METADATA_KEY, FlightDecoder, FlightMessage};
use common_query::Output;
use common_recordbatch::adapter::RecordBatchMetrics;
use common_recordbatch::error::ExternalSnafu;
use common_recordbatch::{RecordBatch, RecordBatchStreamWrapper};
use common_recordbatch::{OrderOption, RecordBatch, RecordBatchStream, RecordBatchStreamWrapper};
use common_telemetry::tracing::Span;
use common_telemetry::tracing_context::W3cTrace;
use common_telemetry::{error, warn};
use futures::future;
use futures_util::{Stream, StreamExt, TryStreamExt};
use prost::Message;
use snafu::{OptionExt, ResultExt, ensure};
use snafu::{OptionExt, ResultExt};
use tonic::metadata::{AsciiMetadataKey, AsciiMetadataValue, MetadataMap, MetadataValue};
use tonic::transport::Channel;
@@ -57,6 +61,313 @@ type FlightDataStream = Pin<Box<dyn Stream<Item = FlightData> + Send>>;
type DoPutResponseStream = Pin<Box<dyn Stream<Item = Result<DoPutResponse>>>>;
/// Terminal metrics associated with a query output.
///
/// For streaming outputs, metrics are only final after the stream is fully
/// drained and [`Self::is_ready`] returns `true`.
///
/// Cloning yields another handle to the same underlying state, since the
/// state lives behind an `Arc`.
#[derive(Debug, Clone, Default)]
pub struct OutputMetrics {
// Shared state: the latest metrics snapshot plus the "ready" flag.
inner: Arc<OutputMetricsInner>,
}
/// State shared by all clones of an [`OutputMetrics`] handle.
#[derive(Debug, Default)]
struct OutputMetricsInner {
/// Latest metrics snapshot; `None` until one has been stored via `update`.
metrics: RwLock<Option<RecordBatchMetrics>>,
/// Set once by `mark_ready`; read by `is_ready`.
ready: AtomicBool,
}
impl OutputMetrics {
    /// Creates a handle with no metrics that is not yet ready.
    fn new() -> Self {
        Self::default()
    }

    /// Replaces the current terminal metrics snapshot.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn update(&self, metrics: Option<RecordBatchMetrics>) {
        let mut slot = self.inner.metrics.write().unwrap();
        *slot = metrics;
    }

    /// Marks the terminal metrics as final for this output.
    ///
    /// Calling this more than once has no further effect.
    pub fn mark_ready(&self) {
        let ready = &self.inner.ready;
        // The result is irrelevant: a failed exchange means it was already set.
        let _ = ready.compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire);
    }

    /// Returns whether terminal metrics are final.
    ///
    /// Streaming outputs become ready only after the stream reaches EOF.
    pub fn is_ready(&self) -> bool {
        self.inner.ready.load(Ordering::Acquire)
    }

    /// Returns the latest terminal metrics snapshot, if any.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn get(&self) -> Option<RecordBatchMetrics> {
        let snapshot = self.inner.metrics.read().unwrap();
        snapshot.clone()
    }

    /// Returns proved per-region watermarks.
    ///
    /// Entries whose watermark is `None` are intentionally omitted because they
    /// represent participating regions whose terminal sequence bound was not
    /// provable. Returns `None` when no metrics snapshot is available.
    pub fn region_watermark_map(&self) -> Option<std::collections::HashMap<u64, u64>> {
        let metrics = self.get()?;
        let mut proved = std::collections::HashMap::new();
        for entry in metrics.region_watermarks {
            if let Some(seq) = entry.watermark {
                proved.insert(entry.region_id, seq);
            }
        }
        Some(proved)
    }

    /// Returns all regions that participated in terminal metric collection,
    /// including entries whose watermark is `None`. Returns `None` when no
    /// metrics snapshot is available.
    pub fn participating_regions(&self) -> Option<std::collections::BTreeSet<u64>> {
        let metrics = self.get()?;
        let regions = metrics
            .region_watermarks
            .into_iter()
            .map(|entry| entry.region_id)
            .collect();
        Some(regions)
    }
}
/// Query output together with a handle for its terminal metrics.
///
/// The contained [`OutputMetrics`] lets callers read stream terminal metrics
/// after consuming `output`. For non-stream outputs, metrics are ready
/// immediately.
#[derive(Debug)]
pub struct OutputWithMetrics {
/// The query output itself.
pub output: Output,
/// Handle to the terminal metrics that belong to `output`.
pub metrics: OutputMetrics,
}
impl OutputWithMetrics {
    /// Wraps an output with a terminal metrics handle.
    ///
    /// Stream outputs update the handle as the stream is consumed. Non-stream
    /// outputs are marked ready immediately.
    pub fn from_output(output: Output) -> Self {
        let metrics = OutputMetrics::new();
        Self {
            output: attach_terminal_metrics(output, &metrics),
            metrics,
        }
    }

    /// Returns proved per-region watermarks from the terminal metrics.
    pub fn region_watermark_map(&self) -> Option<std::collections::HashMap<u64, u64>> {
        let handle = &self.metrics;
        handle.region_watermark_map()
    }

    /// Returns all regions participating in terminal metric collection.
    pub fn participating_regions(&self) -> Option<std::collections::BTreeSet<u64>> {
        let handle = &self.metrics;
        handle.participating_regions()
    }

    /// Drops the terminal metrics handle and returns the original output.
    pub fn into_output(self) -> Output {
        let Self { output, .. } = self;
        output
    }
}
/// Decodes a JSON-encoded [`RecordBatchMetrics`] payload.
///
/// A malformed payload is reported as an `IllegalFlightMessages` error that
/// embeds the underlying JSON error.
fn parse_terminal_metrics(metrics_json: &str) -> Result<RecordBatchMetrics> {
    match serde_json::from_str::<RecordBatchMetrics>(metrics_json) {
        Ok(metrics) => Ok(metrics),
        Err(e) => IllegalFlightMessagesSnafu {
            reason: format!("Invalid terminal metrics message: {e}"),
        }
        .fail(),
    }
}
/// Record batch stream wrapper that mirrors the inner stream's metrics into a
/// shared [`OutputMetrics`] handle.
struct StreamWithMetrics {
    /// The wrapped stream.
    stream: common_recordbatch::SendableRecordBatchStream,
    /// Handle that receives the inner stream's metrics.
    metrics: OutputMetrics,
}

impl StreamWithMetrics {
    /// Wraps `stream`, publishing its metrics through `metrics`.
    fn new(stream: common_recordbatch::SendableRecordBatchStream, metrics: OutputMetrics) -> Self {
        Self { stream, metrics }
    }

    /// Copies the inner stream's current metrics into the shared handle.
    fn sync_terminal_metrics(&self) {
        let latest = self.stream.metrics();
        self.metrics.update(latest);
    }
}
/// Delegates to the wrapped stream; `metrics` additionally refreshes the shared
/// handle before answering.
impl RecordBatchStream for StreamWithMetrics {
    fn name(&self) -> &str {
        let inner = &self.stream;
        inner.name()
    }

    fn schema(&self) -> datatypes::schema::SchemaRef {
        let inner = &self.stream;
        inner.schema()
    }

    fn output_ordering(&self) -> Option<&[OrderOption]> {
        let inner = &self.stream;
        inner.output_ordering()
    }

    fn metrics(&self) -> Option<RecordBatchMetrics> {
        // Refresh the shared snapshot first, then answer from it.
        self.sync_terminal_metrics();
        let handle = &self.metrics;
        handle.get()
    }
}
/// Forwards polling to the wrapped stream and finalizes the metrics handle once
/// the wrapped stream reports end-of-stream.
impl Stream for StreamWithMetrics {
    type Item = common_recordbatch::error::Result<RecordBatch>;

    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        let outcome = self.stream.as_mut().poll_next(cx);
        // End of stream: take a last metrics snapshot and mark it final.
        if matches!(outcome, Poll::Ready(None)) {
            self.sync_terminal_metrics();
            self.metrics.mark_ready();
        }
        outcome
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let inner = &self.stream;
        inner.size_hint()
    }
}
/// Connects `output` to `terminal_metrics`.
///
/// A stream output is wrapped in a [`StreamWithMetrics`] (after seeding the
/// handle with the stream's current metrics). Any other output is returned
/// unchanged and the handle is marked ready right away.
fn attach_terminal_metrics(output: Output, terminal_metrics: &OutputMetrics) -> Output {
    let Output { data, meta } = output;

    let attached = match data {
        common_query::OutputData::Stream(inner) => {
            terminal_metrics.update(inner.metrics());
            let wrapped = StreamWithMetrics::new(inner, terminal_metrics.clone());
            common_query::OutputData::Stream(Box::pin(wrapped))
        }
        non_stream => {
            terminal_metrics.mark_ready();
            non_stream
        }
    };

    Output::new(attached, meta)
}
/// Turns a stream of decoded Flight messages into an [`OutputWithMetrics`].
///
/// The first message decides the kind of output:
/// - `AffectedRows`: an affected-rows output. Metrics may be carried inline or in
///   one following `Metrics` message, but not both; any other follow-up message
///   is an error.
/// - `Schema`: a record batch stream built lazily from the remaining messages.
/// - `RecordBatch` / `Metrics`: rejected as an illegal first message.
///
/// An empty stream is also an error.
async fn output_from_flight_message_stream<S>(
mut flight_message_stream: S,
) -> Result<OutputWithMetrics>
where
S: Stream<Item = Result<FlightMessage>> + Send + Unpin + 'static,
{
let Some(first_flight_message) = flight_message_stream.next().await else {
return IllegalFlightMessagesSnafu {
reason: "Expect the response not to be empty",
}
.fail();
};
let first_flight_message = first_flight_message?;
match first_flight_message {
FlightMessage::AffectedRows { rows, metrics } => {
let terminal_metrics = OutputMetrics::new();
// Metrics carried inline with the AffectedRows message.
if let Some(metrics) = metrics {
terminal_metrics.update(Some(parse_terminal_metrics(&metrics)?));
}
// At most one more message is read; only a Metrics message is accepted.
let next_message = flight_message_stream.next().await.transpose()?;
match next_message {
None => terminal_metrics.mark_ready(),
// A trailing Metrics message is accepted only if none were inline.
Some(FlightMessage::Metrics(s)) if terminal_metrics.get().is_none() => {
terminal_metrics.update(Some(parse_terminal_metrics(&s)?));
terminal_metrics.mark_ready();
}
Some(FlightMessage::Metrics(_)) => {
return IllegalFlightMessagesSnafu {
reason: "'AffectedRows' Flight metadata already carries Metrics and cannot be followed by another Metrics message",
}
.fail();
}
Some(other) => {
return IllegalFlightMessagesSnafu {
reason: format!(
"'AffectedRows' Flight message can only be followed by a Metrics message, got {other:?}"
),
}
.fail();
}
}
Ok(OutputWithMetrics {
output: Output::new_with_affected_rows(rows),
metrics: terminal_metrics,
})
}
FlightMessage::RecordBatch(_) | FlightMessage::Metrics(_) => IllegalFlightMessagesSnafu {
reason: "The first flight message cannot be a RecordBatch or Metrics message",
}
.fail(),
FlightMessage::Schema(schema) => {
// Slot shared between the stream body (writer) and the
// `RecordBatchStreamWrapper` built below, which receives the other handle.
let metrics = Arc::new(ArcSwapOption::from(None));
let metrics_ref = metrics.clone();
let schema = Arc::new(
datatypes::schema::Schema::try_from(schema).context(error::ConvertSchemaSnafu)?,
);
let schema_cloned = schema.clone();
let stream = Box::pin(stream!({
while let Some(flight_message_item) = flight_message_stream.next().await {
let flight_message = match flight_message_item {
Ok(message) => message,
Err(e) => {
// A transport/decoding error ends the stream after being reported.
yield Err(BoxedError::new(e)).context(ExternalSnafu);
break;
}
};
match flight_message {
FlightMessage::RecordBatch(arrow_batch) => {
yield Ok(RecordBatch::from_df_record_batch(
schema_cloned.clone(),
arrow_batch,
))
}
FlightMessage::Metrics(s) => {
match parse_terminal_metrics(&s) {
Ok(m) => {
metrics_ref.swap(Some(Arc::new(m)));
}
Err(e) => {
// NOTE(review): the error is yielded without a `break`, so the
// loop keeps reading messages afterwards — confirm this is intended.
yield Err(BoxedError::new(e)).context(ExternalSnafu);
}
};
}
FlightMessage::AffectedRows { .. } | FlightMessage::Schema(_) => {
yield IllegalFlightMessagesSnafu {
reason: format!(
"A Schema message must be succeeded exclusively by a set of RecordBatch messages, flight_message: {:?}",
flight_message
)
}
.fail()
.map_err(BoxedError::new)
.context(ExternalSnafu);
break;
}
}
}
}));
let record_batch_stream = RecordBatchStreamWrapper {
schema,
stream,
output_ordering: None,
metrics,
span: Span::current(),
};
// `from_output` wraps the stream so the terminal metrics handle is updated
// as the stream is consumed.
Ok(OutputWithMetrics::from_output(Output::new_with_stream(
Box::pin(record_batch_stream),
)))
}
}
}
#[derive(Clone, Debug, Default)]
pub struct Database {
// The "catalog" and "schema" to be used in processing the requests at the server side.
@@ -238,6 +549,22 @@ impl Database {
Ok(())
}
/// Stores `flow_extensions` in `metadata` as one JSON-encoded value under
/// `FLOW_EXTENSIONS_METADATA_KEY`.
///
/// Does nothing when `flow_extensions` is empty. Fails if the encoded JSON is
/// not a valid ASCII metadata value.
fn put_flow_extensions(
    metadata: &mut MetadataMap,
    flow_extensions: &[(&str, &str)],
) -> Result<()> {
    if !flow_extensions.is_empty() {
        let pairs = flow_extensions.to_vec();
        let encoded =
            serde_json::to_string(&pairs).expect("flow extension pairs should serialize");
        let key = AsciiMetadataKey::from_static(FLOW_EXTENSIONS_METADATA_KEY);
        let value =
            AsciiMetadataValue::from_str(&encoded).context(InvalidTonicMetadataValueSnafu)?;
        metadata.insert(key, value);
    }
    Ok(())
}
/// Make a request to the database.
pub async fn handle(&self, request: Request) -> Result<u32> {
let mut client = make_database_client(&self.client)?;
@@ -333,15 +660,58 @@ impl Database {
let request = Request::Query(QueryRequest {
query: Some(Query::Sql(sql.as_ref().to_string())),
});
self.do_get(request, hints).await
self.do_get(request, hints, &[])
.await
.map(OutputWithMetrics::into_output)
}
/// Executes a SQL query and returns the output with terminal metrics.
///
/// For stream outputs, callers must consume the stream before reading final
/// terminal metrics from [`OutputWithMetrics::metrics`]. No flow extensions
/// are sent with the request.
pub async fn sql_with_terminal_metrics<S>(
    &self,
    sql: S,
    hints: &[(&str, &str)],
) -> Result<OutputWithMetrics>
where
    S: AsRef<str>,
{
    let sql_text = sql.as_ref().to_string();
    let request = QueryRequest {
        query: Some(Query::Sql(sql_text)),
    };
    self.query_with_terminal_metrics_and_flow_extensions(request, hints, &[])
        .await
}
/// Executes a logical plan directly without SQL parsing.
pub async fn logical_plan(&self, logical_plan: Vec<u8>) -> Result<Output> {
let request = Request::Query(QueryRequest {
query: Some(Query::LogicalPlan(logical_plan)),
});
self.do_get(request, &[]).await
self.query_with_terminal_metrics_and_flow_extensions(
QueryRequest {
query: Some(Query::LogicalPlan(logical_plan)),
},
&[],
&[],
)
.await
.map(OutputWithMetrics::into_output)
}
/// Executes a query and carries flow extensions through Flight metadata.
///
/// This is the lower-level terminal-metrics API for Flow callers that need
/// to pass JSON-bearing flow extensions without going through hint metadata.
///
/// `request` is wrapped in `Request::Query` and passed to `do_get` together
/// with `hints` and `flow_extensions`.
pub async fn query_with_terminal_metrics_and_flow_extensions(
&self,
request: QueryRequest,
hints: &[(&str, &str)],
flow_extensions: &[(&str, &str)],
) -> Result<OutputWithMetrics> {
self.do_get(Request::Query(request), hints, flow_extensions)
.await
}
/// Creates a new table using the provided table expression.
@@ -349,7 +719,9 @@ impl Database {
let request = Request::Ddl(DdlRequest {
expr: Some(DdlExpr::CreateTable(expr)),
});
self.do_get(request, &[]).await
self.do_get(request, &[], &[])
.await
.map(OutputWithMetrics::into_output)
}
/// Alters an existing table using the provided alter expression.
@@ -357,17 +729,26 @@ impl Database {
let request = Request::Ddl(DdlRequest {
expr: Some(DdlExpr::AlterTable(expr)),
});
self.do_get(request, &[]).await
self.do_get(request, &[], &[])
.await
.map(OutputWithMetrics::into_output)
}
async fn do_get(&self, request: Request, hints: &[(&str, &str)]) -> Result<Output> {
async fn do_get(
&self,
request: Request,
hints: &[(&str, &str)],
flow_extensions: &[(&str, &str)],
) -> Result<OutputWithMetrics> {
let request = self.to_rpc_request(request);
let request = Ticket {
ticket: request.encode_to_vec().into(),
};
let mut request = tonic::Request::new(request);
Self::put_hints(request.metadata_mut(), hints)?;
let metadata = request.metadata_mut();
Self::put_hints(metadata, hints)?;
Self::put_flow_extensions(metadata, flow_extensions)?;
let mut client = self.client.make_flight_client(false, false)?;
@@ -389,7 +770,7 @@ impl Database {
let flight_data_stream = response.into_inner();
let mut decoder = FlightDecoder::default();
let mut flight_message_stream = flight_data_stream.map(move |flight_data| {
let flight_message_stream = flight_data_stream.map(move |flight_data| {
flight_data
.map_err(Error::from)
.and_then(|data| decoder.try_decode(&data).context(ConvertFlightDataSnafu))?
@@ -398,70 +779,7 @@ impl Database {
})
});
let Some(first_flight_message) = flight_message_stream.next().await else {
return IllegalFlightMessagesSnafu {
reason: "Expect the response not to be empty",
}
.fail();
};
let first_flight_message = first_flight_message?;
match first_flight_message {
FlightMessage::AffectedRows(rows) => {
ensure!(
flight_message_stream.next().await.is_none(),
IllegalFlightMessagesSnafu {
reason: "Expect 'AffectedRows' Flight messages to be the one and the only!"
}
);
Ok(Output::new_with_affected_rows(rows))
}
FlightMessage::RecordBatch(_) | FlightMessage::Metrics(_) => {
IllegalFlightMessagesSnafu {
reason: "The first flight message cannot be a RecordBatch or Metrics message",
}
.fail()
}
FlightMessage::Schema(schema) => {
let schema = Arc::new(
datatypes::schema::Schema::try_from(schema)
.context(error::ConvertSchemaSnafu)?,
);
let schema_cloned = schema.clone();
let stream = Box::pin(stream!({
while let Some(flight_message) = flight_message_stream.next().await {
let flight_message = flight_message
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
match flight_message {
FlightMessage::RecordBatch(arrow_batch) => {
yield Ok(RecordBatch::from_df_record_batch(
schema_cloned.clone(),
arrow_batch,
))
}
FlightMessage::Metrics(_) => {}
FlightMessage::AffectedRows(_) | FlightMessage::Schema(_) => {
yield IllegalFlightMessagesSnafu {reason: format!("A Schema message must be succeeded exclusively by a set of RecordBatch messages, flight_message: {:?}", flight_message)}
.fail()
.map_err(BoxedError::new)
.context(ExternalSnafu);
break;
}
}
}
}));
let record_batch_stream = RecordBatchStreamWrapper {
schema,
stream,
output_ordering: None,
metrics: Default::default(),
span: Span::current(),
};
Ok(Output::new_with_stream(Box::pin(record_batch_stream)))
}
}
output_from_flight_message_stream(flight_message_stream).await
}
/// Ingest a stream of [RecordBatch]es that belong to a table, using Arrow Flight's "`DoPut`"
@@ -512,16 +830,104 @@ struct FlightContext {
#[cfg(test)]
mod tests {
use std::assert_matches::assert_matches;
use std::sync::Arc;
use std::task::{Context, Poll};
use api::v1::auth_header::AuthScheme;
use api::v1::{AuthHeader, Basic};
use common_error::status_code::StatusCode;
use common_query::OutputData;
use common_recordbatch::{OrderOption, RecordBatch, RecordBatchStream};
use datatypes::prelude::{ConcreteDataType, VectorRef};
use datatypes::schema::{ColumnSchema, Schema};
use datatypes::vectors::Int32Vector;
use futures_util::StreamExt;
use tonic::{Code, Status};
use super::*;
use crate::error::TonicSnafu;
/// Test double for a record batch stream that yields at most one batch and
/// reports a fixed set of metrics.
struct MockMetricsStream {
/// Schema returned by `RecordBatchStream::schema`.
schema: datatypes::schema::SchemaRef,
/// The single batch to emit; `poll_next` takes it, so later polls yield `None`.
batch: Option<RecordBatch>,
/// Metrics value cloned out by `RecordBatchStream::metrics`.
metrics: RecordBatchMetrics,
/// When `true`, `metrics()` returns `None` for as long as `batch` has not been
/// taken yet, i.e. metrics only become visible after the batch was emitted.
terminal_metrics_only: bool,
}
impl Stream for MockMetricsStream {
    type Item = common_recordbatch::error::Result<RecordBatch>;

    /// Always completes immediately: hands out the stored batch on the first
    /// poll and `None` (end of stream) on every poll after that.
    fn poll_next(mut self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        // `take` leaves `None` behind, so the batch is emitted at most once.
        let pending = self.batch.take();
        Poll::Ready(pending.map(Ok))
    }
}
impl RecordBatchStream for MockMetricsStream {
    /// Fixed display name of this mock stream.
    fn name(&self) -> &str {
        "MockMetricsStream"
    }

    /// Returns a clone of the schema handle the mock was built with.
    fn schema(&self) -> datatypes::schema::SchemaRef {
        self.schema.clone()
    }

    /// The mock never declares an output ordering.
    fn output_ordering(&self) -> Option<&[OrderOption]> {
        None
    }

    /// Returns a clone of the configured metrics, except that in
    /// "terminal metrics only" mode nothing is reported while the batch is
    /// still waiting to be emitted.
    fn metrics(&self) -> Option<RecordBatchMetrics> {
        let withheld = self.terminal_metrics_only && self.batch.is_some();
        if withheld {
            None
        } else {
            Some(self.metrics.clone())
        }
    }
}
/// JSON-encoded terminal metrics using the default watermark sequence `42`.
fn terminal_metrics_json() -> String {
    let default_seq = 42;
    terminal_metrics_json_with_seq(default_seq)
}
/// Serializes a `RecordBatchMetrics` to JSON whose only region watermark entry
/// is region `7` with watermark `seq`; every other field keeps its default.
fn terminal_metrics_json_with_seq(seq: u64) -> String {
    let entry = common_recordbatch::adapter::RegionWatermarkEntry {
        region_id: 7,
        watermark: Some(seq),
    };
    let metrics = RecordBatchMetrics {
        region_watermarks: vec![entry],
        ..Default::default()
    };
    serde_json::to_string(&metrics).unwrap()
}
/// Writes two flow extensions (one of them a JSON object containing commas)
/// into gRPC metadata and checks that decoding the stored header value as JSON
/// gives back exactly the same key/value pairs in the same order.
#[test]
fn test_put_flow_extensions_preserves_comma_bearing_values() {
    let mut metadata = MetadataMap::new();
    Database::put_flow_extensions(
        &mut metadata,
        &[
            ("flow.return_region_seq", "true"),
            ("flow.incremental_after_seqs", r#"{"1":10,"2":20}"#),
        ],
    )
    .unwrap();

    let header = metadata.get(FLOW_EXTENSIONS_METADATA_KEY).unwrap();
    let raw = header.to_str().unwrap();
    let decoded: Vec<(String, String)> = serde_json::from_str(raw).unwrap();

    let expected: Vec<(String, String)> = [
        ("flow.return_region_seq", "true"),
        ("flow.incremental_after_seqs", r#"{"1":10,"2":20}"#),
    ]
    .into_iter()
    .map(|(key, value)| (key.to_string(), value.to_string()))
    .collect();
    assert_eq!(decoded, expected);
}
#[test]
fn test_flight_ctx() {
let mut ctx = FlightContext::default();
@@ -536,12 +942,12 @@ mod tests {
auth_scheme: Some(basic),
});
assert_matches!(
assert!(matches!(
ctx.auth_header,
Some(AuthHeader {
auth_scheme: Some(AuthScheme::Basic(_)),
})
)
));
}
#[test]
@@ -558,4 +964,198 @@ mod tests {
assert_eq!(expected.to_string(), actual.to_string());
}
/// Wraps a mock stream that only exposes metrics after its batch was emitted
/// and checks that the shared metrics handle is empty before the stream is
/// drained and carries region 7 / watermark 42 afterwards.
#[tokio::test]
async fn test_query_with_terminal_metrics_tracks_terminal_only_metrics() {
    let column = ColumnSchema::new("v", ConcreteDataType::int32_datatype(), false);
    let schema = Arc::new(Schema::new(vec![column]));
    let values = Arc::new(Int32Vector::from_slice([1, 2])) as VectorRef;
    let batch = RecordBatch::new(schema.clone(), vec![values]).unwrap();

    let watermark = common_recordbatch::adapter::RegionWatermarkEntry {
        region_id: 7,
        watermark: Some(42),
    };
    let source = MockMetricsStream {
        schema,
        batch: Some(batch),
        metrics: RecordBatchMetrics {
            region_watermarks: vec![watermark],
            ..Default::default()
        },
        terminal_metrics_only: true,
    };

    let result = OutputWithMetrics::from_output(Output::new_with_stream(Box::pin(source)));
    let terminal_metrics = result.metrics.clone();

    // Before the stream is consumed, no terminal metrics are available.
    assert!(!terminal_metrics.is_ready());
    assert!(terminal_metrics.get().is_none());

    let OutputData::Stream(mut stream) = result.output.data else {
        panic!("expected stream output");
    };
    // Drain the stream to its end, discarding the items.
    loop {
        if stream.next().await.is_none() {
            break;
        }
    }

    assert!(terminal_metrics.is_ready());
    let expected_regions = std::collections::BTreeSet::from([7_u64]);
    assert_eq!(
        terminal_metrics.participating_regions(),
        Some(expected_regions)
    );
    let expected_watermarks = std::collections::HashMap::from([(7_u64, 42_u64)]);
    assert_eq!(
        terminal_metrics.region_watermark_map(),
        Some(expected_watermarks)
    );
}
/// `parse_terminal_metrics` must return an error for a payload that is not JSON.
#[test]
fn test_parse_terminal_metrics_rejects_invalid_json() {
    let parsed = parse_terminal_metrics("{not-json}");
    assert!(parsed.is_err());
}
/// A lone `AffectedRows` message carrying inline metrics yields an
/// affected-rows output whose metrics are ready and expose the watermark.
#[tokio::test]
async fn test_affected_rows_inline_metrics_are_parsed() {
    let messages: Vec<Result<FlightMessage>> = vec![Ok(FlightMessage::AffectedRows {
        rows: 3,
        metrics: Some(terminal_metrics_json()),
    })];

    let output = output_from_flight_message_stream(futures_util::stream::iter(messages))
        .await
        .unwrap();

    assert!(matches!(output.output.data, OutputData::AffectedRows(3)));
    assert!(output.metrics.is_ready());
    let expected_watermarks = std::collections::HashMap::from([(7, 42)]);
    assert_eq!(
        output.metrics.region_watermark_map(),
        Some(expected_watermarks)
    );
}
/// An `AffectedRows` message with inline metrics followed by a separate
/// `Metrics` message must be rejected with an "already carries Metrics" error.
#[tokio::test]
async fn test_affected_rows_inline_metrics_rejects_trailing_metrics() {
    let metrics_json = terminal_metrics_json();
    let messages: Vec<Result<FlightMessage>> = vec![
        Ok(FlightMessage::AffectedRows {
            rows: 3,
            metrics: Some(metrics_json.clone()),
        }),
        Ok(FlightMessage::Metrics(metrics_json)),
    ];

    let outcome = output_from_flight_message_stream(futures_util::stream::iter(messages)).await;
    let err = outcome.unwrap_err();

    let rendered = err.to_string();
    assert!(
        rendered.contains("already carries Metrics"),
        "unexpected error: {err:?}"
    );
}
/// Feeds Schema -> RecordBatch -> malformed Metrics and checks the resulting
/// stream yields the batch, then one error, then ends; the metrics handle is
/// marked ready but holds no value.
#[tokio::test]
async fn test_invalid_terminal_metrics_after_record_batch_yields_batch_then_error() {
    let column = ColumnSchema::new("v", ConcreteDataType::int32_datatype(), false);
    let schema = Arc::new(Schema::new(vec![column]));
    let values = Arc::new(Int32Vector::from_slice([1])) as VectorRef;
    let batch = RecordBatch::new(schema.clone(), vec![values]).unwrap();

    let messages: Vec<Result<FlightMessage>> = vec![
        Ok(FlightMessage::Schema(schema.arrow_schema().clone())),
        Ok(FlightMessage::RecordBatch(batch.into_df_record_batch())),
        Ok(FlightMessage::Metrics("{not-json}".to_string())),
    ];
    let output = output_from_flight_message_stream(futures_util::stream::iter(messages))
        .await
        .unwrap();

    let terminal_metrics = output.metrics.clone();
    let OutputData::Stream(mut record_batch_stream) = output.output.data else {
        panic!("expected stream output");
    };

    // First item: the record batch that preceded the bad metrics message.
    let received = record_batch_stream.next().await.unwrap().unwrap();
    assert_eq!(received.num_rows(), 1);

    // Second item: the error raised for the unparsable metrics payload.
    let err = record_batch_stream.next().await.unwrap().unwrap_err();
    assert_eq!("External error", err.to_string());
    let debug_repr = format!("{err:?}");
    assert!(
        debug_repr.contains("Invalid terminal metrics message"),
        "unexpected error: {err:?}"
    );

    // The stream ends after the error.
    assert!(record_batch_stream.next().await.is_none());
    assert!(terminal_metrics.is_ready());
    assert!(terminal_metrics.get().is_none());
}
/// Interleaves two record batches with two Metrics messages and checks that
/// both batches are delivered and the watermark from the last Metrics message
/// (sequence 2) is the one reported once the stream ends.
#[tokio::test]
async fn test_record_batch_stream_continues_after_partial_metrics() {
    let column = ColumnSchema::new("v", ConcreteDataType::int32_datatype(), false);
    let schema = Arc::new(Schema::new(vec![column]));
    // Builds a one-row batch holding `value` in column "v".
    let single_row_batch = |value: i32| {
        let values = Arc::new(Int32Vector::from_slice([value])) as VectorRef;
        RecordBatch::new(schema.clone(), vec![values]).unwrap()
    };
    let first_batch = single_row_batch(1);
    let second_batch = single_row_batch(2);

    let messages: Vec<Result<FlightMessage>> = vec![
        Ok(FlightMessage::Schema(schema.arrow_schema().clone())),
        Ok(FlightMessage::RecordBatch(
            first_batch.into_df_record_batch(),
        )),
        Ok(FlightMessage::Metrics(terminal_metrics_json_with_seq(1))),
        Ok(FlightMessage::RecordBatch(
            second_batch.into_df_record_batch(),
        )),
        Ok(FlightMessage::Metrics(terminal_metrics_json_with_seq(2))),
    ];
    let output = output_from_flight_message_stream(futures_util::stream::iter(messages))
        .await
        .unwrap();

    let terminal_metrics = output.metrics.clone();
    let OutputData::Stream(mut record_batch_stream) = output.output.data else {
        panic!("expected stream output");
    };

    let received_first = record_batch_stream.next().await.unwrap().unwrap();
    assert_eq!(received_first.num_rows(), 1);
    let received_second = record_batch_stream.next().await.unwrap().unwrap();
    assert_eq!(received_second.num_rows(), 1);
    assert!(record_batch_stream.next().await.is_none());

    assert!(terminal_metrics.is_ready());
    let expected_watermarks = std::collections::HashMap::from([(7, 2)]);
    assert_eq!(
        terminal_metrics.region_watermark_map(),
        Some(expected_watermarks)
    );
}
/// After updating with default (empty) metrics, the accessors must return
/// `Some` of an empty collection rather than `None`.
#[test]
fn test_output_metrics_distinguishes_empty_region_watermarks_from_absence() {
    let metrics = OutputMetrics::default();
    metrics.update(Some(RecordBatchMetrics::default()));

    let no_regions = std::collections::BTreeSet::new();
    assert_eq!(metrics.participating_regions(), Some(no_regions));

    let no_watermarks = std::collections::HashMap::new();
    assert_eq!(metrics.region_watermark_map(), Some(no_watermarks));
}
}

View File

@@ -173,20 +173,31 @@ impl ErrorExt for Error {
define_from_tonic_status!(Error, Tonic);
impl Error {
pub fn should_retry(&self) -> bool {
// TODO(weny): figure out each case of these codes.
matches!(
self,
Self::RegionServer {
code: Code::Cancelled,
..
} | Self::RegionServer {
code: Code::DeadlineExceeded,
..
} | Self::RegionServer {
code: Code::Unavailable,
..
/// Returns the gRPC status code if this error is caused by a gRPC request failure.
pub fn tonic_code(&self) -> Option<Code> {
match self {
Self::FlightGet { tonic_code, .. }
| Self::RegionServer {
code: tonic_code, ..
}
)
| Self::FlowServer {
code: tonic_code, ..
}
| Self::Tonic { tonic_code, .. } => Some(*tonic_code),
_ => None,
}
}
/// Returns true if the error is a connection error that may be resolved by retrying the request.
pub fn is_connection_error(&self) -> bool {
matches!(self.tonic_code(), Some(Code::Unavailable))
}
pub fn should_retry(&self) -> bool {
self.is_connection_error()
|| matches!(
self.tonic_code(),
Some(Code::Cancelled) | Some(Code::DeadlineExceeded)
)
}
}

View File

@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#![feature(assert_matches)]
mod client;
pub mod client_manager;
pub mod database;
@@ -34,7 +32,7 @@ pub use common_recordbatch::{RecordBatches, SendableRecordBatchStream};
use snafu::OptionExt;
pub use self::client::Client;
pub use self::database::Database;
pub use self::database::{Database, OutputMetrics, OutputWithMetrics};
pub use self::error::{Error, Result};
use crate::error::{IllegalDatabaseResponseSnafu, ServerSnafu};

View File

@@ -30,10 +30,6 @@ base64.workspace = true
cache.workspace = true
catalog.workspace = true
chrono.workspace = true
datafusion-physical-plan.workspace = true
datafusion.workspace = true
datafusion-common.workspace = true
either = "1.15"
clap.workspace = true
cli.workspace = true
client.workspace = true
@@ -51,14 +47,19 @@ common-procedure.workspace = true
common-query.workspace = true
common-recordbatch.workspace = true
common-runtime.workspace = true
common-stat.workspace = true
common-telemetry = { workspace = true, features = [
"deadlock_detection",
] }
common-time.workspace = true
common-version.workspace = true
common-wal.workspace = true
datafusion.workspace = true
datafusion-common.workspace = true
datafusion-physical-plan.workspace = true
datanode.workspace = true
datatypes.workspace = true
either = "1.15"
etcd-client.workspace = true
flow.workspace = true
frontend = { workspace = true, default-features = false }
@@ -71,7 +72,7 @@ meta-client.workspace = true
meta-srv.workspace = true
metric-engine.workspace = true
mito2.workspace = true
moka.workspace = true
moka = { workspace = true, features = ["future"] }
object-store.workspace = true
parquet = { workspace = true, features = ["object_store"] }
plugins.workspace = true
@@ -81,21 +82,19 @@ query.workspace = true
rand.workspace = true
regex.workspace = true
reqwest.workspace = true
standalone.workspace = true
serde.workspace = true
serde_json.workspace = true
servers.workspace = true
session.workspace = true
similar-asserts.workspace = true
snafu.workspace = true
common-stat.workspace = true
sqlparser.workspace = true
standalone.workspace = true
store-api.workspace = true
table.workspace = true
tokio.workspace = true
toml.workspace = true
tonic.workspace = true
tracing-appender.workspace = true
sqlparser.workspace = true
[target.'cfg(unix)'.dependencies]
pprof = { version = "0.14", features = [
@@ -110,14 +109,9 @@ api.workspace = true
client = { workspace = true, features = ["testing"] }
common-test-util.workspace = true
common-version.workspace = true
serde.workspace = true
temp-env = "0.3"
tempfile.workspace = true
file-engine.workspace = true
mito2.workspace = true
[target.'cfg(not(windows))'.dev-dependencies]
rexpect = "0.5"
[package.metadata.cargo-udeps.ignore]
development = ["rexpect"]
serde.workspace = true
similar-asserts.workspace = true
temp-env = "0.3"
tempfile.workspace = true

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#![recursion_limit = "256"]
#![doc = include_str!("../../../../README.md")]
use clap::{Parser, Subcommand};
@@ -20,11 +21,11 @@ use cmd::error::{InitTlsProviderSnafu, Result};
use cmd::options::GlobalOptions;
use cmd::{App, cli, datanode, flownode, frontend, metasrv, standalone};
use common_base::Plugins;
use common_version::{verbose_version, version};
use servers::install_ring_crypto_provider;
use common_version::{product_name, verbose_version, version};
use servers::install_default_crypto_provider;
#[derive(Parser)]
#[command(name = "greptime", author, version, long_version = verbose_version(), about)]
#[command(name = product_name(), author, version, long_version = verbose_version(), about)]
#[command(propagate_version = true)]
pub(crate) struct Command {
#[clap(subcommand)]
@@ -52,11 +53,11 @@ enum SubCommand {
#[clap(name = "metasrv")]
Metasrv(metasrv::Command),
/// Run greptimedb as a standalone service.
/// Start service in standalone mode.
#[clap(name = "standalone")]
Standalone(standalone::Command),
/// Execute the cli tools for greptimedb.
/// Execute the cli tools.
#[clap(name = "cli")]
Cli(cli::Command),
}
@@ -97,7 +98,7 @@ async fn main() -> Result<()> {
async fn main_body() -> Result<()> {
setup_human_panic();
install_ring_crypto_provider().map_err(|msg| InitTlsProviderSnafu { msg }.build())?;
install_default_crypto_provider().map_err(|msg| InitTlsProviderSnafu { msg }.build())?;
start(Command::parse()).await
}
@@ -148,7 +149,7 @@ async fn start(cli: Command) -> Result<()> {
fn setup_human_panic() {
human_panic::setup_panic!(
human_panic::Metadata::new("GreptimeDB", version())
human_panic::Metadata::new(product_name(), version())
.homepage("https://github.com/GreptimeTeam/greptimedb/discussions")
);

View File

@@ -102,31 +102,79 @@ impl Command {
#[cfg(test)]
mod tests {
use std::net::TcpListener;
use std::ops::RangeInclusive;
use clap::Parser;
use client::{Client, Database};
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_telemetry::logging::LoggingOptions;
use rand::Rng;
use crate::error::Result as CmdResult;
use crate::options::GlobalOptions;
use crate::{App, cli, standalone};
/// Picks a random port offset and returns four `127.0.0.1` addresses on ports
/// `4000..=4003` shifted by that offset. The caller in this module binds them,
/// in order, to the HTTP, gRPC, MySQL and Postgres listeners.
fn random_standalone_addrs() -> (String, String, String, String) {
    let offset = choose_random_unused_port_offset(14000..=24000, 10);
    let addr = |base: u16| format!("127.0.0.1:{}", base + offset);
    (addr(4000), addr(4001), addr(4002), addr(4003))
}
/// Finds an offset such that ports `4000 + offset ..= 4003 + offset` on
/// `127.0.0.1` can all be bound right now.
///
/// Each attempt draws a random port from `port_range`, treats it as the first
/// of the four ports, and tries to bind all four. The probe listeners are
/// dropped again before the offset is used, so the result is only a
/// best-effort guess.
///
/// # Panics
///
/// Panics if no free set of ports is found within `max_attempts` attempts.
fn choose_random_unused_port_offset(
    port_range: RangeInclusive<u16>,
    max_attempts: usize,
) -> u16 {
    let mut rng = rand::rng();
    for _ in 0..max_attempts {
        let first_port = rng.random_range(port_range.clone());
        let offset = first_port - 4000;
        let candidates = [4000_u16, 4001, 4002, 4003].map(|base| base + offset);

        // Keep every successful listener alive until the whole set is checked,
        // and stop at the first port that cannot be bound.
        let mut held = Vec::new();
        let mut all_free = true;
        for port in candidates {
            match TcpListener::bind(("127.0.0.1", port)) {
                Ok(listener) => held.push(listener),
                Err(_) => {
                    all_free = false;
                    break;
                }
            }
        }
        if all_free {
            return offset;
        }
    }
    panic!("failed to find unused standalone test ports");
}
#[tokio::test(flavor = "multi_thread")]
async fn test_export_create_table_with_quoted_names() -> CmdResult<()> {
let output_dir = tempfile::tempdir().unwrap();
let (http_addr, rpc_addr, mysql_addr, postgres_addr) = random_standalone_addrs();
let standalone = standalone::Command::parse_from([
"standalone",
"start",
"--data-home",
&*output_dir.path().to_string_lossy(),
"--http-addr",
&http_addr,
"--grpc-bind-addr",
&rpc_addr,
"--mysql-addr",
&mysql_addr,
"--postgres-addr",
&postgres_addr,
]);
let standalone_opts = standalone.load_options(&GlobalOptions::default()).unwrap();
let mut instance = standalone.build(standalone_opts).await?;
instance.start().await?;
let client = Client::with_urls(["127.0.0.1:4001"]);
let client = Client::with_urls([rpc_addr.as_str()]);
let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
database
.sql(r#"CREATE DATABASE "cli.export.create_table";"#)
@@ -149,7 +197,7 @@ mod tests {
"data",
"export",
"--addr",
"127.0.0.1:4000",
&http_addr,
"--output-dir",
&*output_dir.path().to_string_lossy(),
"--target",

View File

@@ -197,13 +197,17 @@ pub struct StartCommand {
#[clap(long)]
node_id: Option<u64>,
/// The address to bind the gRPC server.
#[clap(long, alias = "rpc-addr")]
rpc_bind_addr: Option<String>,
#[clap(long = "grpc-bind-addr", alias = "rpc-bind-addr", alias = "rpc-addr")]
grpc_bind_addr: Option<String>,
/// The address advertised to the metasrv, and used for connections from outside the host.
/// If left empty or unset, the server will automatically use the IP address of the first network interface
/// on the host, with the same port number as the one specified in `rpc_bind_addr`.
#[clap(long, alias = "rpc-hostname")]
rpc_server_addr: Option<String>,
/// on the host, with the same port number as the one specified in `grpc_bind_addr`.
#[clap(
long = "grpc-server-addr",
alias = "rpc-server-addr",
alias = "rpc-hostname"
)]
grpc_server_addr: Option<String>,
#[clap(long, value_delimiter = ',', num_args = 1..)]
metasrv_addrs: Option<Vec<String>>,
#[clap(short, long)]
@@ -256,20 +260,20 @@ impl StartCommand {
tokio_console_addr: global_options.tokio_console_addr.clone(),
};
if let Some(addr) = &self.rpc_bind_addr {
if let Some(addr) = &self.grpc_bind_addr {
opts.grpc.bind_addr.clone_from(addr);
} else if let Some(addr) = &opts.rpc_addr {
warn!(
"Use the deprecated attribute `DatanodeOptions.rpc_addr`, please use `grpc.addr` instead."
"Use the deprecated attribute `DatanodeOptions.rpc_addr`, please use `grpc.bind_addr` instead."
);
opts.grpc.bind_addr.clone_from(addr);
}
if let Some(server_addr) = &self.rpc_server_addr {
if let Some(server_addr) = &self.grpc_server_addr {
opts.grpc.server_addr.clone_from(server_addr);
} else if let Some(server_addr) = &opts.rpc_hostname {
warn!(
"Use the deprecated attribute `DatanodeOptions.rpc_hostname`, please use `grpc.hostname` instead."
"Use the deprecated attribute `DatanodeOptions.rpc_hostname`, please use `grpc.server_addr` instead."
);
opts.grpc.server_addr.clone_from(server_addr);
}
@@ -356,10 +360,11 @@ impl StartCommand {
#[cfg(test)]
mod tests {
use std::assert_matches::assert_matches;
use std::assert_matches;
use std::io::Write;
use std::time::Duration;
use clap::{CommandFactory, Parser};
use common_config::ENV_VAR_SEP;
use common_test_util::temp_dir::create_named_temp_file;
use object_store::config::{FileConfig, GcsConfig, ObjectStoreConfig, S3Config};
@@ -402,8 +407,8 @@ mod tests {
node_id = 42
[grpc]
addr = "127.0.0.1:3001"
hostname = "127.0.0.1"
bind_addr = "127.0.0.1:3001"
server_addr = "127.0.0.1"
runtime_size = 8
[meta_client]
@@ -449,6 +454,7 @@ mod tests {
let options = cmd.load_options(&Default::default()).unwrap().component;
assert_eq!("127.0.0.1:3001".to_string(), options.grpc.bind_addr);
assert_eq!("127.0.0.1".to_string(), options.grpc.server_addr);
assert_eq!(Some(42), options.node_id);
let DatanodeWalConfig::RaftEngine(raft_engine_config) = options.wal else {
@@ -661,4 +667,55 @@ mod tests {
},
);
}
/// Checks that the primary `--grpc-*` flags and both generations of legacy
/// `--rpc-*` spellings all populate `grpc_bind_addr` / `grpc_server_addr`.
#[test]
fn test_parse_grpc_cli_aliases() {
    // (bind flag, bind value, server flag, server value)
    let cases = [
        (
            "--grpc-bind-addr",
            "127.0.0.1:13001",
            "--grpc-server-addr",
            "10.0.0.1:13001",
        ),
        (
            "--rpc-bind-addr",
            "127.0.0.1:23001",
            "--rpc-server-addr",
            "10.0.0.2:23001",
        ),
        (
            "--rpc-addr",
            "127.0.0.1:33001",
            "--rpc-hostname",
            "10.0.0.3:33001",
        ),
    ];

    for (bind_flag, bind_addr, server_flag, server_addr) in cases {
        let command = StartCommand::try_parse_from([
            "datanode",
            bind_flag,
            bind_addr,
            server_flag,
            server_addr,
        ])
        .unwrap();
        assert_eq!(command.grpc_bind_addr.as_deref(), Some(bind_addr));
        assert_eq!(command.grpc_server_addr.as_deref(), Some(server_addr));
    }
}
/// Renders the long help of `StartCommand` and asserts it mentions the
/// `--grpc-*` option names and none of the legacy `--rpc-*` spellings.
#[test]
fn test_help_uses_grpc_option_names() {
    let mut rendered = Vec::new();
    StartCommand::command()
        .write_long_help(&mut rendered)
        .unwrap();
    let help = String::from_utf8(rendered).unwrap();

    for shown in ["--grpc-bind-addr", "--grpc-server-addr"] {
        assert!(help.contains(shown));
    }
    for hidden in [
        "--rpc-bind-addr",
        "--rpc-server-addr",
        "--rpc-addr",
        "--rpc-hostname",
    ] {
        assert!(!help.contains(hidden));
    }
}
}

View File

@@ -15,7 +15,7 @@
use std::sync::Arc;
use cache::build_datanode_cache_registry;
use catalog::kvbackend::MetaKvBackend;
use catalog::kvbackend::new_read_only_meta_kv_backend;
use common_base::Plugins;
use common_meta::cache::LayeredCacheRegistryBuilder;
use common_telemetry::info;
@@ -99,9 +99,7 @@ impl InstanceBuilder {
.await
.context(MetaClientInitSnafu)?;
let backend = Arc::new(MetaKvBackend {
client: client.clone(),
});
let backend = new_read_only_meta_kv_backend(client.clone());
let mut builder = DatanodeBuilder::new(dn_opts.clone(), plugins.clone(), backend.clone());
let registry = Arc::new(

Some files were not shown because too many files have changed in this diff Show More