mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-06-04 22:30:37 +00:00
Merge commit 'd304df6e75' into feat/flush-hook-extension-point
This commit is contained in:
2
.github/workflows/dev-build.yml
vendored
2
.github/workflows/dev-build.yml
vendored
@@ -30,7 +30,7 @@ on:
|
||||
linux_arm64_runner:
|
||||
type: choice
|
||||
description: The runner uses to build linux-arm64 artifacts
|
||||
default: ec2-c6g.4xlarge-arm64
|
||||
default: ec2-c6g.8xlarge-arm64
|
||||
options:
|
||||
- ec2-c6g.xlarge-arm64 # 4C8G
|
||||
- ec2-c6g.2xlarge-arm64 # 8C16G
|
||||
|
||||
2
.github/workflows/nightly-build.yml
vendored
2
.github/workflows/nightly-build.yml
vendored
@@ -27,7 +27,7 @@ on:
|
||||
linux_arm64_runner:
|
||||
type: choice
|
||||
description: The runner uses to build linux-arm64 artifacts
|
||||
default: ec2-c6g.4xlarge-arm64
|
||||
default: ec2-c6g.8xlarge-arm64
|
||||
options:
|
||||
- ec2-c6g.xlarge-arm64 # 4C8G
|
||||
- ec2-c6g.2xlarge-arm64 # 8C16G
|
||||
|
||||
157
.github/workflows/nightly-jsonbench.yaml
vendored
157
.github/workflows/nightly-jsonbench.yaml
vendored
@@ -1,19 +1,81 @@
|
||||
name: Nightly JSONBench
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# Trigger at 00:00(Asia/Shanghai) on every weekday.
|
||||
- cron: "0 16 * * 0-4"
|
||||
workflow_run:
|
||||
workflows: [ "GreptimeDB Nightly Build" ]
|
||||
types: [ completed ]
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
run_id:
|
||||
description: The nightly build workflow run id to download GreptimeDB artifacts from
|
||||
required: true
|
||||
type: string
|
||||
|
||||
permissions:
|
||||
actions: read
|
||||
contents: read
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
resolve-artifact:
|
||||
name: Resolve GreptimeDB nightly artifact
|
||||
if: ${{ github.repository == 'GreptimeTeam/greptimedb' && (github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success') }}
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
artifact-name: ${{ steps.find-artifact.outputs.artifact-name }}
|
||||
run-id: ${{ steps.resolve-run-id.outputs.run-id }}
|
||||
steps:
|
||||
- name: Resolve nightly build run id
|
||||
id: resolve-run-id
|
||||
shell: bash
|
||||
env:
|
||||
EVENT_NAME: ${{ github.event_name }}
|
||||
WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
|
||||
INPUT_RUN_ID: ${{ inputs.run_id }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
if [[ "${EVENT_NAME}" == "workflow_dispatch" ]]; then
|
||||
run_id="${INPUT_RUN_ID}"
|
||||
else
|
||||
run_id="${WORKFLOW_RUN_ID}"
|
||||
fi
|
||||
|
||||
if [[ ! "${run_id}" =~ ^[0-9]+$ ]]; then
|
||||
echo "Invalid workflow run id: ${run_id}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "run-id=${run_id}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Find GreptimeDB nightly artifact
|
||||
id: find-artifact
|
||||
shell: bash
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
RUN_ID: ${{ steps.resolve-run-id.outputs.run-id }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
artifact_name=$(gh api "repos/${GITHUB_REPOSITORY}/actions/runs/${RUN_ID}/artifacts" --paginate \
|
||||
--jq '.artifacts[] | select(.name | test("^greptime-linux-arm64-nightly-[0-9]{8}-[0-9a-f]+$")) | .name' \
|
||||
| head -n 1)
|
||||
|
||||
if [[ -z "${artifact_name}" ]]; then
|
||||
echo "Cannot find linux arm64 nightly artifact in workflow run ${RUN_ID}."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Download GreptimeDB artifact: ${artifact_name}"
|
||||
echo "artifact-name=${artifact_name}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
allocate-runner:
|
||||
name: Allocate runner
|
||||
if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
|
||||
if: ${{ github.repository == 'GreptimeTeam/greptimedb' && (github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success') }}
|
||||
needs: [ resolve-artifact ]
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
linux-arm64-runner: ${{ steps.start-linux-arm64-runner.outputs.label }}
|
||||
@@ -43,55 +105,50 @@ jobs:
|
||||
|
||||
jsonbench:
|
||||
name: Run JSONBench
|
||||
if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
|
||||
needs: [ allocate-runner ]
|
||||
if: ${{ github.repository == 'GreptimeTeam/greptimedb' && (github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success') }}
|
||||
needs: [ resolve-artifact, allocate-runner ]
|
||||
runs-on: ${{ needs.allocate-runner.outputs.linux-arm64-runner }}
|
||||
timeout-minutes: 120
|
||||
env:
|
||||
JSONBENCH_DATA_DIR: /home/runner/data/bluesky
|
||||
JSONBENCH_OUTPUT_PREFIX: _ubuntu-latest
|
||||
JSONBENCH_OUTPUT_PREFIX: _linux-arm64
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Download GreptimeDB nightly artifact
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: false
|
||||
name: ${{ needs.resolve-artifact.outputs.artifact-name }}
|
||||
path: greptimedb-artifact
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
run-id: ${{ needs.resolve-artifact.outputs.run-id }}
|
||||
|
||||
- uses: arduino/setup-protoc@v3
|
||||
with:
|
||||
repo-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- uses: actions-rust-lang/setup-rust-toolchain@v1
|
||||
|
||||
- name: Rust Cache
|
||||
uses: Swatinem/rust-cache@v2
|
||||
with:
|
||||
shared-key: "nightly-jsonbench"
|
||||
cache-all-crates: "true"
|
||||
save-if: ${{ github.ref == 'refs/heads/main' }}
|
||||
|
||||
- name: Build GreptimeDB
|
||||
run: cargo build --profile nightly --bin greptime
|
||||
|
||||
- name: Reclaim disk space
|
||||
- name: Prepare GreptimeDB binary
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
mkdir -p "${RUNNER_TEMP}/greptimedb-bin"
|
||||
cp ./target/nightly/greptime "${RUNNER_TEMP}/greptimedb-bin/greptime"
|
||||
chmod +x "${RUNNER_TEMP}/greptimedb-bin/greptime"
|
||||
|
||||
rm -rf ./target
|
||||
tar -xzf "greptimedb-artifact/${{ needs.resolve-artifact.outputs.artifact-name }}.tar.gz"
|
||||
cp "${{ needs.resolve-artifact.outputs.artifact-name }}/greptime" ./greptime
|
||||
chmod +x ./greptime
|
||||
rm -rf greptimedb-artifact "${{ needs.resolve-artifact.outputs.artifact-name }}"
|
||||
|
||||
- name: Run JSONBench
|
||||
env:
|
||||
# TODO(LFC): Change to "3" (100m) when JSON2 ingestion performance is optimized.
|
||||
JSONBENCH_DATASET: 2
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
cd "${RUNNER_TEMP}"
|
||||
cp "${RUNNER_TEMP}/greptimedb-bin/greptime" ./greptime
|
||||
chmod +x ./greptime
|
||||
export JSONBENCH_DATA_DIR="/root/data/bluesky"
|
||||
echo "Use JSONBench data directory ${JSONBENCH_DATA_DIR}"
|
||||
|
||||
echo "Cloning JSONBench"
|
||||
git clone --branch greptimedb-new-json --depth 1 https://github.com/GreptimeTeam/JSONBench.git JSONBench
|
||||
|
||||
echo "Downloading JSONBench dataset choice ${JSONBENCH_DATASET} to ${JSONBENCH_DATA_DIR}"
|
||||
mkdir -p "${JSONBENCH_DATA_DIR}"
|
||||
printf "${JSONBENCH_DATASET}\n" | ./JSONBench/download_data.sh
|
||||
downloaded_files=$(find "${JSONBENCH_DATA_DIR}" -type f | wc -l)
|
||||
echo "Downloaded JSONBench dataset files: ${downloaded_files}"
|
||||
|
||||
export GREPTIMEDB_STANDALONE__WAL__DIR=greptimedb_data/wal
|
||||
export GREPTIMEDB_STANDALONE__STORAGE__DATA_HOME=greptimedb_data
|
||||
@@ -100,10 +157,12 @@ jobs:
|
||||
export GREPTIMEDB_STANDALONE__HTTP__BODY_LIMIT=1GB
|
||||
export GREPTIMEDB_STANDALONE__HTTP__TIMEOUT=500s
|
||||
|
||||
echo "Starting GreptimeDB standalone"
|
||||
./greptime standalone start > greptimedb.log 2>&1 &
|
||||
greptime_pid=$!
|
||||
trap 'kill "${greptime_pid}" 2>/dev/null || true' EXIT
|
||||
|
||||
echo "Waiting for GreptimeDB health check"
|
||||
until curl -s --fail -o /dev/null http://localhost:4000/health; do
|
||||
if ! kill -0 "${greptime_pid}" 2>/dev/null; then
|
||||
cat greptimedb.log
|
||||
@@ -111,12 +170,14 @@ jobs:
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
echo "GreptimeDB is ready"
|
||||
|
||||
git clone --branch greptimedb-new-json --depth 1 https://github.com/GreptimeTeam/JSONBench.git JSONBench
|
||||
cp ./greptime JSONBench/greptimedb/greptime
|
||||
|
||||
cd JSONBench/greptimedb
|
||||
./main.sh 3 "${JSONBENCH_DATA_DIR}" success.log error.log "${JSONBENCH_OUTPUT_PREFIX}" false
|
||||
echo "Running JSONBench main.sh with dataset choice ${JSONBENCH_DATASET} and install=false"
|
||||
./main.sh ${JSONBENCH_DATASET} "${JSONBENCH_DATA_DIR}" success.log error.log "${JSONBENCH_OUTPUT_PREFIX}" false
|
||||
echo "JSONBench finished"
|
||||
|
||||
- name: Upload JSONBench results
|
||||
if: always()
|
||||
@@ -124,21 +185,21 @@ jobs:
|
||||
with:
|
||||
name: jsonbench-results
|
||||
path: |
|
||||
${{ runner.temp }}/greptimedb.log
|
||||
${{ runner.temp }}/JSONBench/greptimedb/*.log
|
||||
${{ runner.temp }}/JSONBench/greptimedb/*.total_size
|
||||
${{ runner.temp }}/JSONBench/greptimedb/*.data_size
|
||||
${{ runner.temp }}/JSONBench/greptimedb/*.index_size
|
||||
${{ runner.temp }}/JSONBench/greptimedb/*.count
|
||||
${{ runner.temp }}/JSONBench/greptimedb/*.results_runtime
|
||||
${{ runner.temp }}/JSONBench/greptimedb/*.query_results
|
||||
./greptimedb.log
|
||||
./JSONBench/greptimedb/*.log
|
||||
./JSONBench/greptimedb/*.total_size
|
||||
./JSONBench/greptimedb/*.data_size
|
||||
./JSONBench/greptimedb/*.index_size
|
||||
./JSONBench/greptimedb/*.count
|
||||
./JSONBench/greptimedb/*.results_runtime
|
||||
./JSONBench/greptimedb/*.query_results
|
||||
if-no-files-found: ignore
|
||||
retention-days: 7
|
||||
|
||||
stop-linux-arm64-runner:
|
||||
name: Stop Linux ARM64 runner
|
||||
# It's always run as the last job in the workflow to make sure that the runner is released.
|
||||
if: ${{ always() }}
|
||||
if: ${{ always() && needs.allocate-runner.outputs.linux-arm64-ec2-runner-instance-id != '' }}
|
||||
runs-on: ubuntu-latest
|
||||
needs: [
|
||||
allocate-runner,
|
||||
|
||||
683
Cargo.lock
generated
683
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -259,7 +259,7 @@ tracing-opentelemetry = "0.31.0"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "fmt"] }
|
||||
typetag = "0.2"
|
||||
uuid = { version = "1.17", features = ["serde", "v4", "v7", "fast-rng"] }
|
||||
vrl = "0.25"
|
||||
vrl = "0.33"
|
||||
zstd = "0.13"
|
||||
# DO_NOT_REMOVE_THIS: END_OF_EXTERNAL_DEPENDENCIES
|
||||
|
||||
|
||||
@@ -451,6 +451,7 @@
|
||||
| `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.<br/>By default, it provides services after all regions have been initialized. |
|
||||
| `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. |
|
||||
| `max_concurrent_queries` | Integer | `0` | The maximum concurrent queries allowed to be executed. Zero means unlimited. |
|
||||
| `concurrent_query_limiter_timeout` | String | `100ms` | Timeout to acquire a permit from the concurrent query limiter when `max_concurrent_queries` is reached. |
|
||||
| `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. Enabled by default. |
|
||||
| `http` | -- | -- | The HTTP server options. |
|
||||
| `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
|
||||
|
||||
@@ -20,6 +20,9 @@ init_regions_parallelism = 16
|
||||
## The maximum concurrent queries allowed to be executed. Zero means unlimited.
|
||||
max_concurrent_queries = 0
|
||||
|
||||
## Timeout to acquire a permit from the concurrent query limiter when `max_concurrent_queries` is reached.
|
||||
concurrent_query_limiter_timeout = "100ms"
|
||||
|
||||
## Enable telemetry to collect anonymous usage data. Enabled by default.
|
||||
#+ enable_telemetry = true
|
||||
|
||||
|
||||
@@ -1077,7 +1077,9 @@ async fn verify_snapshot(storage: &OpenDalStorage) -> Result<VerifyReport> {
|
||||
));
|
||||
}
|
||||
let data_files = storage.list_files_recursive("data/").await?;
|
||||
if let Some(path) = data_files.first() {
|
||||
// Report the lexicographically smallest path so the message is stable
|
||||
// regardless of listing order across backends.
|
||||
if let Some(path) = data_files.iter().min() {
|
||||
report.push_error(format!(
|
||||
"Schema-only snapshot should not contain data files (found '{}')",
|
||||
path
|
||||
@@ -1103,75 +1105,113 @@ fn summarize_chunks(manifest: &Manifest) -> VerifyChunkSummary {
|
||||
}
|
||||
}
|
||||
|
||||
/// A data file declared by a completed chunk that is expected to exist in storage.
|
||||
#[derive(Debug)]
|
||||
struct ChunkFile {
|
||||
chunk_id: u32,
|
||||
path: String,
|
||||
}
|
||||
|
||||
/// Expected snapshot contents derived purely from the manifest (no object-store IO).
|
||||
///
|
||||
/// Separating planning from scanning makes it obvious which problems come from
|
||||
/// the manifest alone and which require comparing against actual storage.
|
||||
#[derive(Debug, Default)]
|
||||
struct VerifyPlan {
|
||||
/// Valid data files declared by completed chunks; each must exist in storage.
|
||||
files_to_check: Vec<ChunkFile>,
|
||||
/// All syntactically-safe data paths declared by any chunk, regardless of
|
||||
/// status. Used as the orphan-detection baseline so a listed-but-invalid
|
||||
/// file is not also reported as unexpected.
|
||||
claimed_data_files: HashSet<String>,
|
||||
/// Total data-file references in completed chunks (valid + invalid).
|
||||
data_files_total: usize,
|
||||
/// Problems detectable from the manifest alone.
|
||||
problems: Vec<VerifyProblem>,
|
||||
}
|
||||
|
||||
/// Actual data files discovered under `data/` (the only object-store IO in
|
||||
/// chunk/data-file verification).
|
||||
#[derive(Debug)]
|
||||
struct VerifyDataScan {
|
||||
existing_data_files: HashSet<String>,
|
||||
}
|
||||
|
||||
/// Result of reconciling the manifest plan against the storage scan.
|
||||
#[derive(Debug, Default)]
|
||||
struct VerifyOutcome {
|
||||
data_files_total: usize,
|
||||
data_files_verified: usize,
|
||||
problems: Vec<VerifyProblem>,
|
||||
}
|
||||
|
||||
async fn verify_chunks_and_data_files(
|
||||
storage: &OpenDalStorage,
|
||||
report: &mut VerifyReport,
|
||||
) -> Result<()> {
|
||||
let existing_files: HashSet<_> = storage
|
||||
.list_files_recursive("data/")
|
||||
.await?
|
||||
.into_iter()
|
||||
.collect();
|
||||
let mut data_files_total = 0;
|
||||
let mut data_files_verified = 0;
|
||||
let mut problems = Vec::new();
|
||||
let mut seen_chunk_ids = HashSet::new();
|
||||
let mut claimed_data_files = HashSet::new();
|
||||
let plan = build_verify_plan(&report.manifest);
|
||||
let scan = scan_data_files(storage).await?;
|
||||
let outcome = reconcile_plan_with_scan(plan, &scan);
|
||||
|
||||
for chunk in &report.manifest.chunks {
|
||||
report.data_files_total = outcome.data_files_total;
|
||||
report.data_files_verified = outcome.data_files_verified;
|
||||
report.problems.extend(outcome.problems);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Builds the expected-state plan from the manifest. Pure; performs no IO.
|
||||
fn build_verify_plan(manifest: &Manifest) -> VerifyPlan {
|
||||
let mut plan = VerifyPlan::default();
|
||||
let mut seen_chunk_ids = HashSet::new();
|
||||
|
||||
for chunk in &manifest.chunks {
|
||||
if !seen_chunk_ids.insert(chunk.id) {
|
||||
problems.push(VerifyProblem {
|
||||
plan.problems.push(VerifyProblem {
|
||||
severity: VerifySeverity::Error,
|
||||
message: format!("Chunk {}: duplicate chunk id", chunk.id),
|
||||
});
|
||||
}
|
||||
for file in &chunk.files {
|
||||
if let Some(path) = safe_manifest_data_file_path(file) {
|
||||
claimed_data_files.insert(path.to_string());
|
||||
plan.claimed_data_files.insert(path.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
match chunk.status {
|
||||
ChunkStatus::Completed => {
|
||||
if chunk.files.is_empty() {
|
||||
problems.push(VerifyProblem {
|
||||
plan.problems.push(VerifyProblem {
|
||||
severity: VerifySeverity::Error,
|
||||
message: format!("Chunk {}: completed chunk has no data files", chunk.id),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
let allowed_prefixes = report
|
||||
.manifest
|
||||
let allowed_prefixes = manifest
|
||||
.schemas
|
||||
.iter()
|
||||
.map(|schema| data_dir_for_schema_chunk(schema, chunk.id))
|
||||
.collect::<Vec<_>>();
|
||||
for file in &chunk.files {
|
||||
data_files_total += 1;
|
||||
let Some(path) = valid_manifest_data_file_path(file, &allowed_prefixes) else {
|
||||
problems.push(VerifyProblem {
|
||||
plan.data_files_total += 1;
|
||||
match valid_manifest_data_file_path(file, &allowed_prefixes) {
|
||||
Some(path) => plan.files_to_check.push(ChunkFile {
|
||||
chunk_id: chunk.id,
|
||||
path: path.to_string(),
|
||||
}),
|
||||
None => plan.problems.push(VerifyProblem {
|
||||
severity: VerifySeverity::Error,
|
||||
message: format!(
|
||||
"Chunk {}: invalid data file path '{}'",
|
||||
chunk.id, file
|
||||
),
|
||||
});
|
||||
continue;
|
||||
};
|
||||
|
||||
if existing_files.contains(path) {
|
||||
data_files_verified += 1;
|
||||
} else {
|
||||
problems.push(VerifyProblem {
|
||||
severity: VerifySeverity::Error,
|
||||
message: format!("Chunk {}: missing file '{}'", chunk.id, path),
|
||||
});
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
ChunkStatus::Skipped => {
|
||||
if !chunk.files.is_empty() {
|
||||
problems.push(VerifyProblem {
|
||||
plan.problems.push(VerifyProblem {
|
||||
severity: VerifySeverity::Error,
|
||||
message: format!(
|
||||
"Chunk {}: skipped chunk should not list data files",
|
||||
@@ -1181,20 +1221,20 @@ async fn verify_chunks_and_data_files(
|
||||
}
|
||||
}
|
||||
ChunkStatus::Pending => {
|
||||
problems.push(VerifyProblem {
|
||||
plan.problems.push(VerifyProblem {
|
||||
severity: VerifySeverity::Error,
|
||||
message: format!("Chunk {}: status is 'pending'", chunk.id),
|
||||
});
|
||||
}
|
||||
ChunkStatus::InProgress => {
|
||||
problems.push(VerifyProblem {
|
||||
plan.problems.push(VerifyProblem {
|
||||
severity: VerifySeverity::Error,
|
||||
message: format!("Chunk {}: status is 'in_progress'", chunk.id),
|
||||
});
|
||||
}
|
||||
ChunkStatus::Failed => {
|
||||
let reason = chunk.error.as_deref().unwrap_or("unknown error");
|
||||
problems.push(VerifyProblem {
|
||||
plan.problems.push(VerifyProblem {
|
||||
severity: VerifySeverity::Error,
|
||||
message: format!("Chunk {}: status is 'failed' (error: {})", chunk.id, reason),
|
||||
});
|
||||
@@ -1202,20 +1242,60 @@ async fn verify_chunks_and_data_files(
|
||||
}
|
||||
}
|
||||
|
||||
for path in &existing_files {
|
||||
if !claimed_data_files.contains(path) {
|
||||
plan
|
||||
}
|
||||
|
||||
/// Lists all data files under `data/`. This is the only object-store IO in
|
||||
/// chunk/data-file verification.
|
||||
async fn scan_data_files(storage: &OpenDalStorage) -> Result<VerifyDataScan> {
|
||||
let existing_data_files = storage
|
||||
.list_files_recursive("data/")
|
||||
.await?
|
||||
.into_iter()
|
||||
.collect();
|
||||
Ok(VerifyDataScan {
|
||||
existing_data_files,
|
||||
})
|
||||
}
|
||||
|
||||
/// Reconciles the manifest plan against the storage scan. Pure; performs no IO.
|
||||
///
|
||||
/// Emits missing-file problems for expected files absent from storage and
|
||||
/// unexpected-file problems for storage files no chunk claims. Unexpected files
|
||||
/// are sorted by path so output is deterministic regardless of listing order.
|
||||
fn reconcile_plan_with_scan(plan: VerifyPlan, scan: &VerifyDataScan) -> VerifyOutcome {
|
||||
let mut problems = plan.problems;
|
||||
let mut data_files_verified = 0;
|
||||
|
||||
for file in &plan.files_to_check {
|
||||
if scan.existing_data_files.contains(&file.path) {
|
||||
data_files_verified += 1;
|
||||
} else {
|
||||
problems.push(VerifyProblem {
|
||||
severity: VerifySeverity::Error,
|
||||
message: format!("Unexpected data file '{}' is not listed in manifest", path),
|
||||
message: format!("Chunk {}: missing file '{}'", file.chunk_id, file.path),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
report.data_files_total = data_files_total;
|
||||
report.data_files_verified = data_files_verified;
|
||||
report.problems.extend(problems);
|
||||
let mut orphans: Vec<&String> = scan
|
||||
.existing_data_files
|
||||
.iter()
|
||||
.filter(|path| !plan.claimed_data_files.contains(*path))
|
||||
.collect();
|
||||
orphans.sort();
|
||||
for path in orphans {
|
||||
problems.push(VerifyProblem {
|
||||
severity: VerifySeverity::Error,
|
||||
message: format!("Unexpected data file '{}' is not listed in manifest", path),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(())
|
||||
VerifyOutcome {
|
||||
data_files_total: plan.data_files_total,
|
||||
data_files_verified,
|
||||
problems,
|
||||
}
|
||||
}
|
||||
|
||||
fn valid_manifest_data_file_path<'a>(
|
||||
@@ -2294,6 +2374,90 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_build_verify_plan_classifies_chunks_without_io() {
|
||||
let mut manifest = test_manifest(
|
||||
chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
|
||||
false,
|
||||
true,
|
||||
);
|
||||
// test_manifest(complete) gives: chunk 1 completed (1 file), chunk 2 skipped.
|
||||
let mut failed = ChunkMeta::new(3, TimeRange::unbounded());
|
||||
failed.mark_failed("boom".to_string());
|
||||
manifest.chunks.push(failed);
|
||||
manifest
|
||||
.chunks
|
||||
.push(ChunkMeta::new(4, TimeRange::unbounded()));
|
||||
|
||||
let plan = build_verify_plan(&manifest);
|
||||
|
||||
assert_eq!(plan.files_to_check.len(), 1);
|
||||
assert_eq!(plan.files_to_check[0].chunk_id, 1);
|
||||
assert_eq!(plan.files_to_check[0].path, "data/public/1/file.parquet");
|
||||
assert_eq!(plan.data_files_total, 1);
|
||||
assert!(
|
||||
plan.claimed_data_files
|
||||
.contains("data/public/1/file.parquet")
|
||||
);
|
||||
assert_eq!(plan.problems.len(), 2);
|
||||
assert!(
|
||||
plan.problems
|
||||
.iter()
|
||||
.any(|problem| problem.message.contains("status is 'failed'"))
|
||||
);
|
||||
assert!(
|
||||
plan.problems
|
||||
.iter()
|
||||
.any(|problem| problem.message.contains("status is 'pending'"))
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_verify_snapshot_produces_deterministic_problem_output() {
|
||||
let dir = tempdir().unwrap();
|
||||
let manifest = test_manifest(
|
||||
chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
|
||||
false,
|
||||
true,
|
||||
);
|
||||
write_root_manifest(dir.path(), manifest);
|
||||
write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
|
||||
write_default_ddl_files(dir.path());
|
||||
write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
|
||||
// Many orphan files under a known chunk prefix to stress ordering.
|
||||
for i in 0..50 {
|
||||
write_snapshot_file(
|
||||
dir.path(),
|
||||
&format!("data/public/1/orphan_{:02}.parquet", i),
|
||||
b"x",
|
||||
);
|
||||
}
|
||||
|
||||
let storage = file_storage_for_dir(dir.path());
|
||||
let messages = |report: &VerifyReport| {
|
||||
report
|
||||
.problems
|
||||
.iter()
|
||||
.map(|problem| problem.message.clone())
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
let first = messages(&verify_snapshot(&storage).await.unwrap());
|
||||
let second = messages(&verify_snapshot(&storage).await.unwrap());
|
||||
|
||||
// Output is identical across runs despite HashSet-based scanning.
|
||||
assert_eq!(first, second);
|
||||
|
||||
let orphans = first
|
||||
.iter()
|
||||
.filter(|message| message.contains("Unexpected data file"))
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(orphans.len(), 50);
|
||||
let mut sorted = orphans.clone();
|
||||
sorted.sort();
|
||||
assert_eq!(orphans, sorted);
|
||||
}
|
||||
|
||||
fn write_test_manifest(root: &std::path::Path, dir: &str, manifest: Manifest) {
|
||||
let snapshot_dir = root.join(dir);
|
||||
std::fs::create_dir_all(&snapshot_dir).unwrap();
|
||||
|
||||
@@ -524,6 +524,7 @@ impl ScanbenchCommand {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: !self.enable_wal,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
};
|
||||
|
||||
engine
|
||||
|
||||
@@ -61,6 +61,7 @@ pub const FORMAT_COMPRESSION_TYPE: &str = "compression_type";
|
||||
pub const FORMAT_DELIMITER: &str = "delimiter";
|
||||
pub const FORMAT_SCHEMA_INFER_MAX_RECORD: &str = "schema_infer_max_record";
|
||||
pub const FORMAT_HAS_HEADER: &str = "has_header";
|
||||
pub const FORMAT_SKIP_BAD_RECORDS: &str = "skip_bad_records";
|
||||
pub const FORMAT_TYPE: &str = "format";
|
||||
pub const FILE_PATTERN: &str = "pattern";
|
||||
pub const TIMESTAMP_FORMAT: &str = "timestamp_format";
|
||||
|
||||
@@ -13,15 +13,24 @@
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::task::Poll;
|
||||
|
||||
use arrow::csv::reader::Format;
|
||||
use arrow::csv::{self, WriterBuilder};
|
||||
use arrow::error::ArrowError;
|
||||
use arrow::record_batch::RecordBatch;
|
||||
use arrow_schema::Schema;
|
||||
use arrow_schema::{Schema, SchemaRef};
|
||||
use async_trait::async_trait;
|
||||
use bytes::{Buf, Bytes};
|
||||
use common_runtime;
|
||||
use common_telemetry::warn;
|
||||
use datafusion::physical_plan::SendableRecordBatchStream;
|
||||
use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
|
||||
use futures::StreamExt;
|
||||
use futures::stream::BoxStream;
|
||||
use object_store::ObjectStore;
|
||||
use snafu::ResultExt;
|
||||
use tokio_util::compat::FuturesAsyncReadCompatExt;
|
||||
@@ -34,9 +43,12 @@ use crate::file_format::{self, FileFormat, stream_to_file};
|
||||
use crate::share_buffer::SharedBuffer;
|
||||
use crate::util::normalize_infer_schema;
|
||||
|
||||
const SKIP_BAD_RECORDS_BATCH_SIZE: usize = 1;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct CsvFormat {
|
||||
pub has_header: bool,
|
||||
pub skip_bad_records: bool,
|
||||
pub delimiter: u8,
|
||||
pub schema_infer_max_record: Option<usize>,
|
||||
pub compression_type: CompressionType,
|
||||
@@ -76,13 +88,11 @@ impl TryFrom<&HashMap<String, String>> for CsvFormat {
|
||||
})?);
|
||||
};
|
||||
if let Some(has_header) = value.get(file_format::FORMAT_HAS_HEADER) {
|
||||
format.has_header = has_header.parse().map_err(|_| {
|
||||
error::ParseFormatSnafu {
|
||||
key: file_format::FORMAT_HAS_HEADER,
|
||||
value: has_header,
|
||||
}
|
||||
.build()
|
||||
})?;
|
||||
format.has_header = parse_bool(file_format::FORMAT_HAS_HEADER, has_header)?;
|
||||
};
|
||||
if let Some(skip_bad_records) = value.get(file_format::FORMAT_SKIP_BAD_RECORDS) {
|
||||
format.skip_bad_records =
|
||||
parse_bool(file_format::FORMAT_SKIP_BAD_RECORDS, skip_bad_records)?;
|
||||
};
|
||||
if let Some(timestamp_format) = value.get(file_format::TIMESTAMP_FORMAT) {
|
||||
format.timestamp_format = Some(timestamp_format.clone());
|
||||
@@ -97,10 +107,17 @@ impl TryFrom<&HashMap<String, String>> for CsvFormat {
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_bool(key: &'static str, value: &str) -> Result<bool> {
|
||||
value
|
||||
.parse()
|
||||
.map_err(|_| error::ParseFormatSnafu { key, value }.build())
|
||||
}
|
||||
|
||||
impl Default for CsvFormat {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
has_header: true,
|
||||
skip_bad_records: false,
|
||||
delimiter: b',',
|
||||
schema_infer_max_record: Some(file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD),
|
||||
compression_type: CompressionType::Uncompressed,
|
||||
@@ -189,10 +206,136 @@ impl DfRecordBatchEncoder for csv::Writer<SharedBuffer> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds a CSV stream that can skip selected record-level parse/cast errors.
|
||||
///
|
||||
/// This recovery path intentionally uses one-record batches. It is slower than
|
||||
/// normal CSV scanning, but keeps each parse/cast failure isolated to a single
|
||||
/// record. Arrow's CSV decoder clears buffered rows before type parsing, so a
|
||||
/// failed multi-row flush cannot be safely retried row by row without replaying
|
||||
/// input bytes.
|
||||
pub async fn tolerant_csv_stream(
|
||||
store: &ObjectStore,
|
||||
path: &str,
|
||||
schema: SchemaRef,
|
||||
projection: Vec<usize>,
|
||||
format: &CsvFormat,
|
||||
) -> Result<SendableRecordBatchStream> {
|
||||
let meta = store
|
||||
.stat(path)
|
||||
.await
|
||||
.context(error::ReadObjectSnafu { path })?;
|
||||
|
||||
let reader = store
|
||||
.reader(path)
|
||||
.await
|
||||
.context(error::ReadObjectSnafu { path })?
|
||||
.into_bytes_stream(0..meta.content_length())
|
||||
.await
|
||||
.context(error::ReadObjectSnafu { path })?;
|
||||
|
||||
let reader = format.compression_type.convert_stream(reader).boxed();
|
||||
tolerant_csv_stream_from_reader(
|
||||
reader,
|
||||
path,
|
||||
schema,
|
||||
projection,
|
||||
format.has_header,
|
||||
format.delimiter,
|
||||
)
|
||||
}
|
||||
|
||||
fn tolerant_csv_stream_from_reader(
|
||||
reader: BoxStream<'static, io::Result<Bytes>>,
|
||||
path: &str,
|
||||
schema: SchemaRef,
|
||||
projection: Vec<usize>,
|
||||
has_header: bool,
|
||||
delimiter: u8,
|
||||
) -> Result<SendableRecordBatchStream> {
|
||||
let projected_schema = Arc::new(
|
||||
schema
|
||||
.project(&projection)
|
||||
.context(error::InferSchemaSnafu)?,
|
||||
);
|
||||
let mut decoder = csv::ReaderBuilder::new(schema)
|
||||
.with_header(has_header)
|
||||
.with_delimiter(delimiter)
|
||||
.with_batch_size(SKIP_BAD_RECORDS_BATCH_SIZE)
|
||||
.with_projection(projection)
|
||||
.build_decoder();
|
||||
|
||||
let path = path.to_string();
|
||||
let mut upstream = reader.fuse();
|
||||
let mut buffered = Bytes::new();
|
||||
let mut input_finished = false;
|
||||
let stream = futures::stream::poll_fn(move |cx| {
|
||||
loop {
|
||||
while !input_finished {
|
||||
if buffered.is_empty() {
|
||||
match futures::ready!(upstream.poll_next_unpin(cx)) {
|
||||
Some(Ok(bytes)) if bytes.is_empty() => continue,
|
||||
Some(Ok(bytes)) => buffered = bytes,
|
||||
Some(Err(error)) => return Poll::Ready(Some(Err(error.into()))),
|
||||
None => input_finished = true,
|
||||
}
|
||||
}
|
||||
|
||||
let decoded = decoder.decode(buffered.as_ref())?;
|
||||
if decoded > 0 {
|
||||
buffered.advance(decoded);
|
||||
continue;
|
||||
}
|
||||
|
||||
if decoder.capacity() == 0 || input_finished {
|
||||
break;
|
||||
}
|
||||
|
||||
if buffered.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
return Poll::Ready(Some(Err(ArrowError::ParseError(
|
||||
"CSV decoder made no progress while input bytes remain".to_string(),
|
||||
))));
|
||||
}
|
||||
|
||||
match decoder.flush() {
|
||||
Ok(Some(batch)) => return Poll::Ready(Some(Ok(batch))),
|
||||
Ok(None) if input_finished => return Poll::Ready(None),
|
||||
Ok(None) => continue,
|
||||
Err(error) if is_skippable_arrow_error(&error) => {
|
||||
warn!(
|
||||
"Skipping bad CSV record while copying from {}: {}",
|
||||
path, error
|
||||
);
|
||||
}
|
||||
Err(error) => return Poll::Ready(Some(Err(error))),
|
||||
}
|
||||
}
|
||||
})
|
||||
.map(|result: std::result::Result<RecordBatch, ArrowError>| result.map_err(Into::into));
|
||||
|
||||
Ok(Box::pin(RecordBatchStreamAdapter::new(
|
||||
projected_schema,
|
||||
stream,
|
||||
)))
|
||||
}
|
||||
|
||||
pub fn is_skippable_arrow_error(error: &ArrowError) -> bool {
|
||||
matches!(
|
||||
error,
|
||||
ArrowError::ParseError(_)
|
||||
| ArrowError::CastError(_)
|
||||
| ArrowError::ComputeError(_)
|
||||
| ArrowError::InvalidArgumentError(_)
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow_schema::{DataType, Field};
|
||||
use common_recordbatch::adapter::DfRecordBatchStreamAdapter;
|
||||
use common_recordbatch::{RecordBatch, RecordBatches};
|
||||
use common_test_util::find_workspace_path;
|
||||
@@ -205,7 +348,7 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::file_format::{
|
||||
FORMAT_COMPRESSION_TYPE, FORMAT_DELIMITER, FORMAT_HAS_HEADER,
|
||||
FORMAT_SCHEMA_INFER_MAX_RECORD, FileFormat, file_to_stream,
|
||||
FORMAT_SCHEMA_INFER_MAX_RECORD, FORMAT_SKIP_BAD_RECORDS, FileFormat, file_to_stream,
|
||||
};
|
||||
use crate::test_util::{format_schema, test_store};
|
||||
|
||||
@@ -331,11 +474,29 @@ mod tests {
|
||||
schema_infer_max_record: Some(2000),
|
||||
delimiter: b'\t',
|
||||
has_header: false,
|
||||
skip_bad_records: false,
|
||||
timestamp_format: None,
|
||||
time_format: None,
|
||||
date_format: None
|
||||
}
|
||||
);
|
||||
|
||||
let map = HashMap::from([(FORMAT_SKIP_BAD_RECORDS.to_string(), "true".to_string())]);
|
||||
let format = CsvFormat::try_from(&map).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
format,
|
||||
CsvFormat {
|
||||
skip_bad_records: true,
|
||||
..CsvFormat::default()
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
/// `"yes"` is not an accepted value for the skip-bad-records option, so
/// building a `CsvFormat` from it must fail.
#[test]
fn test_try_from_rejects_invalid_bool_options() {
    let options = HashMap::from([(FORMAT_SKIP_BAD_RECORDS.to_string(), "yes".to_string())]);
    let parsed = CsvFormat::try_from(&options);
    assert!(parsed.is_err());
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -496,4 +657,63 @@ mod tests {
|
||||
assert_eq!(expected, pretty_print);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_tolerant_csv_stream_continues_after_parse_error() {
|
||||
let temp_dir = common_test_util::temp_dir::create_temp_dir("test_tolerant_csv_stream");
|
||||
let csv_file_path = temp_dir.path().join("input.csv");
|
||||
std::fs::write(
|
||||
&csv_file_path,
|
||||
"id,name,value\n1,Alice,10.5\nbad,Bad,20.0\nworse,Bad,21.0\n2,Bob,30.5",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let store = test_store("/");
|
||||
let schema = Arc::new(arrow_schema::Schema::new(vec![
|
||||
Field::new("id", DataType::UInt32, false),
|
||||
Field::new("name", DataType::Utf8, false),
|
||||
Field::new("value", DataType::Float64, false),
|
||||
]));
|
||||
let path = csv_file_path.to_str().unwrap();
|
||||
|
||||
let stream =
|
||||
tolerant_csv_stream(&store, path, schema, vec![0, 1, 2], &CsvFormat::default())
|
||||
.await
|
||||
.unwrap();
|
||||
let batches = stream.try_collect::<Vec<_>>().await.unwrap();
|
||||
let pretty_print = arrow::util::pretty::pretty_format_batches(&batches)
|
||||
.unwrap()
|
||||
.to_string();
|
||||
let expected = r#"+----+-------+-------+
|
||||
| id | name | value |
|
||||
+----+-------+-------+
|
||||
| 1 | Alice | 10.5 |
|
||||
| 2 | Bob | 30.5 |
|
||||
+----+-------+-------+"#;
|
||||
assert_eq!(expected, pretty_print);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_tolerant_csv_stream_fails_on_structural_csv_error() {
|
||||
let temp_dir =
|
||||
common_test_util::temp_dir::create_temp_dir("test_tolerant_csv_stream_csv_error");
|
||||
let csv_file_path = temp_dir.path().join("input.csv");
|
||||
std::fs::write(&csv_file_path, "id,name,value\n1,Alice,10.5\n2,Bob\n").unwrap();
|
||||
|
||||
let store = test_store("/");
|
||||
let schema = Arc::new(arrow_schema::Schema::new(vec![
|
||||
Field::new("id", DataType::UInt32, false),
|
||||
Field::new("name", DataType::Utf8, false),
|
||||
Field::new("value", DataType::Float64, false),
|
||||
]));
|
||||
let path = csv_file_path.to_str().unwrap();
|
||||
|
||||
let stream =
|
||||
tolerant_csv_stream(&store, path, schema, vec![0, 1, 2], &CsvFormat::default())
|
||||
.await
|
||||
.unwrap();
|
||||
let error = stream.try_collect::<Vec<_>>().await.unwrap_err();
|
||||
|
||||
assert!(error.to_string().contains("incorrect number of fields"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,7 +18,7 @@ use std::time::Duration;
|
||||
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use store_api::region_engine::SyncRegionFromRequest;
|
||||
use store_api::region_request::RegionFlushReason;
|
||||
use store_api::region_request::{RegionFlushReason, RegionRequirements};
|
||||
use store_api::storage::{FileRefsManifest, GcReport, RegionId, RegionNumber};
|
||||
use strum::Display;
|
||||
use table::metadata::TableId;
|
||||
@@ -179,12 +179,24 @@ impl Display for OpenRegion {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"OpenRegion(region_ident={}, region_storage_path={})",
|
||||
self.region_ident, self.region_storage_path
|
||||
"OpenRegion(region_ident={}, region_storage_path={}, reason={:?})",
|
||||
self.region_ident, self.region_storage_path, self.reason
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// The reason why an open region instruction is triggered.
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub enum OpenRegionReason {
|
||||
/// Open triggered before region migration.
|
||||
RegionMigration,
|
||||
/// Open triggered by region failover.
|
||||
RegionFailover,
|
||||
/// Open triggered when adding a follower region.
|
||||
#[cfg(feature = "enterprise")]
|
||||
RegionFollower,
|
||||
}
|
||||
|
||||
#[serde_with::serde_as]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct OpenRegion {
|
||||
@@ -196,6 +208,10 @@ pub struct OpenRegion {
|
||||
pub region_wal_options: HashMap<RegionNumber, String>,
|
||||
#[serde(default)]
|
||||
pub skip_wal_replay: bool,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub reason: Option<OpenRegionReason>,
|
||||
#[serde(default)]
|
||||
pub requirements: RegionRequirements,
|
||||
}
|
||||
|
||||
impl OpenRegion {
|
||||
@@ -205,6 +221,8 @@ impl OpenRegion {
|
||||
region_options: HashMap<String, String>,
|
||||
region_wal_options: HashMap<RegionNumber, String>,
|
||||
skip_wal_replay: bool,
|
||||
reason: Option<OpenRegionReason>,
|
||||
requirements: RegionRequirements,
|
||||
) -> Self {
|
||||
Self {
|
||||
region_ident,
|
||||
@@ -212,6 +230,8 @@ impl OpenRegion {
|
||||
region_options,
|
||||
region_wal_options,
|
||||
skip_wal_replay,
|
||||
reason,
|
||||
requirements,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1126,11 +1146,13 @@ mod tests {
|
||||
HashMap::new(),
|
||||
HashMap::new(),
|
||||
false,
|
||||
None,
|
||||
RegionRequirements::empty(),
|
||||
)]);
|
||||
|
||||
let serialized = serde_json::to_string(&open_region).unwrap();
|
||||
assert_eq!(
|
||||
r#"{"OpenRegions":[{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false}]}"#,
|
||||
r#"{"OpenRegions":[{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false,"requirements":{"object_storage":false}}]}"#,
|
||||
serialized
|
||||
);
|
||||
|
||||
@@ -1213,6 +1235,8 @@ mod tests {
|
||||
HashMap::new(),
|
||||
HashMap::new(),
|
||||
false,
|
||||
None,
|
||||
RegionRequirements::empty(),
|
||||
)]);
|
||||
assert_eq!(open_region_instruction, open_region);
|
||||
|
||||
@@ -1368,10 +1392,41 @@ mod tests {
|
||||
region_options,
|
||||
region_wal_options: HashMap::new(),
|
||||
skip_wal_replay: false,
|
||||
reason: None,
|
||||
requirements: RegionRequirements::empty(),
|
||||
};
|
||||
assert_eq!(expected, deserialized);
|
||||
}
|
||||
|
||||
/// An `OpenRegion` carrying a reason and an object-storage requirement keeps
/// both through a JSON round trip.
#[test]
fn test_serialize_open_region_with_reason_and_requirements() {
    let ident = RegionIdent {
        datanode_id: 2,
        table_id: 1024,
        region_number: 1,
        engine: "mito2".to_string(),
    };
    let original = OpenRegion::new(
        ident,
        "test/foo",
        HashMap::new(),
        HashMap::new(),
        false,
        Some(OpenRegionReason::RegionMigration),
        RegionRequirements::object_storage(),
    );

    // Both optional pieces must be present in the serialized form.
    let json = serde_json::to_string(&original).unwrap();
    assert!(json.contains(r#""reason":"RegionMigration""#));
    assert!(json.contains(r#""object_storage":true"#));

    // ...and must be recovered when reading the JSON back.
    let round_trip: OpenRegion = serde_json::from_str(&json).unwrap();
    assert_eq!(Some(OpenRegionReason::RegionMigration), round_trip.reason);
    assert_eq!(
        RegionRequirements::object_storage(),
        round_trip.requirements
    );
}
|
||||
|
||||
#[test]
|
||||
fn test_flush_regions_creation() {
|
||||
let region_id = RegionId::new(1024, 1);
|
||||
|
||||
@@ -14,6 +14,8 @@
|
||||
|
||||
//! Datanode configurations
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_config::{Configurable, DEFAULT_DATA_HOME};
|
||||
use common_options::memory::MemoryOptions;
|
||||
@@ -75,6 +77,10 @@ pub struct DatanodeOptions {
|
||||
pub wal: DatanodeWalConfig,
|
||||
pub storage: StorageConfig,
|
||||
pub max_concurrent_queries: usize,
|
||||
/// Timeout to acquire a permit from the concurrent query limiter when
|
||||
/// `max_concurrent_queries` is reached. Only effective when the limiter is enabled.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub concurrent_query_limiter_timeout: Duration,
|
||||
/// Options for different store engines.
|
||||
pub region_engine: Vec<RegionEngineConfig>,
|
||||
pub logging: LoggingOptions,
|
||||
@@ -131,6 +137,7 @@ impl Default for DatanodeOptions {
|
||||
wal: DatanodeWalConfig::default(),
|
||||
storage: StorageConfig::default(),
|
||||
max_concurrent_queries: 0,
|
||||
concurrent_query_limiter_timeout: Duration::from_millis(100),
|
||||
region_engine: vec![
|
||||
RegionEngineConfig::Mito(MitoConfig::default()),
|
||||
RegionEngineConfig::File(FileEngineConfig::default()),
|
||||
|
||||
@@ -445,8 +445,7 @@ impl DatanodeBuilder {
|
||||
event_listener,
|
||||
table_provider_factory,
|
||||
opts.max_concurrent_queries,
|
||||
//TODO: revaluate the hardcoded timeout on the next version of datanode concurrency limiter.
|
||||
Duration::from_millis(100),
|
||||
opts.concurrent_query_limiter_timeout,
|
||||
opts.grpc.flight_compression,
|
||||
);
|
||||
|
||||
|
||||
@@ -313,7 +313,7 @@ mod tests {
|
||||
use mito2::test_util::{CreateRequestBuilder, TestEnv};
|
||||
use store_api::path_utils::table_dir;
|
||||
use store_api::region_engine::RegionRole;
|
||||
use store_api::region_request::{RegionCloseRequest, RegionRequest};
|
||||
use store_api::region_request::{RegionCloseRequest, RegionRequest, RegionRequirements};
|
||||
use store_api::storage::RegionId;
|
||||
use tokio::sync::mpsc::{self, Receiver};
|
||||
|
||||
@@ -442,6 +442,8 @@ mod tests {
|
||||
HashMap::new(),
|
||||
HashMap::new(),
|
||||
false,
|
||||
None,
|
||||
RegionRequirements::empty(),
|
||||
)])
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
|
||||
use common_meta::instruction::{InstructionReply, OpenRegion, SimpleReply};
|
||||
use common_meta::wal_provider::prepare_wal_options;
|
||||
use common_telemetry::info;
|
||||
use store_api::path_utils::table_dir;
|
||||
use store_api::region_request::{PathType, RegionOpenRequest};
|
||||
use store_api::storage::RegionId;
|
||||
@@ -41,8 +42,13 @@ impl InstructionHandler for OpenRegionsHandler {
|
||||
mut region_options,
|
||||
region_wal_options,
|
||||
skip_wal_replay,
|
||||
reason,
|
||||
requirements,
|
||||
} = open_region;
|
||||
let region_id = RegionId::new(region_ident.table_id, region_ident.region_number);
|
||||
info!(
|
||||
"Received open region instruction, region_id: {region_id}, reason: {reason:?}"
|
||||
);
|
||||
prepare_wal_options(&mut region_options, region_id, &region_wal_options);
|
||||
let request = RegionOpenRequest {
|
||||
engine: region_ident.engine,
|
||||
@@ -51,6 +57,7 @@ impl InstructionHandler for OpenRegionsHandler {
|
||||
options: region_options,
|
||||
skip_wal_replay,
|
||||
checkpoint: None,
|
||||
requirements,
|
||||
};
|
||||
(region_id, request)
|
||||
})
|
||||
@@ -85,7 +92,7 @@ mod tests {
|
||||
use mito2::engine::MITO_ENGINE_NAME;
|
||||
use mito2::test_util::{CreateRequestBuilder, TestEnv};
|
||||
use store_api::path_utils::table_dir;
|
||||
use store_api::region_request::{RegionCloseRequest, RegionRequest};
|
||||
use store_api::region_request::{RegionCloseRequest, RegionRequest, RegionRequirements};
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use crate::heartbeat::handler::RegionHeartbeatResponseHandler;
|
||||
@@ -98,17 +105,21 @@ mod tests {
|
||||
) -> Instruction {
|
||||
let region_idents = region_ids
|
||||
.into_iter()
|
||||
.map(|region_id| OpenRegion {
|
||||
region_ident: RegionIdent {
|
||||
datanode_id: 0,
|
||||
table_id: region_id.table_id(),
|
||||
region_number: region_id.region_number(),
|
||||
engine: MITO_ENGINE_NAME.to_string(),
|
||||
},
|
||||
region_storage_path: storage_path.to_string(),
|
||||
region_options: HashMap::new(),
|
||||
region_wal_options: HashMap::new(),
|
||||
skip_wal_replay: false,
|
||||
.map(|region_id| {
|
||||
OpenRegion::new(
|
||||
RegionIdent {
|
||||
datanode_id: 0,
|
||||
table_id: region_id.table_id(),
|
||||
region_number: region_id.region_number(),
|
||||
engine: MITO_ENGINE_NAME.to_string(),
|
||||
},
|
||||
storage_path,
|
||||
HashMap::new(),
|
||||
HashMap::new(),
|
||||
false,
|
||||
None,
|
||||
RegionRequirements::empty(),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
|
||||
@@ -49,6 +49,7 @@ use common_telemetry::{debug, error, info, warn};
|
||||
use dashmap::DashMap;
|
||||
use datafusion::datasource::TableProvider;
|
||||
use datafusion_common::tree_node::TreeNode;
|
||||
use datatypes::schema::SchemaRef;
|
||||
use either::Either;
|
||||
use futures_util::Stream;
|
||||
use futures_util::future::try_join_all;
|
||||
@@ -82,7 +83,7 @@ use store_api::region_request::{
|
||||
RegionOpenRequest, RegionRequest,
|
||||
};
|
||||
use store_api::storage::RegionId;
|
||||
use tokio::sync::{Semaphore, SemaphorePermit};
|
||||
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
|
||||
use tokio::time::timeout;
|
||||
use tonic::{Request, Response, Result as TonicResult};
|
||||
|
||||
@@ -257,7 +258,7 @@ impl RegionServer {
|
||||
request: api::v1::region::QueryRequest,
|
||||
query_ctx: QueryContextRef,
|
||||
) -> Result<SendableRecordBatchStream> {
|
||||
let _permit = if let Some(p) = &self.inner.parallelism {
|
||||
let permit = if let Some(p) = &self.inner.parallelism {
|
||||
Some(p.acquire().await?)
|
||||
} else {
|
||||
None
|
||||
@@ -298,14 +299,13 @@ impl RegionServer {
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(wrap_flow_region_watermark_stream(
|
||||
stream, region_id, &query_ctx,
|
||||
))
|
||||
let stream = wrap_flow_region_watermark_stream(stream, region_id, &query_ctx);
|
||||
Ok(maybe_guard_stream(stream, permit))
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub async fn handle_read(&self, request: QueryRequest) -> Result<SendableRecordBatchStream> {
|
||||
let _permit = if let Some(p) = &self.inner.parallelism {
|
||||
let permit = if let Some(p) = &self.inner.parallelism {
|
||||
Some(p.acquire().await?)
|
||||
} else {
|
||||
None
|
||||
@@ -332,9 +332,8 @@ impl RegionServer {
|
||||
.handle_read(QueryRequest { plan, ..request }, query_ctx.clone())
|
||||
.await?;
|
||||
|
||||
Ok(wrap_flow_region_watermark_stream(
|
||||
stream, region_id, &query_ctx,
|
||||
))
|
||||
let stream = wrap_flow_region_watermark_stream(stream, region_id, &query_ctx);
|
||||
Ok(maybe_guard_stream(stream, permit))
|
||||
}
|
||||
|
||||
/// Returns all opened and reportable regions.
|
||||
@@ -1058,7 +1057,7 @@ struct RegionServerInner {
|
||||
}
|
||||
|
||||
struct RegionServerParallelism {
|
||||
semaphore: Semaphore,
|
||||
semaphore: Arc<Semaphore>,
|
||||
timeout: Duration,
|
||||
}
|
||||
|
||||
@@ -1071,19 +1070,68 @@ impl RegionServerParallelism {
|
||||
return None;
|
||||
}
|
||||
Some(RegionServerParallelism {
|
||||
semaphore: Semaphore::new(max_concurrent_queries),
|
||||
semaphore: Arc::new(Semaphore::new(max_concurrent_queries)),
|
||||
timeout: concurrent_query_limiter_timeout,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn acquire(&self) -> Result<SemaphorePermit<'_>> {
|
||||
timeout(self.timeout, self.semaphore.acquire())
|
||||
pub async fn acquire(&self) -> Result<OwnedSemaphorePermit> {
|
||||
timeout(self.timeout, self.semaphore.clone().acquire_owned())
|
||||
.await
|
||||
.context(ConcurrentQueryLimiterTimeoutSnafu)?
|
||||
.context(ConcurrentQueryLimiterClosedSnafu)
|
||||
}
|
||||
}
|
||||
|
||||
/// Wraps a record batch stream and holds a concurrency permit until the stream is
|
||||
/// fully consumed (dropped), so `max_concurrent_queries` bounds the number of
|
||||
/// in-flight read streams, not just query planning.
|
||||
struct PermitGuardedStream {
|
||||
inner: SendableRecordBatchStream,
|
||||
_permit: OwnedSemaphorePermit,
|
||||
}
|
||||
|
||||
impl RecordBatchStream for PermitGuardedStream {
|
||||
fn name(&self) -> &str {
|
||||
self.inner.name()
|
||||
}
|
||||
|
||||
fn schema(&self) -> SchemaRef {
|
||||
self.inner.schema()
|
||||
}
|
||||
|
||||
fn output_ordering(&self) -> Option<&[OrderOption]> {
|
||||
self.inner.output_ordering()
|
||||
}
|
||||
|
||||
fn metrics(&self) -> Option<RecordBatchMetrics> {
|
||||
self.inner.metrics()
|
||||
}
|
||||
}
|
||||
|
||||
impl Stream for PermitGuardedStream {
|
||||
type Item = common_recordbatch::error::Result<RecordBatch>;
|
||||
|
||||
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
self.inner.as_mut().poll_next(cx)
|
||||
}
|
||||
}
|
||||
|
||||
/// Wraps `stream` so it holds `permit` until fully consumed. Returns `stream`
|
||||
/// unchanged when no permit was acquired (limiter disabled).
|
||||
fn maybe_guard_stream(
|
||||
stream: SendableRecordBatchStream,
|
||||
permit: Option<OwnedSemaphorePermit>,
|
||||
) -> SendableRecordBatchStream {
|
||||
match permit {
|
||||
Some(permit) => Box::pin(PermitGuardedStream {
|
||||
inner: stream,
|
||||
_permit: permit,
|
||||
}),
|
||||
None => stream,
|
||||
}
|
||||
}
|
||||
|
||||
enum CurrentEngine {
|
||||
Engine(RegionEngineRef),
|
||||
EarlyReturn(AffectedRows),
|
||||
@@ -2057,6 +2105,7 @@ mod tests {
|
||||
options: Default::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -2235,6 +2284,7 @@ mod tests {
|
||||
options: Default::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
},
|
||||
),
|
||||
(
|
||||
@@ -2246,6 +2296,7 @@ mod tests {
|
||||
options: Default::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
},
|
||||
),
|
||||
],
|
||||
@@ -2268,6 +2319,7 @@ mod tests {
|
||||
options: Default::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
},
|
||||
),
|
||||
(
|
||||
@@ -2279,6 +2331,7 @@ mod tests {
|
||||
options: Default::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
},
|
||||
),
|
||||
],
|
||||
|
||||
@@ -175,6 +175,7 @@ pub async fn build_region_open_requests(
|
||||
options,
|
||||
skip_wal_replay: false,
|
||||
checkpoint,
|
||||
requirements: Default::default(),
|
||||
},
|
||||
));
|
||||
}
|
||||
@@ -193,6 +194,7 @@ pub async fn build_region_open_requests(
|
||||
options,
|
||||
skip_wal_replay: true,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
},
|
||||
));
|
||||
}
|
||||
|
||||
@@ -32,7 +32,7 @@ use store_api::region_engine::{
|
||||
};
|
||||
use store_api::region_request::{
|
||||
AffectedRows, RegionCloseRequest, RegionCreateRequest, RegionDropRequest, RegionOpenRequest,
|
||||
RegionRequest,
|
||||
RegionRequest, RegionRequirements,
|
||||
};
|
||||
use store_api::storage::{RegionId, ScanRequest, SequenceNumber};
|
||||
use tokio::sync::Mutex;
|
||||
@@ -186,6 +186,24 @@ struct EngineInner {
|
||||
|
||||
type EngineInnerRef = Arc<EngineInner>;
|
||||
|
||||
fn ensure_open_requirements(
|
||||
requirements: RegionRequirements,
|
||||
object_store: &ObjectStore,
|
||||
) -> EngineResult<()> {
|
||||
if !requirements.object_storage {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
ensure!(
|
||||
object_store::util::is_object_storage(object_store),
|
||||
UnsupportedSnafu {
|
||||
operation: "open region with object storage requirement on non-object storage"
|
||||
}
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl EngineInner {
|
||||
fn new(object_store: ObjectStore) -> Self {
|
||||
Self {
|
||||
@@ -289,6 +307,8 @@ impl EngineInner {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
ensure_open_requirements(request.requirements, &self.object_store)?;
|
||||
|
||||
let res = FileRegion::open(region_id, request, &self.object_store).await;
|
||||
let region = res.inspect_err(|err| {
|
||||
error!(
|
||||
@@ -356,3 +376,53 @@ impl EngineInner {
|
||||
self.regions.read().unwrap().contains_key(&region_id)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use object_store::services::{Fs, S3};

    use super::*;
    use crate::error::Error;

    /// Builds an object store on the local filesystem, rooted at `/tmp`.
    fn build_fs_object_store() -> ObjectStore {
        let builder = Fs::default().root("/tmp");
        ObjectStore::new(builder).unwrap().finish()
    }

    /// Builds an S3 object store with EC2 metadata loading disabled.
    fn build_s3_object_store() -> ObjectStore {
        let builder = S3::default()
            .bucket("test-bucket")
            .region("us-east-1")
            .disable_ec2_metadata();
        ObjectStore::new(builder).unwrap().finish()
    }

    /// With no requirements, a filesystem store is accepted.
    #[test]
    fn test_empty_open_requirements_are_supported() {
        let store = build_fs_object_store();
        ensure_open_requirements(RegionRequirements::empty(), &store).unwrap();
    }

    /// Requiring object storage on a filesystem store yields `Unsupported`.
    #[test]
    fn test_object_storage_open_requirement_rejects_fs_object_store() {
        let store = build_fs_object_store();
        let outcome = ensure_open_requirements(RegionRequirements::object_storage(), &store);

        assert!(matches!(outcome.unwrap_err(), Error::Unsupported { .. }));
    }

    /// Requiring object storage on an S3 store is accepted.
    #[test]
    fn test_object_storage_open_requirement_accepts_s3_object_store() {
        let store = build_s3_object_store();
        ensure_open_requirements(RegionRequirements::object_storage(), &store).unwrap();
    }
}
|
||||
|
||||
@@ -181,6 +181,7 @@ mod tests {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
};
|
||||
|
||||
let region = FileRegion::open(region_id, request, &object_store)
|
||||
@@ -238,6 +239,7 @@ mod tests {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
};
|
||||
let err = FileRegion::open(region_id, request, &object_store)
|
||||
.await
|
||||
|
||||
@@ -630,8 +630,11 @@ impl BatchingEngine {
|
||||
let engine = self.query_engine.clone();
|
||||
let frontend = self.frontend_client.clone();
|
||||
|
||||
// check execute once first to detect any error early
|
||||
// Create sink table if needed, then validate an existing/created sink schema before
|
||||
// spawning the background task. This catches user-created sink schema mismatches at
|
||||
// CREATE FLOW time instead of surfacing them later in the execution loop.
|
||||
task.check_or_create_sink_table(&engine, &frontend).await?;
|
||||
task.validate_sink_table_schema(&engine).await?;
|
||||
|
||||
let (start_tx, start_rx) = oneshot::channel();
|
||||
|
||||
|
||||
@@ -265,6 +265,36 @@ impl BatchingTask {
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
/// Validates that the sink table schema can accept this flow's output.
|
||||
///
|
||||
/// This is a dry-run of the same schema matching logic used by runtime insert-plan
|
||||
/// generation, but without adding dirty-window filters or executing the query. It is used
|
||||
/// during CREATE FLOW to catch existing sink table mismatches early.
|
||||
pub async fn validate_sink_table_schema(&self, engine: &QueryEngineRef) -> Result<(), Error> {
|
||||
let (table, _) = get_table_info_df_schema(
|
||||
self.config.catalog_manager.clone(),
|
||||
self.config.sink_table_name.clone(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let table_meta = &table.table_info().meta;
|
||||
let merge_mode_last_non_null =
|
||||
is_merge_mode_last_non_null(&table_meta.options.extra_options);
|
||||
let primary_key_indices = table_meta.primary_key_indices.clone();
|
||||
let query_ctx = self.state.read().unwrap().query_ctx.clone();
|
||||
|
||||
gen_plan_with_matching_schema(
|
||||
&self.config.query,
|
||||
query_ctx,
|
||||
engine.clone(),
|
||||
table_meta.schema.clone(),
|
||||
&primary_key_indices,
|
||||
merge_mode_last_non_null,
|
||||
)
|
||||
.await
|
||||
.map(|_| ())
|
||||
}
|
||||
|
||||
async fn is_table_exist(&self, table_name: &[String; 3]) -> Result<bool, Error> {
|
||||
self.config
|
||||
.catalog_manager
|
||||
@@ -929,7 +959,7 @@ impl BatchingTask {
|
||||
let (expire_lower_bound, expire_upper_bound) =
|
||||
match (expire_time_window_bound, &self.config.query_type) {
|
||||
(Some((Some(l), Some(u))), QueryType::Sql) => (l, u),
|
||||
(None, QueryType::Sql) => {
|
||||
(None, QueryType::Sql) if self.config.flow_eval_interval.is_none() => {
|
||||
// if it's sql query and no time window lower/upper bound is found, just return the original query(with auto columns)
|
||||
// use sink_table_meta to add to query the `update_at` and `__ts_placeholder` column's value too for compatibility reason
|
||||
debug!(
|
||||
@@ -950,7 +980,8 @@ impl BatchingTask {
|
||||
}
|
||||
_ => {
|
||||
// Clean dirty windows for full-query/non-scoped paths,
|
||||
// such as TQL, that cannot use a time-window filter.
|
||||
// such as TQL or evaluation-interval SQL without a recognized
|
||||
// time-window expression, that cannot use a time-window filter.
|
||||
let (_, dirty_windows_to_restore) = self.drain_dirty_windows_signal();
|
||||
|
||||
let plan_info = self
|
||||
|
||||
@@ -974,6 +974,38 @@ async fn test_non_scoped_path_generates_plan_with_empty_dirty_signal() {
|
||||
assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_no_time_window_sql_with_eval_interval_generates_plan_without_dirty_signal() {
|
||||
let TestTaskParts {
|
||||
mut task,
|
||||
query_engine,
|
||||
..
|
||||
} = new_test_task_engine_and_plan_with_query(
|
||||
"SELECT number, ts FROM numbers_with_ts",
|
||||
"missing_sink",
|
||||
)
|
||||
.await;
|
||||
Arc::get_mut(&mut task.config)
|
||||
.expect("test task config should be uniquely owned")
|
||||
.flow_eval_interval = Some(Duration::from_secs(60));
|
||||
task.state.write().unwrap().dirty_time_windows.clean();
|
||||
let sink_schema = Arc::new(Schema::new(vec![
|
||||
ColumnSchema::new("number", CDT::uint32_datatype(), false),
|
||||
ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false).with_time_index(true),
|
||||
]));
|
||||
|
||||
let plan = task
|
||||
.gen_query_with_time_window(query_engine, &sink_schema, &[], false, None)
|
||||
.await
|
||||
.unwrap()
|
||||
.expect(
|
||||
"eval-interval SQL without a time-window expr should run by interval, not dirty signal",
|
||||
);
|
||||
|
||||
assert!(plan.can_advance_checkpoints);
|
||||
assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_executed_query_failure_restores_scoped_dirty_windows_for_flush_path() {
|
||||
let (task, plan) = new_test_task_and_plan_with_missing_sink().await;
|
||||
|
||||
@@ -33,9 +33,10 @@ use datafusion_common::{
|
||||
};
|
||||
use datafusion_expr::logical_plan::{Aggregate, TableScan};
|
||||
use datafusion_expr::{
|
||||
Distinct, JoinType, LogicalPlan, LogicalPlanBuilder, Operator, Projection, and, binary_expr,
|
||||
bitwise_and, bitwise_or, bitwise_xor, is_null, or, when,
|
||||
Distinct, ExprSchemable, JoinType, LogicalPlan, LogicalPlanBuilder, Operator, Projection, and,
|
||||
binary_expr, bitwise_and, bitwise_or, bitwise_xor, is_null, or, when,
|
||||
};
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
use datatypes::schema::{ColumnSchema, SchemaRef};
|
||||
use query::QueryEngineRef;
|
||||
use query::parser::{DEFAULT_LOOKBACK_STRING, PromQuery, QueryLanguageParser, QueryStatement};
|
||||
@@ -955,7 +956,7 @@ pub(crate) async fn gen_plan_with_matching_schema(
|
||||
.clone()
|
||||
.rewrite(&mut add_auto_column)
|
||||
.with_context(|_| DatafusionSnafu {
|
||||
context: format!("Failed to rewrite plan:\n {}\n", plan),
|
||||
context: "Failed to rewrite plan".to_string(),
|
||||
})?
|
||||
.data;
|
||||
Ok(plan)
|
||||
@@ -1090,33 +1091,23 @@ impl ColumnMatcherRewriter {
|
||||
}
|
||||
|
||||
/// modify the exprs in place so that it matches the schema and some auto columns are added
|
||||
fn modify_project_exprs(&mut self, mut exprs: Vec<Expr>) -> DfResult<Vec<Expr>> {
|
||||
fn modify_project_exprs(
|
||||
&mut self,
|
||||
mut exprs: Vec<Expr>,
|
||||
input_schema: &DFSchema,
|
||||
) -> DfResult<Vec<Expr>> {
|
||||
if self.allow_partial {
|
||||
return self.modify_project_exprs_with_partial(exprs);
|
||||
}
|
||||
|
||||
let original_exprs = exprs.clone();
|
||||
|
||||
let all_names = self
|
||||
.schema
|
||||
.column_schemas()
|
||||
.iter()
|
||||
.map(|c| c.name.clone())
|
||||
.collect::<BTreeSet<_>>();
|
||||
// first match by position
|
||||
for (idx, expr) in exprs.iter_mut().enumerate() {
|
||||
if !all_names.contains(&expr.qualified_name().1)
|
||||
&& let Some(col_name) = self
|
||||
.schema
|
||||
.column_schemas()
|
||||
.get(idx)
|
||||
.map(|c| c.name.clone())
|
||||
{
|
||||
// if the data type mismatched, later check_execute will error out
|
||||
// hence no need to check it here, beside, optimize pass might be able to cast it
|
||||
// so checking here is not necessary
|
||||
*expr = expr.clone().alias(col_name);
|
||||
}
|
||||
}
|
||||
|
||||
// add columns if have different column count
|
||||
let query_col_cnt = exprs.len();
|
||||
let table_col_cnt = self.schema.column_schemas().len();
|
||||
@@ -1140,10 +1131,9 @@ impl ColumnMatcherRewriter {
|
||||
// is the update at column
|
||||
exprs.push(datafusion::prelude::now().alias(&last_col_schema.name));
|
||||
} else {
|
||||
// helpful error message
|
||||
return Err(DataFusionError::Plan(format!(
|
||||
"Expect the last column in table to be timestamp column, found column {} with type {:?}",
|
||||
last_col_schema.name, last_col_schema.data_type
|
||||
return Err(DataFusionError::Plan(format_flow_sink_schema_mismatch(
|
||||
&original_exprs,
|
||||
self.schema.as_ref(),
|
||||
)));
|
||||
}
|
||||
} else if query_col_cnt + 2 == table_col_cnt {
|
||||
@@ -1170,14 +1160,110 @@ impl ColumnMatcherRewriter {
|
||||
)));
|
||||
}
|
||||
} else {
|
||||
return Err(DataFusionError::Plan(format!(
|
||||
"Expect table have 0,1 or 2 columns more than query columns, found {} query columns {:?}, {} table columns {:?}",
|
||||
query_col_cnt,
|
||||
exprs,
|
||||
table_col_cnt,
|
||||
self.schema.column_schemas()
|
||||
return Err(DataFusionError::Plan(format_flow_sink_schema_mismatch(
|
||||
&original_exprs,
|
||||
self.schema.as_ref(),
|
||||
)));
|
||||
}
|
||||
|
||||
self.match_extra_output_columns(exprs, input_schema, &original_exprs, &all_names)
|
||||
}
|
||||
|
||||
/// Match flow output columns whose names are not in the sink schema by the same position only.
|
||||
///
|
||||
/// This keeps the legacy "omit output aliases and map by position" behavior, but only when the
|
||||
/// sink column at the same index is actually missing from the flow output. If the extra output
|
||||
/// would be aliased to a sink column that already exists elsewhere, report a schema mismatch
|
||||
/// instead of guessing another sink column by type.
|
||||
///
|
||||
/// In particular, this intentionally rejects cross-position remaps like
|
||||
/// `record_time_window2 -> record_time_window`: they are easy to confuse with real schema
|
||||
/// mismatches and should be fixed by giving the flow output the sink column name explicitly.
|
||||
fn match_extra_output_columns(
|
||||
&self,
|
||||
mut exprs: Vec<Expr>,
|
||||
input_schema: &DFSchema,
|
||||
original_exprs: &[Expr],
|
||||
all_names: &BTreeSet<String>,
|
||||
) -> DfResult<Vec<Expr>> {
|
||||
let mut output_names = exprs
|
||||
.iter()
|
||||
.map(|expr| expr.qualified_name().1)
|
||||
.collect::<Vec<_>>();
|
||||
let output_name_set = output_names.iter().cloned().collect::<BTreeSet<_>>();
|
||||
let extra_expr_indices = output_names
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter_map(|(idx, name)| (!all_names.contains(name)).then_some(idx))
|
||||
.collect::<Vec<_>>();
|
||||
let missing_sink_indices = self
|
||||
.schema
|
||||
.column_schemas()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter_map(|(idx, column)| (!output_name_set.contains(&column.name)).then_some(idx))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if extra_expr_indices.is_empty() && missing_sink_indices.is_empty() {
|
||||
return Ok(exprs);
|
||||
}
|
||||
|
||||
if extra_expr_indices.len() != missing_sink_indices.len() {
|
||||
return Err(DataFusionError::Plan(format_flow_sink_schema_mismatch(
|
||||
original_exprs,
|
||||
self.schema.as_ref(),
|
||||
)));
|
||||
}
|
||||
|
||||
let mut positional_matches = Vec::new();
|
||||
for expr_idx in extra_expr_indices {
|
||||
if !missing_sink_indices.contains(&expr_idx) {
|
||||
return Err(DataFusionError::Plan(format_flow_sink_schema_mismatch(
|
||||
original_exprs,
|
||||
self.schema.as_ref(),
|
||||
)));
|
||||
}
|
||||
|
||||
let target_col_schema = &self.schema.column_schemas()[expr_idx];
|
||||
let expr_type =
|
||||
ConcreteDataType::from_arrow_type(&exprs[expr_idx].get_type(input_schema)?);
|
||||
if is_obviously_incompatible_positional_match(&expr_type, &target_col_schema.data_type)
|
||||
{
|
||||
return Err(DataFusionError::Plan(format!(
|
||||
"Cannot match flow output column '{}' to sink column '{}' by position: incompatible data types, flow output type is {:?}, sink column type is {:?}. {}",
|
||||
output_names[expr_idx],
|
||||
target_col_schema.name,
|
||||
expr_type,
|
||||
target_col_schema.data_type,
|
||||
format_flow_sink_schema_mismatch(original_exprs, self.schema.as_ref())
|
||||
)));
|
||||
}
|
||||
|
||||
let target_name = target_col_schema.name.clone();
|
||||
positional_matches.push(format!(
|
||||
"{} -> {} (flow output type: {:?}, sink column type: {:?})",
|
||||
output_names[expr_idx], target_name, expr_type, target_col_schema.data_type
|
||||
));
|
||||
exprs[expr_idx] = exprs[expr_idx].clone().alias(target_name.clone());
|
||||
output_names[expr_idx] = target_name;
|
||||
}
|
||||
|
||||
if !positional_matches.is_empty() {
|
||||
debug!(
|
||||
"Matched flow output columns to sink columns by position: {:?}",
|
||||
positional_matches
|
||||
);
|
||||
}
|
||||
|
||||
let duplicated_output_names = duplicate_names(&output_names);
|
||||
if !duplicated_output_names.is_empty() {
|
||||
return Err(DataFusionError::Plan(format!(
|
||||
"Flow output schema contains duplicate column(s) after schema matching {:?}. {}",
|
||||
duplicated_output_names,
|
||||
format_flow_sink_schema_mismatch(&exprs, self.schema.as_ref())
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(exprs)
|
||||
}
|
||||
|
||||
@@ -1186,12 +1272,9 @@ impl ColumnMatcherRewriter {
|
||||
let query_col_cnt = exprs.len();
|
||||
|
||||
if query_col_cnt > table_col_cnt {
|
||||
return Err(DataFusionError::Plan(format!(
|
||||
"Expect query column count <= table column count, found {} query columns {:?}, {} table columns {:?}",
|
||||
query_col_cnt,
|
||||
exprs,
|
||||
table_col_cnt,
|
||||
self.schema.column_schemas()
|
||||
return Err(DataFusionError::Plan(format_flow_sink_schema_mismatch(
|
||||
&exprs,
|
||||
self.schema.as_ref(),
|
||||
)));
|
||||
}
|
||||
|
||||
@@ -1209,8 +1292,9 @@ impl ColumnMatcherRewriter {
|
||||
.collect();
|
||||
if !missing.is_empty() {
|
||||
return Err(DataFusionError::Plan(format!(
|
||||
"Column(s) {:?} required by sink table are missing from flow output when merge_mode=last_non_null",
|
||||
missing
|
||||
"Column(s) {:?} required by sink table are missing from flow output when merge_mode=last_non_null. {}",
|
||||
missing,
|
||||
format_flow_sink_schema_mismatch(&exprs, self.schema.as_ref())
|
||||
)));
|
||||
}
|
||||
|
||||
@@ -1250,8 +1334,9 @@ impl ColumnMatcherRewriter {
|
||||
if !remap.is_empty() {
|
||||
let extra: Vec<_> = remap.keys().cloned().collect();
|
||||
return Err(DataFusionError::Plan(format!(
|
||||
"Flow output has extra column(s) {:?} not found in sink schema when merge_mode=last_non_null",
|
||||
extra
|
||||
"Flow output has extra column(s) {:?} not found in sink schema when merge_mode=last_non_null. {}",
|
||||
extra,
|
||||
format_flow_sink_schema_mismatch(&exprs, self.schema.as_ref())
|
||||
)));
|
||||
}
|
||||
|
||||
@@ -1281,6 +1366,80 @@ impl ColumnMatcherRewriter {
|
||||
}
|
||||
}
|
||||
|
||||
fn is_obviously_incompatible_positional_match(
|
||||
expr_type: &ConcreteDataType,
|
||||
sink_type: &ConcreteDataType,
|
||||
) -> bool {
|
||||
// This is a coarse type-family guard for legacy positional aliasing, not a strict type equality
|
||||
// check. For example, numeric width/sign differences are allowed here and left to downstream
|
||||
// coercion, and untyped NULL can be coerced to any target type. Clearly different families such
|
||||
// as timestamp vs string are rejected early.
|
||||
if expr_type.is_null() || expr_type == sink_type {
|
||||
return false;
|
||||
}
|
||||
|
||||
expr_type.is_timestamp() != sink_type.is_timestamp()
|
||||
|| expr_type.is_string() != sink_type.is_string()
|
||||
|| expr_type.is_boolean() != sink_type.is_boolean()
|
||||
|| expr_type.is_json() != sink_type.is_json()
|
||||
|| expr_type.is_vector() != sink_type.is_vector()
|
||||
}
|
||||
|
||||
/// Returns every name that occurs more than once in `names`.
///
/// Each duplicated name is reported exactly once and the result is sorted in ascending
/// order, no matter how often or where the name appears in the input.
fn duplicate_names(names: &[String]) -> Vec<String> {
    let mut first_seen = HashSet::new();
    names
        .iter()
        .map(String::as_str)
        // `insert` returns false for a value that is already present, i.e. a repeat.
        .filter(|name| !first_seen.insert(*name))
        // The ordered set both de-duplicates and sorts the repeats.
        .collect::<BTreeSet<_>>()
        .into_iter()
        .map(str::to_string)
        .collect()
}
|
||||
|
||||
fn format_flow_sink_schema_mismatch(
|
||||
query_exprs: &[Expr],
|
||||
sink_schema: &datatypes::schema::Schema,
|
||||
) -> String {
|
||||
let flow_output_columns = query_exprs
|
||||
.iter()
|
||||
.map(|expr| expr.qualified_name().1)
|
||||
.collect::<Vec<_>>();
|
||||
let sink_table_columns = sink_schema
|
||||
.column_schemas()
|
||||
.iter()
|
||||
.map(|col| col.name.clone())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let flow_output_set = flow_output_columns.iter().cloned().collect::<HashSet<_>>();
|
||||
let sink_table_set = sink_table_columns.iter().cloned().collect::<HashSet<_>>();
|
||||
|
||||
let mut extra_flow_columns = flow_output_columns
|
||||
.iter()
|
||||
.filter(|name| !sink_table_set.contains(*name))
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
extra_flow_columns.sort();
|
||||
extra_flow_columns.dedup();
|
||||
|
||||
let mut missing_sink_columns = sink_table_columns
|
||||
.iter()
|
||||
.filter(|name| !flow_output_set.contains(*name))
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
missing_sink_columns.sort();
|
||||
missing_sink_columns.dedup();
|
||||
|
||||
format!(
|
||||
"Flow output schema does not match sink table schema: found {} flow output columns and {} sink table columns. flow output columns: {:?}, sink table columns: {:?}, extra flow columns not in sink: {:?}, missing sink columns from flow output: {:?}",
|
||||
flow_output_columns.len(),
|
||||
sink_table_columns.len(),
|
||||
flow_output_columns,
|
||||
sink_table_columns,
|
||||
extra_flow_columns,
|
||||
missing_sink_columns
|
||||
)
|
||||
}
|
||||
|
||||
impl TreeNodeRewriter for ColumnMatcherRewriter {
|
||||
type Node = LogicalPlan;
|
||||
fn f_down(&mut self, mut node: Self::Node) -> DfResult<Transformed<Self::Node>> {
|
||||
@@ -1327,7 +1486,7 @@ impl TreeNodeRewriter for ColumnMatcherRewriter {
|
||||
// if not, wrap it in a projection
|
||||
if let LogicalPlan::Projection(project) = &node {
|
||||
let exprs = project.expr.clone();
|
||||
let exprs = self.modify_project_exprs(exprs)?;
|
||||
let exprs = self.modify_project_exprs(exprs, project.input.schema())?;
|
||||
|
||||
self.is_rewritten = true;
|
||||
let new_plan =
|
||||
@@ -1341,7 +1500,7 @@ impl TreeNodeRewriter for ColumnMatcherRewriter {
|
||||
field.name(),
|
||||
)));
|
||||
}
|
||||
let exprs = self.modify_project_exprs(exprs)?;
|
||||
let exprs = self.modify_project_exprs(exprs, node.schema())?;
|
||||
self.is_rewritten = true;
|
||||
let new_plan =
|
||||
LogicalPlan::Projection(Projection::try_new(exprs, Arc::new(node.clone()))?);
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use catalog::RegisterTableRequest;
|
||||
use common_recordbatch::RecordBatch;
|
||||
use common_time::Timestamp;
|
||||
use datafusion_common::tree_node::TreeNode as _;
|
||||
@@ -29,7 +30,9 @@ use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan};
|
||||
use table::test_util::MemTable;
|
||||
|
||||
use super::*;
|
||||
use crate::batching_mode::BatchingModeOptions;
|
||||
use crate::batching_mode::state::FilterExprInfo;
|
||||
use crate::batching_mode::task::{BatchingTask, TaskArgs};
|
||||
use crate::test_utils::create_test_query_engine;
|
||||
|
||||
fn u32_table(table_name: &str, columns: Vec<&str>, rows: usize) -> TableRef {
|
||||
@@ -432,9 +435,7 @@ async fn test_add_auto_column_rewriter() {
|
||||
// error datatype mismatch
|
||||
(
|
||||
"SELECT number, ts FROM numbers_with_ts",
|
||||
Err(
|
||||
"Expect the last column in table to be timestamp column, found column atat with type Int8",
|
||||
),
|
||||
Err("missing sink columns from flow output: [\"atat\"]"),
|
||||
vec![
|
||||
ColumnSchema::new("number", ConcreteDataType::int32_datatype(), true),
|
||||
ColumnSchema::new(
|
||||
@@ -498,6 +499,383 @@ async fn test_add_auto_column_rewriter() {
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_gen_plan_with_matching_schema_reports_extra_flow_columns_before_positional_alias() {
|
||||
let query_engine = create_test_query_engine();
|
||||
let ctx = QueryContext::arc();
|
||||
let sink_schema = Arc::new(Schema::new(vec![
|
||||
ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
|
||||
ColumnSchema::new(
|
||||
"ts",
|
||||
ConcreteDataType::timestamp_millisecond_datatype(),
|
||||
false,
|
||||
)
|
||||
.with_time_index(true),
|
||||
ColumnSchema::new(
|
||||
"max(numbers_with_ts.number)",
|
||||
ConcreteDataType::uint32_datatype(),
|
||||
true,
|
||||
),
|
||||
]));
|
||||
|
||||
let err = gen_plan_with_matching_schema(
|
||||
"SELECT number, number AS extra, ts, max(number) FROM numbers_with_ts GROUP BY number, ts",
|
||||
ctx,
|
||||
query_engine,
|
||||
sink_schema,
|
||||
&[],
|
||||
false,
|
||||
)
|
||||
.await
|
||||
.unwrap_err()
|
||||
.to_string();
|
||||
|
||||
assert!(
|
||||
err.contains("Flow output schema does not match sink table schema"),
|
||||
"{err}"
|
||||
);
|
||||
assert!(err.contains("flow output columns"), "{err}");
|
||||
assert!(err.contains("sink table columns"), "{err}");
|
||||
assert!(err.contains("extra flow columns not in sink"), "{err}");
|
||||
assert!(err.contains("extra"), "{err}");
|
||||
assert!(
|
||||
!err.contains("extra AS ts"),
|
||||
"schema error should not primarily expose positional alias: {err}"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_gen_plan_with_matching_schema_rejects_positional_alias_type_mismatch() {
|
||||
let query_engine = create_test_query_engine();
|
||||
let ctx = QueryContext::arc();
|
||||
let sink_schema = Arc::new(Schema::new(vec![
|
||||
ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
|
||||
ColumnSchema::new(
|
||||
"event_time",
|
||||
ConcreteDataType::timestamp_millisecond_datatype(),
|
||||
false,
|
||||
)
|
||||
.with_time_index(true),
|
||||
ColumnSchema::new(
|
||||
"max(numbers_with_ts.number)",
|
||||
ConcreteDataType::uint32_datatype(),
|
||||
true,
|
||||
),
|
||||
]));
|
||||
|
||||
let err = gen_plan_with_matching_schema(
|
||||
"SELECT number, number AS not_time, max(number) FROM numbers_with_ts GROUP BY number",
|
||||
ctx,
|
||||
query_engine,
|
||||
sink_schema,
|
||||
&[],
|
||||
false,
|
||||
)
|
||||
.await
|
||||
.unwrap_err()
|
||||
.to_string();
|
||||
|
||||
assert!(
|
||||
err.contains(
|
||||
"Cannot match flow output column 'not_time' to sink column 'event_time' by position"
|
||||
),
|
||||
"{err}"
|
||||
);
|
||||
assert!(err.contains("incompatible data types"), "{err}");
|
||||
assert!(
|
||||
!err.contains("not_time AS event_time"),
|
||||
"schema error should not expose an incompatible positional alias: {err}"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_gen_plan_with_matching_schema_rejects_cross_position_extra_column_match() {
|
||||
let query_engine = create_test_query_engine();
|
||||
let ctx = QueryContext::arc();
|
||||
let sink_schema = Arc::new(Schema::new(vec![
|
||||
ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
|
||||
ColumnSchema::new(
|
||||
"time_window",
|
||||
ConcreteDataType::timestamp_millisecond_datatype(),
|
||||
false,
|
||||
)
|
||||
.with_time_index(true),
|
||||
ColumnSchema::new(
|
||||
"ts",
|
||||
ConcreteDataType::timestamp_millisecond_datatype(),
|
||||
true,
|
||||
),
|
||||
]));
|
||||
|
||||
let err = gen_plan_with_matching_schema(
|
||||
"SELECT number, ts, date_bin('5 minutes', ts) AS time_window2 FROM numbers_with_ts GROUP BY number, ts, time_window2",
|
||||
ctx,
|
||||
query_engine,
|
||||
sink_schema,
|
||||
&[],
|
||||
false,
|
||||
)
|
||||
.await
|
||||
.unwrap_err()
|
||||
.to_string();
|
||||
|
||||
assert!(
|
||||
err.contains("Flow output schema does not match sink table schema"),
|
||||
"{err}"
|
||||
);
|
||||
assert!(err.contains("time_window2"), "{err}");
|
||||
assert!(err.contains("time_window"), "{err}");
|
||||
assert!(!err.contains("DuplicateUnqualifiedField"), "{err}");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_gen_plan_with_matching_schema_accepts_out_of_order_matching_names() {
|
||||
let query_engine = create_test_query_engine();
|
||||
let ctx = QueryContext::arc();
|
||||
let sink_schema = Arc::new(Schema::new(vec![
|
||||
ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
|
||||
ColumnSchema::new(
|
||||
"time_window",
|
||||
ConcreteDataType::timestamp_millisecond_datatype(),
|
||||
false,
|
||||
)
|
||||
.with_time_index(true),
|
||||
ColumnSchema::new(
|
||||
"ts",
|
||||
ConcreteDataType::timestamp_millisecond_datatype(),
|
||||
true,
|
||||
),
|
||||
]));
|
||||
|
||||
let plan = gen_plan_with_matching_schema(
|
||||
"SELECT number, ts, date_bin('5 minutes', ts) AS time_window FROM numbers_with_ts GROUP BY number, ts, time_window",
|
||||
ctx,
|
||||
query_engine,
|
||||
sink_schema,
|
||||
&[],
|
||||
false,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let output_names = plan
|
||||
.schema()
|
||||
.fields()
|
||||
.iter()
|
||||
.map(|field| field.name().clone())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
assert_eq!(
|
||||
output_names,
|
||||
vec![
|
||||
"number".to_string(),
|
||||
"ts".to_string(),
|
||||
"time_window".to_string()
|
||||
]
|
||||
);
|
||||
assert!(duplicate_names(&output_names).is_empty());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_gen_plan_with_matching_schema_allows_numeric_positional_alias() {
|
||||
let query_engine = create_test_query_engine();
|
||||
let ctx = QueryContext::arc();
|
||||
let sink_schema = Arc::new(Schema::new(vec![
|
||||
ColumnSchema::new("renamed_number", ConcreteDataType::int64_datatype(), true),
|
||||
ColumnSchema::new(
|
||||
"ts",
|
||||
ConcreteDataType::timestamp_millisecond_datatype(),
|
||||
false,
|
||||
)
|
||||
.with_time_index(true),
|
||||
]));
|
||||
|
||||
let plan = gen_plan_with_matching_schema(
|
||||
"SELECT number, ts FROM numbers_with_ts",
|
||||
ctx,
|
||||
query_engine,
|
||||
sink_schema,
|
||||
&[],
|
||||
false,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let sql = df_plan_to_sql(&plan).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
"SELECT numbers_with_ts.number AS renamed_number, numbers_with_ts.ts FROM numbers_with_ts",
|
||||
sql
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_gen_plan_with_matching_schema_allows_null_positional_alias() {
|
||||
let query_engine = create_test_query_engine();
|
||||
let ctx = QueryContext::arc();
|
||||
let sink_schema = Arc::new(Schema::new(vec![
|
||||
ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
|
||||
ColumnSchema::new("label", ConcreteDataType::string_datatype(), true),
|
||||
]));
|
||||
|
||||
let plan = gen_plan_with_matching_schema(
|
||||
"SELECT number, NULL AS label_placeholder FROM numbers_with_ts",
|
||||
ctx,
|
||||
query_engine,
|
||||
sink_schema,
|
||||
&[],
|
||||
false,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let output_names = plan
|
||||
.schema()
|
||||
.fields()
|
||||
.iter()
|
||||
.map(|field| field.name().clone())
|
||||
.collect::<Vec<_>>();
|
||||
let sql = df_plan_to_sql(&plan).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
output_names,
|
||||
vec!["number".to_string(), "label".to_string()]
|
||||
);
|
||||
assert!(sql.contains("NULL AS label"), "{sql}");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_gen_plan_with_matching_schema_accepts_matching_flow_schema() {
|
||||
let query_engine = create_test_query_engine();
|
||||
let ctx = QueryContext::arc();
|
||||
let sink_schema = Arc::new(Schema::new(vec![
|
||||
ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
|
||||
ColumnSchema::new("extra", ConcreteDataType::uint32_datatype(), true),
|
||||
ColumnSchema::new(
|
||||
"ts",
|
||||
ConcreteDataType::timestamp_millisecond_datatype(),
|
||||
false,
|
||||
)
|
||||
.with_time_index(true),
|
||||
ColumnSchema::new(
|
||||
"max(numbers_with_ts.number)",
|
||||
ConcreteDataType::uint32_datatype(),
|
||||
true,
|
||||
),
|
||||
]));
|
||||
|
||||
let plan = gen_plan_with_matching_schema(
|
||||
"SELECT number, number AS extra, ts, max(number) FROM numbers_with_ts GROUP BY number, ts",
|
||||
ctx,
|
||||
query_engine,
|
||||
sink_schema,
|
||||
&[],
|
||||
false,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let sql = df_plan_to_sql(&plan).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
"SELECT numbers_with_ts.number, numbers_with_ts.number AS extra, numbers_with_ts.ts, max(numbers_with_ts.number) FROM numbers_with_ts GROUP BY numbers_with_ts.number, numbers_with_ts.ts",
|
||||
sql
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_validate_sink_table_schema_rejects_existing_sink_missing_flow_column() {
|
||||
let query_engine = create_test_query_engine();
|
||||
let query_ctx = QueryContext::arc();
|
||||
let sql = "SELECT number, number AS extra, max(number) FROM numbers_with_ts GROUP BY number";
|
||||
let plan = sql_to_df_plan(query_ctx.clone(), query_engine.clone(), sql, true)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let catalog_manager = catalog::memory::new_memory_catalog_manager().unwrap();
|
||||
let sink_table_name = [
|
||||
"greptime".to_string(),
|
||||
"public".to_string(),
|
||||
"existing_sink".to_string(),
|
||||
];
|
||||
let sink_table = u32_table(
|
||||
"existing_sink",
|
||||
vec!["number", "max(numbers_with_ts.number)"],
|
||||
0,
|
||||
);
|
||||
catalog_manager
|
||||
.register_table_sync(RegisterTableRequest {
|
||||
catalog: sink_table_name[0].clone(),
|
||||
schema: sink_table_name[1].clone(),
|
||||
table_name: sink_table_name[2].clone(),
|
||||
table_id: 4096,
|
||||
table: sink_table,
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let (_shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel();
|
||||
let task = BatchingTask::try_new(TaskArgs {
|
||||
flow_id: 1,
|
||||
query: sql,
|
||||
plan,
|
||||
time_window_expr: None,
|
||||
expire_after: None,
|
||||
sink_table_name,
|
||||
source_table_names: vec![[
|
||||
"greptime".to_string(),
|
||||
"public".to_string(),
|
||||
"numbers_with_ts".to_string(),
|
||||
]],
|
||||
query_ctx,
|
||||
catalog_manager,
|
||||
shutdown_rx,
|
||||
batch_opts: Arc::new(BatchingModeOptions::default()),
|
||||
flow_eval_interval: None,
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let err = task
|
||||
.validate_sink_table_schema(&query_engine)
|
||||
.await
|
||||
.unwrap_err()
|
||||
.to_string();
|
||||
|
||||
assert!(
|
||||
err.contains("Flow output schema does not match sink table schema"),
|
||||
"{err}"
|
||||
);
|
||||
assert!(err.contains("extra"), "{err}");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_gen_plan_with_matching_schema_allow_partial_fills_nullable_columns() {
|
||||
let query_engine = create_test_query_engine();
|
||||
let ctx = QueryContext::arc();
|
||||
let sink_schema = Arc::new(Schema::new(vec![
|
||||
ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), false),
|
||||
ColumnSchema::new(
|
||||
"ts",
|
||||
ConcreteDataType::timestamp_millisecond_datatype(),
|
||||
false,
|
||||
)
|
||||
.with_time_index(true),
|
||||
ColumnSchema::new("optional_value", ConcreteDataType::uint32_datatype(), true),
|
||||
]));
|
||||
|
||||
let plan = gen_plan_with_matching_schema(
|
||||
"SELECT number, ts FROM numbers_with_ts",
|
||||
ctx,
|
||||
query_engine,
|
||||
sink_schema,
|
||||
&[0],
|
||||
true,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let sql = df_plan_to_sql(&plan).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
"SELECT numbers_with_ts.number, numbers_with_ts.ts, NULL AS optional_value FROM numbers_with_ts",
|
||||
sql
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_find_group_by_exprs() {
|
||||
let testcases = vec![
|
||||
@@ -1491,3 +1869,118 @@ async fn test_analyze_incremental_aggregate_plan_rejects_cast_wrapped_alias() {
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_gen_plan_with_matching_schema_last_non_null_rejects_missing_primary_key_column() {
|
||||
let query_engine = create_test_query_engine();
|
||||
let ctx = QueryContext::arc();
|
||||
// Sink table with primary_key_indices=[0] ("number"), time_index="ts", and merge_mode=last_non_null.
|
||||
// The flow query omits "number", which is a required primary-key column.
|
||||
let sink_schema = Arc::new(Schema::new(vec![
|
||||
ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
|
||||
ColumnSchema::new(
|
||||
"ts",
|
||||
ConcreteDataType::timestamp_millisecond_datatype(),
|
||||
false,
|
||||
)
|
||||
.with_time_index(true),
|
||||
ColumnSchema::new("optional_value", ConcreteDataType::uint32_datatype(), true),
|
||||
]));
|
||||
|
||||
let err = gen_plan_with_matching_schema(
|
||||
"SELECT ts FROM numbers_with_ts",
|
||||
ctx,
|
||||
query_engine,
|
||||
sink_schema,
|
||||
&[0],
|
||||
true,
|
||||
)
|
||||
.await
|
||||
.unwrap_err()
|
||||
.to_string();
|
||||
|
||||
assert!(
|
||||
err.contains(
|
||||
"required by sink table are missing from flow output when merge_mode=last_non_null"
|
||||
),
|
||||
"{err}"
|
||||
);
|
||||
assert!(err.contains("number"), "{err}");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_gen_plan_with_matching_schema_last_non_null_rejects_missing_time_index_column() {
|
||||
let query_engine = create_test_query_engine();
|
||||
let ctx = QueryContext::arc();
|
||||
// Sink table with primary_key_indices=[0] ("number"), time_index="ts", and merge_mode=last_non_null.
|
||||
// The flow query omits "ts", which is a required time-index column.
|
||||
let sink_schema = Arc::new(Schema::new(vec![
|
||||
ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
|
||||
ColumnSchema::new(
|
||||
"ts",
|
||||
ConcreteDataType::timestamp_millisecond_datatype(),
|
||||
false,
|
||||
)
|
||||
.with_time_index(true),
|
||||
ColumnSchema::new("optional_value", ConcreteDataType::uint32_datatype(), true),
|
||||
]));
|
||||
|
||||
let err = gen_plan_with_matching_schema(
|
||||
"SELECT number FROM numbers_with_ts",
|
||||
ctx,
|
||||
query_engine,
|
||||
sink_schema,
|
||||
&[0],
|
||||
true,
|
||||
)
|
||||
.await
|
||||
.unwrap_err()
|
||||
.to_string();
|
||||
|
||||
assert!(
|
||||
err.contains(
|
||||
"required by sink table are missing from flow output when merge_mode=last_non_null"
|
||||
),
|
||||
"{err}"
|
||||
);
|
||||
assert!(err.contains("ts"), "{err}");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_gen_plan_with_matching_schema_last_non_null_rejects_extra_flow_column() {
|
||||
let query_engine = create_test_query_engine();
|
||||
let ctx = QueryContext::arc();
|
||||
// Sink table with merge_mode=last_non_null.
|
||||
// Sink has 3 columns: number (pk), ts (time_index), optional_value (nullable).
|
||||
// Flow outputs: number, number AS extra, ts → "extra" is not in sink schema.
|
||||
// query_col_cnt(3) <= table_col_cnt(3), so the extra branch is reached.
|
||||
let sink_schema = Arc::new(Schema::new(vec![
|
||||
ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
|
||||
ColumnSchema::new(
|
||||
"ts",
|
||||
ConcreteDataType::timestamp_millisecond_datatype(),
|
||||
false,
|
||||
)
|
||||
.with_time_index(true),
|
||||
ColumnSchema::new("optional_value", ConcreteDataType::uint32_datatype(), true),
|
||||
]));
|
||||
|
||||
let err = gen_plan_with_matching_schema(
|
||||
"SELECT number, number AS extra, ts FROM numbers_with_ts",
|
||||
ctx,
|
||||
query_engine,
|
||||
sink_schema,
|
||||
&[0],
|
||||
true,
|
||||
)
|
||||
.await
|
||||
.unwrap_err()
|
||||
.to_string();
|
||||
|
||||
assert!(err.contains("extra column(s)"), "{err}");
|
||||
assert!(err.contains("extra"), "{err}");
|
||||
assert!(
|
||||
err.contains("Flow output schema does not match sink table schema"),
|
||||
"{err}"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -288,7 +288,6 @@ where
|
||||
|
||||
let http_server = builder
|
||||
.with_metrics_handler(MetricsHandler)
|
||||
.with_plugins(self.plugins.clone())
|
||||
.with_greptime_config_options(toml)
|
||||
.build();
|
||||
Ok(http_server)
|
||||
|
||||
@@ -1344,7 +1344,7 @@ mod tests {
|
||||
|
||||
// Generates roughly 10MB of data, which is larger than the default gRPC message size limit.
|
||||
for i in 0..10 {
|
||||
let data: Vec<u8> = (0..1024 * 1024).map(|_| rng.random()).collect();
|
||||
let data: Vec<u8> = (0..1024 * 1024).map(|_| rng.random::<u8>()).collect();
|
||||
in_memory
|
||||
.put(
|
||||
PutRequest::new()
|
||||
|
||||
@@ -18,7 +18,9 @@ use std::ops::Div;
|
||||
use api::v1::meta::MailboxMessage;
|
||||
use common_meta::RegionIdent;
|
||||
use common_meta::distributed_time_constants::default_distributed_time_constants;
|
||||
use common_meta::instruction::{Instruction, InstructionReply, OpenRegion, SimpleReply};
|
||||
use common_meta::instruction::{
|
||||
Instruction, InstructionReply, OpenRegion, OpenRegionReason, SimpleReply,
|
||||
};
|
||||
use common_meta::key::datanode_table::RegionInfo;
|
||||
use common_procedure::{Context as ProcedureContext, Status};
|
||||
use common_telemetry::info;
|
||||
@@ -26,12 +28,13 @@ use common_telemetry::tracing_context::TracingContext;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use store_api::region_engine::RegionRole;
|
||||
use store_api::region_request::RegionRequirements;
|
||||
use tokio::time::Instant;
|
||||
|
||||
use crate::error::{self, Result};
|
||||
use crate::handler::HeartbeatMailbox;
|
||||
use crate::procedure::region_migration::flush_leader_region::PreFlushRegion;
|
||||
use crate::procedure::region_migration::{Context, State};
|
||||
use crate::procedure::region_migration::{Context, RegionMigrationTriggerReason, State};
|
||||
use crate::service::mailbox::Channel;
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
@@ -67,6 +70,10 @@ impl OpenCandidateRegion {
|
||||
let region_ids = ctx.persistent_ctx.region_ids.clone();
|
||||
let from_peer_id = ctx.persistent_ctx.from_peer.id;
|
||||
let to_peer_id = ctx.persistent_ctx.to_peer.id;
|
||||
let reason = match ctx.persistent_ctx.trigger_reason {
|
||||
RegionMigrationTriggerReason::Failover => OpenRegionReason::RegionFailover,
|
||||
_ => OpenRegionReason::RegionMigration,
|
||||
};
|
||||
let datanode_table_values = ctx.get_from_peer_datanode_table_values().await?;
|
||||
let mut open_regions = Vec::with_capacity(region_ids.len());
|
||||
|
||||
@@ -97,6 +104,8 @@ impl OpenCandidateRegion {
|
||||
region_options,
|
||||
region_wal_options,
|
||||
true,
|
||||
Some(reason),
|
||||
RegionRequirements::object_storage(),
|
||||
));
|
||||
}
|
||||
|
||||
@@ -233,18 +242,20 @@ mod tests {
|
||||
}
|
||||
|
||||
fn new_mock_open_instruction(datanode_id: DatanodeId, region_id: RegionId) -> Instruction {
|
||||
Instruction::OpenRegions(vec![OpenRegion {
|
||||
region_ident: RegionIdent {
|
||||
Instruction::OpenRegions(vec![OpenRegion::new(
|
||||
RegionIdent {
|
||||
datanode_id,
|
||||
table_id: region_id.table_id(),
|
||||
region_number: region_id.region_number(),
|
||||
engine: MITO2_ENGINE.to_string(),
|
||||
},
|
||||
region_storage_path: "/bar/foo/region/".to_string(),
|
||||
region_options: Default::default(),
|
||||
region_wal_options: Default::default(),
|
||||
skip_wal_replay: true,
|
||||
}])
|
||||
"/bar/foo/region/",
|
||||
Default::default(),
|
||||
Default::default(),
|
||||
true,
|
||||
Some(OpenRegionReason::RegionMigration),
|
||||
RegionRequirements::object_storage(),
|
||||
)])
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -263,6 +274,57 @@ mod tests {
|
||||
assert!(!err.is_retryable());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_build_open_region_instruction_reason() {
|
||||
let state = OpenCandidateRegion;
|
||||
let mut persistent_context = new_persistent_context();
|
||||
let from_peer_id = persistent_context.from_peer.id;
|
||||
let region_id = persistent_context.region_ids[0];
|
||||
let env = TestingEnv::new();
|
||||
|
||||
let table_info = new_test_table_info(1024);
|
||||
let region_routes = vec![RegionRoute {
|
||||
region: Region::new_test(region_id),
|
||||
leader_peer: Some(Peer::empty(from_peer_id)),
|
||||
..Default::default()
|
||||
}];
|
||||
env.table_metadata_manager()
|
||||
.create_table_metadata(
|
||||
table_info,
|
||||
TableRouteValue::physical(region_routes),
|
||||
HashMap::default(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut ctx = env
|
||||
.context_factory()
|
||||
.new_context(persistent_context.clone());
|
||||
let instruction = state.build_open_region_instruction(&mut ctx).await.unwrap();
|
||||
let open_regions = instruction.into_open_regions().unwrap();
|
||||
assert_eq!(
|
||||
Some(OpenRegionReason::RegionMigration),
|
||||
open_regions[0].reason
|
||||
);
|
||||
assert_eq!(
|
||||
RegionRequirements::object_storage(),
|
||||
open_regions[0].requirements
|
||||
);
|
||||
|
||||
persistent_context.trigger_reason = RegionMigrationTriggerReason::Failover;
|
||||
let mut ctx = env.context_factory().new_context(persistent_context);
|
||||
let instruction = state.build_open_region_instruction(&mut ctx).await.unwrap();
|
||||
let open_regions = instruction.into_open_regions().unwrap();
|
||||
assert_eq!(
|
||||
Some(OpenRegionReason::RegionFailover),
|
||||
open_regions[0].reason
|
||||
);
|
||||
assert_eq!(
|
||||
RegionRequirements::object_storage(),
|
||||
open_regions[0].requirements
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_datanode_is_unreachable() {
|
||||
let state = OpenCandidateRegion;
|
||||
|
||||
@@ -620,6 +620,7 @@ mod test {
|
||||
options: physical_region_option,
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
};
|
||||
engine
|
||||
.handle_request(physical_region_id, RegionRequest::Open(open_request))
|
||||
@@ -644,6 +645,7 @@ mod test {
|
||||
options: HashMap::new(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
};
|
||||
engine
|
||||
.handle_request(
|
||||
@@ -721,6 +723,7 @@ mod test {
|
||||
options: physical_region_option,
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
};
|
||||
// Opening an already opened region should succeed.
|
||||
// Since the region is already open, no metadata recovery operations will be performed.
|
||||
@@ -749,6 +752,7 @@ mod test {
|
||||
options: physical_region_option,
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
};
|
||||
let err = metric_engine
|
||||
.handle_request(physical_region_id, RegionRequest::Open(open_request))
|
||||
@@ -854,6 +858,7 @@ mod test {
|
||||
options: options.clone(),
|
||||
skip_wal_replay: true,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
},
|
||||
)
|
||||
})
|
||||
|
||||
@@ -222,6 +222,7 @@ impl MetricEngineInner {
|
||||
entry_id: checkpoint.metadata_entry_id.unwrap_or_default(),
|
||||
metadata_entry_id: None,
|
||||
}),
|
||||
requirements: request.requirements,
|
||||
};
|
||||
|
||||
let mut data_region_options = request.options;
|
||||
@@ -239,6 +240,7 @@ impl MetricEngineInner {
|
||||
entry_id: checkpoint.entry_id,
|
||||
metadata_entry_id: None,
|
||||
}),
|
||||
requirements: request.requirements,
|
||||
};
|
||||
|
||||
(open_metadata_region_request, open_data_region_request)
|
||||
|
||||
@@ -321,6 +321,7 @@ mod tests {
|
||||
options: physical_region_option,
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -144,6 +144,7 @@ impl TestEnv {
|
||||
options: physical_region_option,
|
||||
skip_wal_replay: true,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -8,6 +8,7 @@ license.workspace = true
|
||||
default = []
|
||||
test = ["common-test-util", "rstest", "rstest_reuse", "rskafka"]
|
||||
testing = ["test"]
|
||||
test-shared-fs-region-migration = []
|
||||
enterprise = []
|
||||
vector_index = ["dep:roaring", "index/vector_index"]
|
||||
|
||||
|
||||
@@ -277,6 +277,7 @@ async fn test_alter_region_with_format(flat_format: bool) {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -481,6 +482,7 @@ async fn test_put_after_alter_with_format(flat_format: bool) {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -844,6 +846,7 @@ async fn test_alter_column_fulltext_options_with_format(flat_format: bool) {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -979,6 +982,7 @@ async fn test_alter_column_set_inverted_index_with_format(flat_format: bool) {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -1248,6 +1252,7 @@ async fn test_alter_region_sst_format_with_flush() {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -1366,6 +1371,7 @@ async fn test_alter_region_sst_format_without_flush() {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -1492,6 +1498,7 @@ async fn test_alter_region_sst_format_flat_to_pk_with_flush() {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -1610,6 +1617,7 @@ async fn test_alter_region_sst_format_flat_to_pk_without_flush() {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -1725,6 +1733,7 @@ async fn test_alter_region_append_mode_with_flush() {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -1843,6 +1852,7 @@ async fn test_alter_region_append_mode_without_flush() {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -348,6 +348,7 @@ async fn test_alter_append_mode_clears_merge_mode_with_format(flat_format: bool)
|
||||
options,
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -196,6 +196,7 @@ async fn test_region_replay_with_format(factory: Option<LogStoreFactory>, flat_f
|
||||
options,
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -160,6 +160,7 @@ async fn test_batch_catchup_with_format(factory: Option<LogStoreFactory>, flat_f
|
||||
skip_wal_replay: true,
|
||||
path_type: PathType::Bare,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
},
|
||||
)
|
||||
})
|
||||
|
||||
@@ -136,6 +136,7 @@ async fn test_batch_open_with_format(factory: Option<LogStoreFactory>, flat_form
|
||||
skip_wal_replay: false,
|
||||
path_type: PathType::Bare,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
},
|
||||
)
|
||||
})
|
||||
@@ -149,6 +150,7 @@ async fn test_batch_open_with_format(factory: Option<LogStoreFactory>, flat_form
|
||||
skip_wal_replay: false,
|
||||
path_type: PathType::Bare,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
},
|
||||
));
|
||||
|
||||
@@ -221,6 +223,7 @@ async fn test_batch_open_err_with_format(factory: Option<LogStoreFactory>, flat_
|
||||
skip_wal_replay: false,
|
||||
path_type: PathType::Bare,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
},
|
||||
)
|
||||
})
|
||||
|
||||
@@ -112,6 +112,7 @@ async fn test_bump_committed_sequence_with_format(flat_format: bool) {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -151,6 +152,7 @@ async fn test_bump_committed_sequence_with_format(flat_format: bool) {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -97,6 +97,7 @@ async fn test_catchup_with_last_entry_id(factory: Option<LogStoreFactory>) {
|
||||
options,
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -218,6 +219,7 @@ async fn test_catchup_with_incorrect_last_entry_id(factory: Option<LogStoreFacto
|
||||
options,
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -321,6 +323,7 @@ async fn test_catchup_without_last_entry_id(factory: Option<LogStoreFactory>) {
|
||||
options,
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -423,6 +426,7 @@ async fn test_catchup_with_manifest_update(factory: Option<LogStoreFactory>) {
|
||||
options,
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -527,6 +531,7 @@ async fn open_region(
|
||||
skip_wal_replay,
|
||||
path_type: PathType::Bare,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -622,6 +627,7 @@ async fn test_local_catchup(factory: Option<LogStoreFactory>) {
|
||||
skip_wal_replay: true,
|
||||
path_type: PathType::Bare,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -1023,6 +1023,7 @@ async fn test_change_region_compaction_window_with_format(flat_format: bool) {
|
||||
options: Default::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -1125,6 +1126,7 @@ async fn test_open_overwrite_compaction_window_with_format(flat_format: bool) {
|
||||
options,
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -64,6 +64,7 @@ async fn test_engine_open_empty_with_format(flat_format: bool) {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -110,6 +111,7 @@ async fn test_engine_open_existing_with_format(flat_format: bool) {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -237,6 +239,7 @@ async fn test_engine_region_open_with_options_with_format(flat_format: bool) {
|
||||
options: HashMap::from([("ttl".to_string(), "4d".to_string())]),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -297,6 +300,7 @@ async fn test_engine_region_open_with_custom_store_with_format(flat_format: bool
|
||||
options: HashMap::from([("storage".to_string(), "Gcs".to_string())]),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -392,6 +396,7 @@ async fn test_open_region_skip_wal_replay_with_format(flat_format: bool) {
|
||||
options: Default::default(),
|
||||
skip_wal_replay: true,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -431,6 +436,7 @@ async fn test_open_region_skip_wal_replay_with_format(flat_format: bool) {
|
||||
options: Default::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -484,6 +490,7 @@ async fn test_open_region_wait_for_opening_region_ok_with_format(flat_format: bo
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -535,6 +542,7 @@ async fn test_open_region_wait_for_opening_region_err_with_format(flat_format: b
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -691,6 +699,7 @@ async fn test_open_backfills_partition_expr_with_fetcher() {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -725,6 +734,7 @@ async fn test_open_backfills_partition_expr_with_fetcher() {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -766,6 +776,7 @@ async fn test_open_keeps_none_without_fetcher() {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -52,6 +52,7 @@ async fn scan_in_parallel(
|
||||
skip_wal_replay: false,
|
||||
path_type: PathType::Bare,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -87,6 +87,7 @@ async fn test_close_region_skip_wal(insert: bool) {
|
||||
options: request.options.clone(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -154,6 +155,7 @@ async fn test_close_follower_region_skip_wal() {
|
||||
options: request.options.clone(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -271,6 +273,7 @@ async fn test_close_region_after_truncate_skip_wal() {
|
||||
options: request.options,
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -127,6 +127,7 @@ async fn test_sync_after_flush_region_with_format(flat_format: bool) {
|
||||
// Ensure the region is not replayed from the WAL.
|
||||
skip_wal_replay: true,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -239,6 +240,7 @@ async fn test_sync_after_alter_region_with_format(flat_format: bool) {
|
||||
// Ensure the region is not replayed from the WAL.
|
||||
skip_wal_replay: true,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -323,6 +323,7 @@ async fn test_engine_truncate_reopen_with_format(flat_format: bool) {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -447,6 +448,7 @@ async fn test_engine_truncate_during_flush_with_format(flat_format: bool) {
|
||||
options: HashMap::default(),
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -916,6 +916,20 @@ pub enum Error {
|
||||
source: Arc<Error>,
|
||||
},
|
||||
|
||||
#[snafu(display(
|
||||
"Region {} does not satisfy open requirement '{}': {}",
|
||||
region_id,
|
||||
requirement,
|
||||
reason
|
||||
))]
|
||||
OpenRegionRequirement {
|
||||
region_id: RegionId,
|
||||
requirement: &'static str,
|
||||
reason: &'static str,
|
||||
#[snafu(implicit)]
|
||||
location: Location,
|
||||
},
|
||||
|
||||
#[snafu(display("Failed to parse job id"))]
|
||||
ParseJobId {
|
||||
#[snafu(implicit)]
|
||||
@@ -1376,6 +1390,7 @@ impl ErrorExt for Error {
|
||||
PrimaryKeyLengthMismatch { .. } => StatusCode::InvalidArguments,
|
||||
InvalidSender { .. } => StatusCode::InvalidArguments,
|
||||
InvalidSchedulerState { .. } => StatusCode::InvalidArguments,
|
||||
OpenRegionRequirement { .. } => StatusCode::InvalidArguments,
|
||||
DeleteSsts { .. } | DeleteIndex { .. } | DeleteIndexes { .. } => {
|
||||
StatusCode::StorageUnavailable
|
||||
}
|
||||
|
||||
@@ -27,8 +27,9 @@ use futures::future::BoxFuture;
|
||||
use log_store::kafka::log_store::KafkaLogStore;
|
||||
use log_store::noop::log_store::NoopLogStore;
|
||||
use log_store::raft_engine::log_store::RaftEngineLogStore;
|
||||
use object_store::ObjectStore;
|
||||
use object_store::manager::ObjectStoreManagerRef;
|
||||
use object_store::util::normalize_dir;
|
||||
use object_store::util::{is_object_storage, normalize_dir};
|
||||
use snafu::{OptionExt, ResultExt, ensure};
|
||||
use store_api::logstore::LogStore;
|
||||
use store_api::logstore::provider::Provider;
|
||||
@@ -36,7 +37,7 @@ use store_api::metadata::{
|
||||
ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
|
||||
};
|
||||
use store_api::region_engine::RegionRole;
|
||||
use store_api::region_request::PathType;
|
||||
use store_api::region_request::{PathType, RegionRequirements};
|
||||
use store_api::storage::{ColumnId, RegionId};
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
@@ -46,8 +47,8 @@ use crate::cache::file_cache::{FileCache, FileType, IndexKey};
|
||||
use crate::config::MitoConfig;
|
||||
use crate::error;
|
||||
use crate::error::{
|
||||
EmptyRegionDirSnafu, InvalidMetadataSnafu, ObjectStoreNotFoundSnafu, RegionCorruptedSnafu,
|
||||
Result, StaleLogEntrySnafu,
|
||||
EmptyRegionDirSnafu, InvalidMetadataSnafu, InvalidRegionOptionsSnafu, ObjectStoreNotFoundSnafu,
|
||||
RegionCorruptedSnafu, Result, StaleLogEntrySnafu,
|
||||
};
|
||||
use crate::manifest::action::RegionManifest;
|
||||
use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions};
|
||||
@@ -206,6 +207,29 @@ impl RegionOpener {
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
/// Ensures the current region open request satisfies its requirements.
|
||||
pub(crate) fn ensure_open_requirements(&self, requirements: RegionRequirements) -> Result<()> {
|
||||
if !requirements.object_storage {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let options = self.options.as_ref().context(InvalidRegionOptionsSnafu {
|
||||
reason: "missing region options before requirement check".to_string(),
|
||||
})?;
|
||||
let object_store = get_object_store(&options.storage, &self.object_store_manager)?;
|
||||
|
||||
ensure!(
|
||||
supports_open_region_object_storage_requirement(&object_store),
|
||||
error::OpenRegionRequirementSnafu {
|
||||
region_id: self.region_id,
|
||||
requirement: "object storage",
|
||||
reason: "region data must be accessible from another datanode",
|
||||
}
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Sets the cache manager for the region.
|
||||
pub(crate) fn cache(mut self, cache_manager: Option<CacheManagerRef>) -> Self {
|
||||
self.cache_manager = cache_manager;
|
||||
@@ -597,6 +621,21 @@ impl RegionOpener {
|
||||
}
|
||||
}
|
||||
|
||||
/// Reports whether `object_store` satisfies the object-storage open requirement.
///
/// This variant is compiled when the `test-shared-fs-region-migration` feature
/// is disabled and simply delegates to `is_object_storage`.
#[cfg(not(feature = "test-shared-fs-region-migration"))]
|
||||
fn supports_open_region_object_storage_requirement(object_store: &ObjectStore) -> bool {
|
||||
is_object_storage(object_store)
|
||||
}
|
||||
|
||||
/// Reports whether `object_store` satisfies the object-storage open requirement.
///
/// This variant is compiled only with the `test-shared-fs-region-migration`
/// feature and additionally accepts stores whose scheme is `FS_SCHEME`.
#[cfg(feature = "test-shared-fs-region-migration")]
fn supports_open_region_object_storage_requirement(object_store: &ObjectStore) -> bool {
    if is_object_storage(object_store) {
        return true;
    }

    // Integration tests can configure multiple datanodes to share the same
    // temporary home dir. That makes file storage accessible to all test
    // datanodes, but production file storage still does not satisfy this
    // requirement.
    object_store.info().scheme() == object_store::services::FS_SCHEME
}
|
||||
|
||||
/// Creates a version builder from a region manifest.
|
||||
pub(crate) fn version_builder_from_manifest(
|
||||
manifest: &RegionManifest,
|
||||
@@ -1172,14 +1211,17 @@ mod tests {
|
||||
use datatypes::arrow::array::{ArrayRef, BinaryArray, Int64Array};
|
||||
use datatypes::arrow::record_batch::RecordBatch;
|
||||
use object_store::ObjectStore;
|
||||
use object_store::services::{Fs, Memory};
|
||||
use object_store::services::{Fs, Memory, S3};
|
||||
use parquet::arrow::ArrowWriter;
|
||||
use parquet::file::metadata::KeyValue;
|
||||
use parquet::file::properties::WriterProperties;
|
||||
use store_api::region_request::PathType;
|
||||
use store_api::storage::{FileId, RegionId};
|
||||
|
||||
use super::{preload_parquet_meta_cache_for_files, sanitize_region_options};
|
||||
use super::{
|
||||
preload_parquet_meta_cache_for_files, sanitize_region_options,
|
||||
supports_open_region_object_storage_requirement,
|
||||
};
|
||||
use crate::cache::CacheManager;
|
||||
use crate::cache::file_cache::{FileType, IndexKey};
|
||||
use crate::manifest::action::{RegionManifest, RemovedFilesRecord};
|
||||
@@ -1207,6 +1249,48 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
fn build_fs_object_store() -> ObjectStore {
|
||||
ObjectStore::new(Fs::default().root("/tmp"))
|
||||
.unwrap()
|
||||
.finish()
|
||||
}
|
||||
|
||||
/// Without the shared-fs test feature, an `Fs`-backed store must not satisfy
/// the object-storage requirement.
#[test]
#[cfg(not(feature = "test-shared-fs-region-migration"))]
fn test_open_requirement_rejects_fs_object_store() {
    let fs_store = build_fs_object_store();
    let supported = supports_open_region_object_storage_requirement(&fs_store);

    assert!(!supported);
}
|
||||
|
||||
/// With the shared-fs test feature enabled, an `Fs`-backed store is accepted.
#[test]
#[cfg(feature = "test-shared-fs-region-migration")]
fn test_open_requirement_accepts_shared_fs_object_store_for_tests() {
    let fs_store = build_fs_object_store();
    let supported = supports_open_region_object_storage_requirement(&fs_store);

    assert!(supported);
}
|
||||
|
||||
/// An S3-backed store satisfies the object-storage requirement.
#[test]
fn test_open_requirement_accepts_s3_object_store() {
    let builder = S3::default()
        .bucket("test-bucket")
        .region("us-east-1")
        .disable_ec2_metadata();
    let s3_store = ObjectStore::new(builder).unwrap().finish();

    let supported = supports_open_region_object_storage_requirement(&s3_store);
    assert!(supported);
}
|
||||
|
||||
#[test]
|
||||
fn test_sanitize_region_options_options_format_wins() {
|
||||
// Manifest persisted PrimaryKey, but the re-parsed options now request Flat
|
||||
|
||||
@@ -1307,6 +1307,7 @@ pub async fn reopen_region(
|
||||
skip_wal_replay: false,
|
||||
path_type: PathType::Bare,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -87,14 +87,11 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
else {
|
||||
return;
|
||||
};
|
||||
if let Err(err) = self.check_and_cleanup_region(region_id, &request).await {
|
||||
sender.send(Err(err));
|
||||
return;
|
||||
}
|
||||
info!("Try to open region {}, worker: {}", region_id, self.id);
|
||||
sanitize_open_request_options(&mut request.options);
|
||||
|
||||
// Open region from specific region dir.
|
||||
let requirements = request.requirements;
|
||||
let opener = match RegionOpener::new(
|
||||
region_id,
|
||||
&request.table_dir,
|
||||
@@ -112,7 +109,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
.cache(Some(self.cache_manager.clone()))
|
||||
.wal_entry_reader(wal_entry_receiver.map(|receiver| Box::new(receiver) as _))
|
||||
.replay_checkpoint(request.checkpoint.map(|checkpoint| checkpoint.entry_id))
|
||||
.parse_options(request.options)
|
||||
.parse_options(request.options.clone())
|
||||
{
|
||||
Ok(opener) => opener,
|
||||
Err(err) => {
|
||||
@@ -121,6 +118,16 @@ impl<S: LogStore> RegionWorkerLoop<S> {
|
||||
}
|
||||
};
|
||||
|
||||
if let Err(err) = opener.ensure_open_requirements(requirements) {
|
||||
sender.send(Err(err));
|
||||
return;
|
||||
}
|
||||
|
||||
if let Err(err) = self.check_and_cleanup_region(region_id, &request).await {
|
||||
sender.send(Err(err));
|
||||
return;
|
||||
}
|
||||
|
||||
let now = Instant::now();
|
||||
let regions = self.regions.clone();
|
||||
let wal = self.wal.clone();
|
||||
|
||||
@@ -22,11 +22,17 @@ use opendal::layers::{
|
||||
LoggingInterceptor, LoggingLayer, RetryEvent, RetryInterceptor, RetryLayer, TracingLayer,
|
||||
};
|
||||
use opendal::raw::{AccessorInfo, HttpClient, Operation};
|
||||
use opendal::services::FS_SCHEME;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::config::HttpClientConfig;
|
||||
use crate::{ObjectStore, error};
|
||||
|
||||
/// Returns true if the object store is not backed by local filesystem.
|
||||
pub fn is_object_storage(object_store: &ObjectStore) -> bool {
|
||||
object_store.info().scheme() != FS_SCHEME
|
||||
}
|
||||
|
||||
/// Join two paths and normalize the output dir.
|
||||
///
|
||||
/// The output dir always ends with `/`, e.g.
|
||||
@@ -249,7 +255,11 @@ impl RetryInterceptor for PrintDetailedError {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use opendal::services::Fs;
|
||||
|
||||
use super::*;
|
||||
use crate::ObjectStore;
|
||||
use crate::util::is_object_storage;
|
||||
|
||||
#[test]
|
||||
fn test_normalize_dir() {
|
||||
@@ -289,4 +299,14 @@ mod tests {
|
||||
assert_eq!("/abc", join_path("//", "/abc"));
|
||||
assert_eq!("abc/def", join_path("abc/", "//def"));
|
||||
}
|
||||
|
||||
/// An `Fs`-backed store reports `FS_SCHEME` and is not treated as object storage.
#[test]
fn test_fs_is_not_object_storage() {
    let builder = Fs::default().root("/tmp");
    let fs_store = ObjectStore::new(builder).unwrap().finish();

    assert_eq!(FS_SCHEME, fs_store.info().scheme());
    assert!(!is_object_storage(&fs_store));
}
|
||||
}
|
||||
|
||||
@@ -15,11 +15,15 @@
|
||||
use std::collections::HashMap;
|
||||
use std::future::Future;
|
||||
use std::path::Path;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::task::{Context, Poll};
|
||||
|
||||
use client::{Output, OutputData, OutputMeta};
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_datasource::file_format::csv::CsvFormat;
|
||||
use common_datasource::file_format::csv::{
|
||||
CsvFormat, is_skippable_arrow_error, tolerant_csv_stream,
|
||||
};
|
||||
use common_datasource::file_format::json::JsonFormat;
|
||||
use common_datasource::file_format::orc::{ReaderAdapter, infer_orc_schema, new_orc_stream_reader};
|
||||
use common_datasource::file_format::{FileFormat, Format, file_to_stream};
|
||||
@@ -33,10 +37,13 @@ use common_telemetry::{debug, tracing};
|
||||
use datafusion::datasource::physical_plan::{CsvSource, FileSource, JsonSource};
|
||||
use datafusion::parquet::arrow::ParquetRecordBatchStreamBuilder;
|
||||
use datafusion::parquet::arrow::arrow_reader::ArrowReaderMetadata;
|
||||
use datafusion_common::DataFusionError;
|
||||
use datafusion_common::arrow::error::ArrowError;
|
||||
use datafusion_common::config::CsvOptions;
|
||||
use datafusion_expr::Expr;
|
||||
use datatypes::arrow::compute::can_cast_types;
|
||||
use datatypes::arrow::datatypes::{DataType as ArrowDataType, Schema, SchemaRef};
|
||||
use datatypes::arrow::record_batch::RecordBatch;
|
||||
use datatypes::vectors::Helper;
|
||||
use futures_util::StreamExt;
|
||||
use object_store::{Entry, EntryMode, ObjectStore};
|
||||
@@ -221,23 +228,42 @@ impl StatementExecutor {
|
||||
let csv_source = CsvSource::new(schema.clone())
|
||||
.with_csv_options(options)
|
||||
.with_batch_size(DEFAULT_BATCH_SIZE);
|
||||
let stream = file_to_stream(
|
||||
object_store,
|
||||
path,
|
||||
csv_source,
|
||||
Some(projection),
|
||||
format.compression_type,
|
||||
)
|
||||
.await
|
||||
.context(error::BuildFileStreamSnafu)?;
|
||||
let stream = if format.skip_bad_records {
|
||||
let reader_schema =
|
||||
csv_reader_schema_for_skip_bad_records(schema, &compat_schema);
|
||||
tolerant_csv_stream(
|
||||
object_store,
|
||||
path,
|
||||
Arc::new(reader_schema),
|
||||
projection.clone(),
|
||||
format,
|
||||
)
|
||||
.await
|
||||
.context(error::BuildFileStreamSnafu)?
|
||||
} else {
|
||||
file_to_stream(
|
||||
object_store,
|
||||
path,
|
||||
csv_source,
|
||||
Some(projection),
|
||||
format.compression_type,
|
||||
)
|
||||
.await
|
||||
.context(error::BuildFileStreamSnafu)?
|
||||
};
|
||||
|
||||
Ok(Box::pin(
|
||||
let stream = Box::pin(
|
||||
// The projection is already applied in the CSV reader when we created the stream,
|
||||
// so we pass None here to avoid double projection which would cause schema mismatch errors.
|
||||
RecordBatchStreamTypeAdapter::new(output_schema, stream, None)
|
||||
.with_filter(filters)
|
||||
.context(error::PhysicalExprSnafu)?,
|
||||
))
|
||||
);
|
||||
if format.skip_bad_records {
|
||||
Ok(Box::pin(SkipBadRecordsStream::new(stream, path)))
|
||||
} else {
|
||||
Ok(stream)
|
||||
}
|
||||
}
|
||||
FileMetadata::Json {
|
||||
path,
|
||||
@@ -469,6 +495,58 @@ fn gen_insert_output(rows_inserted: usize, insert_cost: usize) -> Output {
|
||||
)
|
||||
}
|
||||
|
||||
struct SkipBadRecordsStream {
|
||||
inner: DfSendableRecordBatchStream,
|
||||
path: String,
|
||||
}
|
||||
|
||||
impl SkipBadRecordsStream {
|
||||
fn new(inner: DfSendableRecordBatchStream, path: impl Into<String>) -> Self {
|
||||
Self {
|
||||
inner,
|
||||
path: path.into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl datafusion::physical_plan::RecordBatchStream for SkipBadRecordsStream {
|
||||
fn schema(&self) -> SchemaRef {
|
||||
self.inner.schema()
|
||||
}
|
||||
}
|
||||
|
||||
impl futures::Stream for SkipBadRecordsStream {
|
||||
type Item = datafusion_common::Result<RecordBatch>;
|
||||
|
||||
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
let this = self.get_mut();
|
||||
loop {
|
||||
match this.inner.as_mut().poll_next(cx) {
|
||||
Poll::Ready(Some(Err(error))) if is_skippable_record_error(&error) => {
|
||||
common_telemetry::warn!(
|
||||
"Skipping bad record while copying from {}: {}",
|
||||
this.path,
|
||||
error
|
||||
);
|
||||
continue;
|
||||
}
|
||||
other => return other,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn is_skippable_record_error(error: &DataFusionError) -> bool {
|
||||
match error {
|
||||
DataFusionError::ArrowError(error, _) => is_skippable_arrow_error(error),
|
||||
DataFusionError::External(error) => error
|
||||
.downcast_ref::<ArrowError>()
|
||||
.is_some_and(is_skippable_arrow_error),
|
||||
DataFusionError::Context(_, error) => is_skippable_record_error(error),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Executes all pending inserts at once, drains the pending requests and resets the pending bytes.
|
||||
async fn batch_insert(
|
||||
pending: &mut Vec<impl Future<Output = Result<Output>>>,
|
||||
@@ -498,6 +576,59 @@ fn can_cast_types_for_greptime(from: &ArrowDataType, to: &ArrowDataType) -> bool
|
||||
can_cast_types(from, to)
|
||||
}
|
||||
|
||||
fn csv_reader_schema_for_skip_bad_records(file: &SchemaRef, compat: &SchemaRef) -> Schema {
|
||||
let fields = file
|
||||
.fields()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(idx, file_field)| {
|
||||
let compat_field = compat
|
||||
.fields()
|
||||
.find(file_field.name())
|
||||
.map(|(_, field)| field);
|
||||
|
||||
match compat_field {
|
||||
Some(compat_field) if can_csv_reader_parse_type(compat_field.data_type()) => {
|
||||
compat_field.clone()
|
||||
}
|
||||
_ => file.fields()[idx].clone(),
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
Schema::new_with_metadata(fields, file.metadata().clone())
|
||||
}
|
||||
|
||||
fn can_csv_reader_parse_type(data_type: &ArrowDataType) -> bool {
|
||||
match data_type {
|
||||
ArrowDataType::Boolean
|
||||
| ArrowDataType::Decimal32(_, _)
|
||||
| ArrowDataType::Decimal64(_, _)
|
||||
| ArrowDataType::Decimal128(_, _)
|
||||
| ArrowDataType::Decimal256(_, _)
|
||||
| ArrowDataType::Int8
|
||||
| ArrowDataType::Int16
|
||||
| ArrowDataType::Int32
|
||||
| ArrowDataType::Int64
|
||||
| ArrowDataType::UInt8
|
||||
| ArrowDataType::UInt16
|
||||
| ArrowDataType::UInt32
|
||||
| ArrowDataType::UInt64
|
||||
| ArrowDataType::Float32
|
||||
| ArrowDataType::Float64
|
||||
| ArrowDataType::Date32
|
||||
| ArrowDataType::Date64
|
||||
| ArrowDataType::Time32(_)
|
||||
| ArrowDataType::Time64(_)
|
||||
| ArrowDataType::Timestamp(_, _)
|
||||
| ArrowDataType::Null
|
||||
| ArrowDataType::Utf8
|
||||
| ArrowDataType::Utf8View => true,
|
||||
ArrowDataType::Dictionary(_, value_type) => value_type.as_ref() == &ArrowDataType::Utf8,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn ensure_schema_compatible(from: &SchemaRef, to: &SchemaRef) -> Result<()> {
|
||||
let not_match = from
|
||||
.fields
|
||||
@@ -780,4 +911,31 @@ mod tests {
|
||||
assert_eq!(test.0.project(&fp).unwrap(), test.1.project(&tp).unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
/// The reader schema takes the compat type where the CSV reader can parse it
/// (`UInt32`, `Timestamp`) and keeps the file type otherwise (`Binary` -> `Utf8`).
#[test]
fn test_csv_reader_schema_for_skip_bad_records() {
    let ts_type = DataType::Timestamp(datatypes::arrow::datatypes::TimeUnit::Millisecond, None);

    let file_schema = make_test_schema(&[
        Field::new("id", DataType::Utf8, true),
        Field::new("jsons", DataType::Utf8, true),
        Field::new("ts", DataType::Utf8, true),
    ]);
    let compat_schema = make_test_schema(&[
        Field::new("id", DataType::UInt32, true),
        Field::new("jsons", DataType::Binary, true),
        Field::new("ts", ts_type.clone(), true),
    ]);

    let reader_schema = csv_reader_schema_for_skip_bad_records(&file_schema, &compat_schema);

    let expected = [DataType::UInt32, DataType::Utf8, ts_type];
    for (idx, data_type) in expected.iter().enumerate() {
        assert_eq!(reader_schema.field(idx).data_type(), data_type);
    }
}
|
||||
}
|
||||
|
||||
@@ -233,6 +233,36 @@ transform:
|
||||
parse(&Content::Yaml(pipeline_yaml)).unwrap()
|
||||
}
|
||||
|
||||
fn prepare_vrl_pipeline() -> Pipeline {
|
||||
let pipeline_yaml = r#"
|
||||
---
|
||||
description: Minimal VRL processor benchmark
|
||||
|
||||
processors:
|
||||
- vrl:
|
||||
source: |
|
||||
.service_alias = .service
|
||||
.host_alias = .host
|
||||
del(.unused)
|
||||
.processed = true
|
||||
.
|
||||
|
||||
transform:
|
||||
- field: service
|
||||
type: string
|
||||
- field: host
|
||||
type: string
|
||||
- field: service_alias
|
||||
type: string
|
||||
- field: host_alias
|
||||
type: string
|
||||
- field: processed
|
||||
type: boolean
|
||||
"#;
|
||||
|
||||
parse(&Content::Yaml(pipeline_yaml)).unwrap()
|
||||
}
|
||||
|
||||
fn criterion_benchmark(c: &mut Criterion) {
|
||||
let input_value_str = include_str!("./data.log");
|
||||
let input_value = Deserializer::from_str(input_value_str)
|
||||
@@ -262,6 +292,41 @@ fn criterion_benchmark(c: &mut Criterion) {
|
||||
})
|
||||
});
|
||||
group.finish();
|
||||
|
||||
let vrl_input_value = (0..128)
|
||||
.map(|i| {
|
||||
serde_json::json!({
|
||||
"service": "frontend",
|
||||
"host": format!("host-{i}"),
|
||||
"unused": "drop-me"
|
||||
})
|
||||
.into()
|
||||
})
|
||||
.collect::<Vec<VrlValue>>();
|
||||
let vrl_pipeline = prepare_vrl_pipeline();
|
||||
|
||||
let (vrl_pipeline, mut vrl_schema_info, vrl_pipeline_def, vrl_pipeline_param) =
|
||||
setup_pipeline!(vrl_pipeline);
|
||||
let vrl_pipeline_ctx = PipelineContext::new(
|
||||
&vrl_pipeline_def,
|
||||
&vrl_pipeline_param,
|
||||
session::context::Channel::Unknown,
|
||||
);
|
||||
|
||||
let mut group = c.benchmark_group("vrl processor");
|
||||
group.sample_size(50);
|
||||
group.bench_function("processor mut", |b| {
|
||||
b.iter(|| {
|
||||
processor_mut(
|
||||
black_box(vrl_pipeline.clone()),
|
||||
black_box(&vrl_pipeline_ctx),
|
||||
black_box(&mut vrl_schema_info),
|
||||
black_box(vrl_input_value.clone()),
|
||||
)
|
||||
.unwrap();
|
||||
})
|
||||
});
|
||||
group.finish();
|
||||
}
|
||||
|
||||
// Testing the pipeline's performance in converting Json to Rows
|
||||
|
||||
@@ -12,9 +12,11 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::cell::RefCell;
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use chrono_tz::Tz;
|
||||
use once_cell::sync::Lazy;
|
||||
use snafu::{OptionExt, ensure};
|
||||
use vrl::compiler::runtime::Runtime;
|
||||
use vrl::compiler::{Program, TargetValue, compile};
|
||||
@@ -31,6 +33,12 @@ use crate::etl::processor::yaml_string;
|
||||
pub(crate) const PROCESSOR_VRL: &str = "vrl";
|
||||
const SOURCE: &str = "source";
|
||||
|
||||
static UTC_TIMEZONE: Lazy<TimeZone> = Lazy::new(|| TimeZone::Named(Tz::UTC));
|
||||
|
||||
thread_local! {
|
||||
static VRL_RUNTIME: RefCell<Runtime> = RefCell::new(Runtime::default());
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct VrlProcessor {
|
||||
source: String,
|
||||
@@ -74,10 +82,14 @@ impl VrlProcessor {
|
||||
secrets: Secrets::default(),
|
||||
};
|
||||
|
||||
let timezone = TimeZone::Named(Tz::UTC);
|
||||
let mut runtime = Runtime::default();
|
||||
let re = runtime
|
||||
.resolve(&mut target, &self.program, &timezone)
|
||||
let re = VRL_RUNTIME
|
||||
.with(|runtime| {
|
||||
let mut runtime = runtime.borrow_mut();
|
||||
runtime.clear();
|
||||
let result = runtime.resolve(&mut target, &self.program, &UTC_TIMEZONE);
|
||||
runtime.clear();
|
||||
result
|
||||
})
|
||||
.map_err(|e| {
|
||||
ExecuteVrlSnafu {
|
||||
msg: e.get_expression_error().to_string(),
|
||||
|
||||
@@ -14,25 +14,11 @@
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::Router as HttpRouter;
|
||||
use common_error::ext::BoxedError;
|
||||
use tonic::transport::server::Router as GrpcRouter;
|
||||
|
||||
use crate::grpc::builder::GrpcServerBuilder;
|
||||
|
||||
/// A configurator that customizes or enhances an HTTP router.
|
||||
#[async_trait::async_trait]
|
||||
pub trait HttpConfigurator<C>: Send + Sync {
|
||||
/// Configures the given HTTP router using the provided context.
|
||||
async fn configure_http(
|
||||
&self,
|
||||
route: HttpRouter,
|
||||
ctx: C,
|
||||
) -> std::result::Result<HttpRouter, BoxedError>;
|
||||
}
|
||||
|
||||
pub type HttpConfiguratorRef<C> = Arc<dyn HttpConfigurator<C>>;
|
||||
|
||||
/// A configurator that customizes or enhances a gRPC router.
|
||||
#[async_trait::async_trait]
|
||||
pub trait GrpcRouterConfigurator<C>: Send + Sync {
|
||||
|
||||
@@ -24,7 +24,7 @@ pub mod prom_query_gateway;
|
||||
pub mod region_server;
|
||||
|
||||
use std::any::Any;
|
||||
use std::net::SocketAddr;
|
||||
use std::net::{IpAddr, SocketAddr};
|
||||
use std::time::Duration;
|
||||
|
||||
use api::v1::health_check_server::{HealthCheck, HealthCheckServer};
|
||||
@@ -95,14 +95,8 @@ impl GrpcOptions {
|
||||
if self.server_addr.is_empty() {
|
||||
match local_ip_address::local_ip() {
|
||||
Ok(ip) => {
|
||||
let detected_addr = format!(
|
||||
"{}:{}",
|
||||
ip,
|
||||
self.bind_addr
|
||||
.split(':')
|
||||
.nth(1)
|
||||
.unwrap_or(DEFAULT_GRPC_ADDR_PORT)
|
||||
);
|
||||
let port = port_from_bind_addr(&self.bind_addr);
|
||||
let detected_addr = format_server_addr(ip, port);
|
||||
info!("Using detected: {} as server address", detected_addr);
|
||||
self.server_addr = detected_addr;
|
||||
}
|
||||
@@ -131,7 +125,18 @@ impl GrpcOptions {
|
||||
}
|
||||
}
|
||||
|
||||
/// Port used for the detected server address when the bind address does not
/// carry a parseable port.
const DEFAULT_GRPC_ADDR_PORT: u16 = 4001;

/// Extracts the port from a `host:port` style bind address.
///
/// The text after the last `:` is parsed as a `u16`, so bracketed IPv6
/// addresses such as `[::]:3002` and hostnames both work. When there is no
/// `:` or the trailing part is not a valid port, [`DEFAULT_GRPC_ADDR_PORT`]
/// is returned.
fn port_from_bind_addr(bind_addr: &str) -> u16 {
    bind_addr
        // Split on the LAST colon so IPv6 literals do not confuse the lookup.
        .rsplit_once(':')
        .and_then(|(_, port)| port.parse().ok())
        .unwrap_or(DEFAULT_GRPC_ADDR_PORT)
}
|
||||
|
||||
/// Formats `ip` and `port` as a socket address string.
///
/// Delegates to [`SocketAddr`]'s `Display` implementation, so IPv4 addresses
/// render as `127.0.0.1:3002` and IPv6 addresses are bracketed, as in
/// `[::1]:3002`.
fn format_server_addr(ip: IpAddr, port: u16) -> String {
    SocketAddr::new(ip, port).to_string()
}
|
||||
|
||||
const DEFAULT_INTERNAL_GRPC_ADDR_PORT: &str = "4010";
|
||||
|
||||
@@ -415,3 +420,36 @@ impl Server for GrpcServer {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};

    use super::{DEFAULT_GRPC_ADDR_PORT, format_server_addr, port_from_bind_addr};

    /// The port is taken from the text after the last `:`; an address without
    /// a parseable port falls back to the default.
    #[test]
    fn test_port_from_bind_addr() {
        assert_eq!(3002, port_from_bind_addr("127.0.0.1:3002"));
        assert_eq!(3002, port_from_bind_addr("[::]:3002"));
        assert_eq!(
            3002,
            port_from_bind_addr("greptimedb-metasrv.default.svc.cluster.local:3002")
        );
        assert_eq!(
            DEFAULT_GRPC_ADDR_PORT,
            port_from_bind_addr("invalid-bind-addr")
        );
    }

    /// IPv6 addresses must be wrapped in brackets in the formatted output.
    #[test]
    fn test_format_server_addr() {
        assert_eq!(
            "127.0.0.1:3002",
            format_server_addr(IpAddr::V4(Ipv4Addr::LOCALHOST), 3002)
        );
        assert_eq!(
            "[::1]:3002",
            format_server_addr(IpAddr::V6(Ipv6Addr::LOCALHOST), 3002)
        );
    }
}
|
||||
|
||||
@@ -27,7 +27,6 @@ use axum::response::{IntoResponse, Response};
|
||||
use axum::routing::Route;
|
||||
use axum::serve::ListenerExt;
|
||||
use axum::{Router, middleware, routing};
|
||||
use common_base::Plugins;
|
||||
use common_base::readable_size::ReadableSize;
|
||||
use common_recordbatch::RecordBatch;
|
||||
use common_telemetry::{error, info};
|
||||
@@ -52,11 +51,9 @@ use tower_http::trace::TraceLayer;
|
||||
|
||||
use self::authorize::AuthState;
|
||||
use self::result::table_result::TableResponse;
|
||||
use crate::configurator::HttpConfiguratorRef;
|
||||
use crate::elasticsearch;
|
||||
use crate::error::{
|
||||
AddressBindSnafu, AlreadyStartedSnafu, Error, InternalIoSnafu, InvalidHeaderValueSnafu,
|
||||
OtherSnafu, Result,
|
||||
AddressBindSnafu, AlreadyStartedSnafu, Error, InternalIoSnafu, InvalidHeaderValueSnafu, Result,
|
||||
};
|
||||
use crate::http::influxdb::{influxdb_health, influxdb_ping, influxdb_write_v1, influxdb_write_v2};
|
||||
use crate::http::otlp::OtlpState;
|
||||
@@ -139,9 +136,6 @@ pub struct HttpServer {
|
||||
user_provider: Option<UserProviderRef>,
|
||||
memory_limiter: ServerMemoryLimiter,
|
||||
|
||||
// plugins
|
||||
plugins: Plugins,
|
||||
|
||||
// server configs
|
||||
options: HttpOptions,
|
||||
bind_addr: Option<SocketAddr>,
|
||||
@@ -516,7 +510,6 @@ pub struct DashboardState {
|
||||
|
||||
pub struct HttpServerBuilder {
|
||||
options: HttpOptions,
|
||||
plugins: Plugins,
|
||||
user_provider: Option<UserProviderRef>,
|
||||
router: Router,
|
||||
memory_limiter: ServerMemoryLimiter,
|
||||
@@ -526,7 +519,6 @@ impl HttpServerBuilder {
|
||||
pub fn new(options: HttpOptions) -> Self {
|
||||
Self {
|
||||
options,
|
||||
plugins: Plugins::default(),
|
||||
user_provider: None,
|
||||
router: Router::new(),
|
||||
memory_limiter: ServerMemoryLimiter::default(),
|
||||
@@ -687,10 +679,6 @@ impl HttpServerBuilder {
|
||||
Self { router, ..self }
|
||||
}
|
||||
|
||||
pub fn with_plugins(self, plugins: Plugins) -> Self {
|
||||
Self { plugins, ..self }
|
||||
}
|
||||
|
||||
pub fn with_greptime_config_options(self, opts: String) -> Self {
|
||||
let config_router = HttpServer::route_config(GreptimeOptionsConfigState {
|
||||
greptime_config_options: opts,
|
||||
@@ -748,7 +736,6 @@ impl HttpServerBuilder {
|
||||
options: self.options,
|
||||
user_provider: self.user_provider,
|
||||
shutdown_tx: Mutex::new(None),
|
||||
plugins: self.plugins,
|
||||
router: StdMutex::new(self.router),
|
||||
bind_addr: None,
|
||||
memory_limiter: self.memory_limiter,
|
||||
@@ -1237,14 +1224,7 @@ impl Server for HttpServer {
|
||||
AlreadyStartedSnafu { server: "HTTP" }
|
||||
);
|
||||
|
||||
let mut app = self.make_app();
|
||||
if let Some(configurator) = self.plugins.get::<HttpConfiguratorRef<()>>() {
|
||||
app = configurator
|
||||
.configure_http(app, ())
|
||||
.await
|
||||
.context(OtherSnafu)?;
|
||||
}
|
||||
let app = self.build(app)?;
|
||||
let app = self.build(self.make_app())?;
|
||||
let listener = tokio::net::TcpListener::bind(listening)
|
||||
.await
|
||||
.context(AddressBindSnafu { addr: listening })?
|
||||
|
||||
@@ -401,6 +401,28 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn test_parse_copy_table_from_csv_options() {
    let sql =
        "COPY my_table FROM '/tmp/test.csv' WITH (FORMAT = 'CSV', SKIP_BAD_RECORDS = 'false')";
    let mut result =
        ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
            .unwrap();
    assert_eq!(1, result.len());

    let statement = result.remove(0);
    assert_matches!(statement, Statement::Copy { .. });
    match statement {
        Statement::Copy(crate::statements::copy::Copy::CopyTable(CopyTable::From(
            copy_table,
        ))) => {
            // Options are written in upper case in the SQL but looked up by
            // their lower-case names; the values keep their original case.
            assert_eq!(copy_table.with.get("format"), Some("CSV"));
            assert_eq!(copy_table.with.get("skip_bad_records"), Some("false"));
        }
        _ => unreachable!(),
    }
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_copy_table_to() {
|
||||
struct Test<'a> {
|
||||
|
||||
@@ -27,7 +27,7 @@ use serde::Serialize;
|
||||
use snafu::ensure;
|
||||
use sqlparser::ast::{
|
||||
Array, Expr, Ident, ObjectName, ObjectNamePart, SetExpr, SqlOption, StructField, TableFactor,
|
||||
Value, ValueWithSpan,
|
||||
TableWithJoins, Value, ValueWithSpan,
|
||||
};
|
||||
use sqlparser_derive::{Visit, VisitMut};
|
||||
|
||||
@@ -195,7 +195,7 @@ pub fn extract_tables_from_query(query: &SqlOrTql) -> impl Iterator<Item = Objec
|
||||
|
||||
match query {
|
||||
SqlOrTql::Sql(query, _) => {
|
||||
extract_tables_from_set_expr(&query.inner.body, &mut names);
|
||||
extract_tables_from_sql_query(&query.inner, &mut names);
|
||||
extract_tables_from_hybrid_cte_query(query, &mut names);
|
||||
}
|
||||
SqlOrTql::Tql(tql, _) => extract_tables_from_tql(tql, &mut names),
|
||||
@@ -205,26 +205,34 @@ pub fn extract_tables_from_query(query: &SqlOrTql) -> impl Iterator<Item = Objec
|
||||
}
|
||||
|
||||
fn extract_tables_from_hybrid_cte_query(query: &Query, sql_names: &mut HashSet<ObjectName>) {
|
||||
let mut tql_names = HashSet::new();
|
||||
let mut cte_names: HashSet<String> = HashSet::new();
|
||||
if let Some(hybrid_cte) = &query.hybrid_cte {
|
||||
let mut cte_names: HashSet<String> = hybrid_cte
|
||||
.cte_tables
|
||||
.iter()
|
||||
.map(|cte| ParserContext::canonicalize_identifier(cte.name.clone()).value)
|
||||
.collect();
|
||||
remove_cte_names(sql_names, &cte_names);
|
||||
|
||||
cte_names.clear();
|
||||
for cte in &hybrid_cte.cte_tables {
|
||||
cte_names.insert(ParserContext::canonicalize_identifier(cte.name.clone()).value);
|
||||
if let CteContent::Tql(tql) = &cte.content {
|
||||
extract_tables_from_tql(tql, &mut tql_names);
|
||||
let cte_name = ParserContext::canonicalize_identifier(cte.name.clone()).value;
|
||||
let mut cte_query_names = HashSet::new();
|
||||
match &cte.content {
|
||||
CteContent::Sql(cte_query) => {
|
||||
extract_tables_from_sql_query(cte_query, &mut cte_query_names)
|
||||
}
|
||||
CteContent::Tql(tql) => extract_tables_from_tql(tql, &mut cte_query_names),
|
||||
}
|
||||
if hybrid_cte.recursive {
|
||||
cte_names.insert(cte_name.clone());
|
||||
}
|
||||
remove_cte_names(&mut cte_query_names, &cte_names);
|
||||
sql_names.extend(cte_query_names);
|
||||
if !hybrid_cte.recursive {
|
||||
cte_names.insert(cte_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(with) = &query.inner.with {
|
||||
for cte in &with.cte_tables {
|
||||
cte_names.insert(ParserContext::canonicalize_identifier(cte.alias.name.clone()).value);
|
||||
}
|
||||
}
|
||||
|
||||
remove_cte_names(sql_names, &cte_names);
|
||||
|
||||
sql_names.extend(tql_names);
|
||||
}
|
||||
|
||||
fn remove_cte_names(names: &mut HashSet<ObjectName>, cte_names: &HashSet<String>) {
|
||||
@@ -339,6 +347,33 @@ pub fn location_to_index(sql: &str, location: &sqlparser::tokenizer::Location) -
|
||||
index - 1
|
||||
}
|
||||
|
||||
/// Helper function for [extract_tables_from_query].
|
||||
///
|
||||
/// Handle [sqlparser::ast::Query].
|
||||
fn extract_tables_from_sql_query(query: &sqlparser::ast::Query, names: &mut HashSet<ObjectName>) {
|
||||
let mut cte_names = HashSet::new();
|
||||
if let Some(with) = &query.with {
|
||||
for cte in &with.cte_tables {
|
||||
let cte_name = ParserContext::canonicalize_identifier(cte.alias.name.clone()).value;
|
||||
let mut cte_query_names = HashSet::new();
|
||||
extract_tables_from_sql_query(&cte.query, &mut cte_query_names);
|
||||
if with.recursive {
|
||||
cte_names.insert(cte_name.clone());
|
||||
}
|
||||
remove_cte_names(&mut cte_query_names, &cte_names);
|
||||
names.extend(cte_query_names);
|
||||
if !with.recursive {
|
||||
cte_names.insert(cte_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut body_names = HashSet::new();
|
||||
extract_tables_from_set_expr(&query.body, &mut body_names);
|
||||
remove_cte_names(&mut body_names, &cte_names);
|
||||
names.extend(body_names);
|
||||
}
|
||||
|
||||
/// Helper function for [extract_tables_from_query].
|
||||
///
|
||||
/// Handle [SetExpr].
|
||||
@@ -346,14 +381,11 @@ fn extract_tables_from_set_expr(set_expr: &SetExpr, names: &mut HashSet<ObjectNa
|
||||
match set_expr {
|
||||
SetExpr::Select(select) => {
|
||||
for from in &select.from {
|
||||
table_factor_to_object_name(&from.relation, names);
|
||||
for join in &from.joins {
|
||||
table_factor_to_object_name(&join.relation, names);
|
||||
}
|
||||
extract_tables_from_table_with_joins(from, names);
|
||||
}
|
||||
}
|
||||
SetExpr::Query(query) => {
|
||||
extract_tables_from_set_expr(&query.body, names);
|
||||
extract_tables_from_sql_query(query, names);
|
||||
}
|
||||
SetExpr::SetOperation { left, right, .. } => {
|
||||
extract_tables_from_set_expr(left, names);
|
||||
@@ -363,12 +395,47 @@ fn extract_tables_from_set_expr(set_expr: &SetExpr, names: &mut HashSet<ObjectNa
|
||||
};
|
||||
}
|
||||
|
||||
/// Helper function for [extract_tables_from_query].
|
||||
///
|
||||
/// Handle [TableWithJoins].
|
||||
fn extract_tables_from_table_with_joins(
|
||||
table_with_joins: &TableWithJoins,
|
||||
names: &mut HashSet<ObjectName>,
|
||||
) {
|
||||
table_factor_to_object_name(&table_with_joins.relation, names);
|
||||
for join in &table_with_joins.joins {
|
||||
table_factor_to_object_name(&join.relation, names);
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function for [extract_tables_from_query].
|
||||
///
|
||||
/// Handle [TableFactor].
|
||||
fn table_factor_to_object_name(table_factor: &TableFactor, names: &mut HashSet<ObjectName>) {
|
||||
if let TableFactor::Table { name, .. } = table_factor {
|
||||
names.insert(name.to_owned());
|
||||
match table_factor {
|
||||
TableFactor::Table { name, .. } => {
|
||||
names.insert(name.to_owned());
|
||||
}
|
||||
TableFactor::Derived { subquery, .. } => {
|
||||
extract_tables_from_sql_query(subquery, names);
|
||||
}
|
||||
TableFactor::NestedJoin {
|
||||
table_with_joins, ..
|
||||
} => {
|
||||
extract_tables_from_table_with_joins(table_with_joins, names);
|
||||
}
|
||||
TableFactor::Pivot { table, .. }
|
||||
| TableFactor::Unpivot { table, .. }
|
||||
| TableFactor::MatchRecognize { table, .. } => {
|
||||
table_factor_to_object_name(table, names);
|
||||
}
|
||||
TableFactor::TableFunction { .. }
|
||||
| TableFactor::Function { .. }
|
||||
| TableFactor::UNNEST { .. }
|
||||
| TableFactor::JsonTable { .. }
|
||||
| TableFactor::OpenJsonTable { .. }
|
||||
| TableFactor::XmlTable { .. }
|
||||
| TableFactor::SemanticView { .. } => {}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -458,6 +525,91 @@ TQL EVAL (now() - '15s'::interval, now(), '5s') count_values("status_code", {__n
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_tables_from_sql_query_with_derived_join() {
|
||||
let sql = r#"
|
||||
CREATE FLOW flow_batch_join_subquery SINK TO flow_batch_join_sink
|
||||
EVAL INTERVAL '1m' AS
|
||||
SELECT a.symbol, b.mark_price
|
||||
FROM (
|
||||
SELECT inst_id AS symbol, max(ts) AS mark_iv_ts
|
||||
FROM flow_batch_join_opt_summary
|
||||
GROUP BY inst_id
|
||||
) a
|
||||
LEFT JOIN (
|
||||
SELECT symbol, max(mark_price) AS mark_price
|
||||
FROM flow_batch_join_market_v5
|
||||
WHERE "type" = 'OPTION_MARK'
|
||||
GROUP BY symbol
|
||||
) b ON a.symbol = b.symbol;
|
||||
"#;
|
||||
let mut stmts =
|
||||
ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
|
||||
.unwrap();
|
||||
let Statement::CreateFlow(create_flow) = stmts.pop().unwrap() else {
|
||||
unreachable!()
|
||||
};
|
||||
|
||||
let mut tables = extract_tables_from_query(&create_flow.query)
|
||||
.map(|table| format_raw_object_name(&table))
|
||||
.collect_vec();
|
||||
tables.sort();
|
||||
assert_eq!(
|
||||
vec![
|
||||
"flow_batch_join_market_v5".to_string(),
|
||||
"flow_batch_join_opt_summary".to_string(),
|
||||
],
|
||||
tables
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_tables_from_sql_query_with_cte_scopes() {
|
||||
let testcases = vec![
|
||||
(
|
||||
r#"
|
||||
WITH source AS (
|
||||
SELECT * FROM source
|
||||
)
|
||||
SELECT * FROM source;
|
||||
"#,
|
||||
vec!["source".to_string()],
|
||||
),
|
||||
(
|
||||
r#"
|
||||
WITH first_cte AS (
|
||||
SELECT * FROM physical_source
|
||||
), second_cte AS (
|
||||
SELECT * FROM first_cte
|
||||
)
|
||||
SELECT * FROM second_cte;
|
||||
"#,
|
||||
vec!["physical_source".to_string()],
|
||||
),
|
||||
];
|
||||
|
||||
for (sql, expected_tables) in testcases {
|
||||
let mut stmts = ParserContext::create_with_dialect(
|
||||
sql,
|
||||
&GreptimeDbDialect {},
|
||||
ParseOptions::default(),
|
||||
)
|
||||
.unwrap();
|
||||
let Statement::Query(query) = stmts.pop().unwrap() else {
|
||||
unreachable!()
|
||||
};
|
||||
|
||||
let mut tables = HashSet::new();
|
||||
extract_tables_from_sql_query(&query.inner, &mut tables);
|
||||
let mut tables = tables
|
||||
.into_iter()
|
||||
.map(|table| format_raw_object_name(&table))
|
||||
.collect_vec();
|
||||
tables.sort();
|
||||
assert_eq!(expected_tables, tables);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_tables_from_tql_query_with_schema_matcher() {
|
||||
let sql = r#"
|
||||
|
||||
@@ -315,6 +315,7 @@ fn make_region_open(open: OpenRequest) -> Result<Vec<(RegionId, RegionRequest)>>
|
||||
options: open.options,
|
||||
skip_wal_replay: false,
|
||||
checkpoint: None,
|
||||
requirements: Default::default(),
|
||||
}),
|
||||
)])
|
||||
}
|
||||
@@ -566,6 +567,28 @@ pub struct RegionDropRequest {
|
||||
pub partial_drop: bool,
|
||||
}
|
||||
|
||||
/// Requirements for a region request.
|
||||
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(default)]
|
||||
pub struct RegionRequirements {
|
||||
/// Whether the region data must be backed by object storage.
|
||||
pub object_storage: bool,
|
||||
}
|
||||
|
||||
impl RegionRequirements {
|
||||
/// Returns empty requirements.
|
||||
pub fn empty() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Returns requirements for object storage.
|
||||
pub fn object_storage() -> Self {
|
||||
Self {
|
||||
object_storage: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Open region request.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RegionOpenRequest {
|
||||
@@ -581,6 +604,8 @@ pub struct RegionOpenRequest {
|
||||
pub skip_wal_replay: bool,
|
||||
/// Replay checkpoint.
|
||||
pub checkpoint: Option<ReplayCheckpoint>,
|
||||
/// Requirements for opening the region.
|
||||
pub requirements: RegionRequirements,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
|
||||
@@ -63,7 +63,7 @@ log-query = { workspace = true }
|
||||
loki-proto.workspace = true
|
||||
meta-client.workspace = true
|
||||
meta-srv = { workspace = true, features = ["mock"] }
|
||||
mito2.workspace = true
|
||||
mito2 = { workspace = true, features = ["test-shared-fs-region-migration"] }
|
||||
object-store.workspace = true
|
||||
operator = { workspace = true, features = ["testing"] }
|
||||
plugins.workspace = true
|
||||
|
||||
@@ -183,6 +183,24 @@ select * from csv_null_prefix_import;
|
||||
| final | 2023-11-14T22:13:23 |
|
||||
+-------+---------------------+
|
||||
|
||||
CREATE TABLE csv_skip_bad_records(host_id int, host_name string, reading_value double, ts timestamp time index);
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
-- SQLNESS ENV PWD
|
||||
Copy csv_skip_bad_records FROM '$PWD/tests/data/csv/skip_bad_records.csv' WITH (format='csv', skip_bad_records='true');
|
||||
|
||||
Affected Rows: 2
|
||||
|
||||
select * from csv_skip_bad_records order by ts;
|
||||
|
||||
+---------+-----------+---------------+---------------------+
|
||||
| host_id | host_name | reading_value | ts |
|
||||
+---------+-----------+---------------+---------------------+
|
||||
| 1 | Alice | 10.5 | 2024-01-01T00:00:00 |
|
||||
| 2 | Bob | 30.5 | 2024-01-01T00:00:02 |
|
||||
+---------+-----------+---------------+---------------------+
|
||||
|
||||
drop table demo;
|
||||
|
||||
Affected Rows: 0
|
||||
@@ -219,3 +237,7 @@ drop table csv_null_prefix_import;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
drop table csv_skip_bad_records;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
|
||||
@@ -73,6 +73,13 @@ Copy csv_null_prefix_import FROM '${SQLNESS_HOME}/demo/export/csv_null_prefix.cs
|
||||
|
||||
select * from csv_null_prefix_import;
|
||||
|
||||
CREATE TABLE csv_skip_bad_records(host_id int, host_name string, reading_value double, ts timestamp time index);
|
||||
|
||||
-- SQLNESS ENV PWD
|
||||
Copy csv_skip_bad_records FROM '$PWD/tests/data/csv/skip_bad_records.csv' WITH (format='csv', skip_bad_records='true');
|
||||
|
||||
select * from csv_skip_bad_records order by ts;
|
||||
|
||||
drop table demo;
|
||||
|
||||
drop table with_filename;
|
||||
@@ -90,3 +97,5 @@ drop table demo_with_less_columns;
|
||||
drop table csv_null_prefix;
|
||||
|
||||
drop table csv_null_prefix_import;
|
||||
|
||||
drop table csv_skip_bad_records;
|
||||
|
||||
@@ -0,0 +1,130 @@
|
||||
CREATE DATABASE flow_join_fixture;
|
||||
|
||||
Affected Rows: 1
|
||||
|
||||
CREATE TABLE flow_join_fixture."left_samples" (
|
||||
source_id STRING,
|
||||
left_value DOUBLE,
|
||||
event_ts TIMESTAMP,
|
||||
observed_at TIMESTAMP TIME INDEX
|
||||
);
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
CREATE TABLE flow_join_fixture."right_samples" (
|
||||
source_id STRING,
|
||||
right_value DOUBLE,
|
||||
sample_kind STRING,
|
||||
event_ts TIMESTAMP,
|
||||
observed_at TIMESTAMP TIME INDEX
|
||||
);
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
-- Verify batching flow creation accepts aggregate subqueries joined by LEFT JOIN.
|
||||
CREATE FLOW flow_batch_join_subquery SINK TO flow_batch_join_sink
|
||||
EVAL INTERVAL '5m' AS
|
||||
SELECT
|
||||
l.source_id,
|
||||
l.measure_name,
|
||||
l.bucket_time,
|
||||
l.left_event_ts,
|
||||
l.left_value,
|
||||
r.right_event_ts,
|
||||
r.right_value
|
||||
FROM (
|
||||
SELECT
|
||||
source_id,
|
||||
'sample' AS measure_name,
|
||||
date_trunc('minute', now()) AS bucket_time,
|
||||
max(event_ts) AS left_event_ts,
|
||||
last_value(left_value ORDER BY observed_at) AS left_value
|
||||
FROM
|
||||
flow_join_fixture."left_samples"
|
||||
WHERE
|
||||
observed_at BETWEEN date_trunc('minute', now()) - INTERVAL '5 minutes'
|
||||
AND date_trunc('minute', now())
|
||||
GROUP BY
|
||||
source_id
|
||||
) l
|
||||
LEFT JOIN (
|
||||
SELECT
|
||||
source_id,
|
||||
'sample' AS measure_name,
|
||||
date_trunc('minute', now()) AS bucket_time,
|
||||
max(event_ts) AS right_event_ts,
|
||||
last_value(right_value ORDER BY observed_at) AS right_value
|
||||
FROM
|
||||
flow_join_fixture."right_samples"
|
||||
WHERE
|
||||
observed_at BETWEEN date_trunc('minute', now()) - INTERVAL '5 minutes'
|
||||
AND date_trunc('minute', now())
|
||||
AND sample_kind = 'primary'
|
||||
GROUP BY
|
||||
source_id
|
||||
) r ON l.source_id = r.source_id AND l.bucket_time = r.bucket_time;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
SELECT
|
||||
source_table_names LIKE '%left_samples%' AS has_left_source,
|
||||
source_table_names LIKE '%right_samples%' AS has_right_source,
|
||||
options LIKE '%"flow_type":"batching"%' AS is_batching_flow
|
||||
FROM
|
||||
INFORMATION_SCHEMA.FLOWS
|
||||
WHERE
|
||||
flow_name = 'flow_batch_join_subquery';
|
||||
|
||||
+-----------------+------------------+------------------+
|
||||
| has_left_source | has_right_source | is_batching_flow |
|
||||
+-----------------+------------------+------------------+
|
||||
| true | true | true |
|
||||
+-----------------+------------------+------------------+
|
||||
|
||||
INSERT INTO flow_join_fixture."left_samples" VALUES
|
||||
('source-a', 0.12, date_trunc('minute', now()), date_trunc('minute', now()));
|
||||
|
||||
Affected Rows: 1
|
||||
|
||||
INSERT INTO flow_join_fixture."right_samples" VALUES
|
||||
('source-a', 100.5, 'primary', date_trunc('minute', now()), date_trunc('minute', now()));
|
||||
|
||||
Affected Rows: 1
|
||||
|
||||
-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED |
|
||||
ADMIN FLUSH_FLOW('flow_batch_join_subquery');
|
||||
|
||||
+----------------------------------------------+
|
||||
| ADMIN FLUSH_FLOW('flow_batch_join_subquery') |
|
||||
+----------------------------------------------+
|
||||
| FLOW_FLUSHED |
|
||||
+----------------------------------------------+
|
||||
|
||||
SELECT source_id, measure_name, left_value, right_value FROM flow_batch_join_sink ORDER BY source_id;
|
||||
|
||||
+-----------+--------------+------------+-------------+
|
||||
| source_id | measure_name | left_value | right_value |
|
||||
+-----------+--------------+------------+-------------+
|
||||
| source-a | sample | 0.12 | 100.5 |
|
||||
+-----------+--------------+------------+-------------+
|
||||
|
||||
DROP FLOW flow_batch_join_subquery;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
DROP TABLE flow_batch_join_sink;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
DROP TABLE flow_join_fixture."left_samples";
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
DROP TABLE flow_join_fixture."right_samples";
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
DROP DATABASE flow_join_fixture;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
@@ -0,0 +1,85 @@
|
||||
CREATE DATABASE flow_join_fixture;
|
||||
|
||||
CREATE TABLE flow_join_fixture."left_samples" (
|
||||
source_id STRING,
|
||||
left_value DOUBLE,
|
||||
event_ts TIMESTAMP,
|
||||
observed_at TIMESTAMP TIME INDEX
|
||||
);
|
||||
|
||||
CREATE TABLE flow_join_fixture."right_samples" (
|
||||
source_id STRING,
|
||||
right_value DOUBLE,
|
||||
sample_kind STRING,
|
||||
event_ts TIMESTAMP,
|
||||
observed_at TIMESTAMP TIME INDEX
|
||||
);
|
||||
|
||||
-- Verify batching flow creation accepts aggregate subqueries joined by LEFT JOIN.
|
||||
CREATE FLOW flow_batch_join_subquery SINK TO flow_batch_join_sink
|
||||
EVAL INTERVAL '5m' AS
|
||||
SELECT
|
||||
l.source_id,
|
||||
l.measure_name,
|
||||
l.bucket_time,
|
||||
l.left_event_ts,
|
||||
l.left_value,
|
||||
r.right_event_ts,
|
||||
r.right_value
|
||||
FROM (
|
||||
SELECT
|
||||
source_id,
|
||||
'sample' AS measure_name,
|
||||
date_trunc('minute', now()) AS bucket_time,
|
||||
max(event_ts) AS left_event_ts,
|
||||
last_value(left_value ORDER BY observed_at) AS left_value
|
||||
FROM
|
||||
flow_join_fixture."left_samples"
|
||||
WHERE
|
||||
observed_at BETWEEN date_trunc('minute', now()) - INTERVAL '5 minutes'
|
||||
AND date_trunc('minute', now())
|
||||
GROUP BY
|
||||
source_id
|
||||
) l
|
||||
LEFT JOIN (
|
||||
SELECT
|
||||
source_id,
|
||||
'sample' AS measure_name,
|
||||
date_trunc('minute', now()) AS bucket_time,
|
||||
max(event_ts) AS right_event_ts,
|
||||
last_value(right_value ORDER BY observed_at) AS right_value
|
||||
FROM
|
||||
flow_join_fixture."right_samples"
|
||||
WHERE
|
||||
observed_at BETWEEN date_trunc('minute', now()) - INTERVAL '5 minutes'
|
||||
AND date_trunc('minute', now())
|
||||
AND sample_kind = 'primary'
|
||||
GROUP BY
|
||||
source_id
|
||||
) r ON l.source_id = r.source_id AND l.bucket_time = r.bucket_time;
|
||||
|
||||
SELECT
|
||||
source_table_names LIKE '%left_samples%' AS has_left_source,
|
||||
source_table_names LIKE '%right_samples%' AS has_right_source,
|
||||
options LIKE '%"flow_type":"batching"%' AS is_batching_flow
|
||||
FROM
|
||||
INFORMATION_SCHEMA.FLOWS
|
||||
WHERE
|
||||
flow_name = 'flow_batch_join_subquery';
|
||||
|
||||
INSERT INTO flow_join_fixture."left_samples" VALUES
|
||||
('source-a', 0.12, date_trunc('minute', now()), date_trunc('minute', now()));
|
||||
|
||||
INSERT INTO flow_join_fixture."right_samples" VALUES
|
||||
('source-a', 100.5, 'primary', date_trunc('minute', now()), date_trunc('minute', now()));
|
||||
|
||||
-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED |
|
||||
ADMIN FLUSH_FLOW('flow_batch_join_subquery');
|
||||
|
||||
SELECT source_id, measure_name, left_value, right_value FROM flow_batch_join_sink ORDER BY source_id;
|
||||
|
||||
DROP FLOW flow_batch_join_subquery;
|
||||
DROP TABLE flow_batch_join_sink;
|
||||
DROP TABLE flow_join_fixture."left_samples";
|
||||
DROP TABLE flow_join_fixture."right_samples";
|
||||
DROP DATABASE flow_join_fixture;
|
||||
@@ -162,6 +162,8 @@ CREATE TABLE approx_rate (
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
-- Without merge_mode=last_non_null, this partial output is rejected at CREATE FLOW time.
|
||||
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
|
||||
CREATE FLOW find_approx_rate SINK TO approx_rate AS
|
||||
SELECT
|
||||
(max(byte) - min(byte)) / 30.0 as rate,
|
||||
@@ -172,24 +174,7 @@ from
|
||||
GROUP BY
|
||||
time_window;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
INSERT INTO
|
||||
bytes_log
|
||||
VALUES
|
||||
(NULL, '2023-01-01 00:00:01'),
|
||||
(300, '2023-01-01 00:00:31');
|
||||
|
||||
Affected Rows: 2
|
||||
|
||||
-- should return error
|
||||
ADMIN FLUSH_FLOW('find_approx_rate');
|
||||
|
||||
Error: 1002(Unexpected), Failed to execute admin function flush_flow: Execution error: Internal error: 1003
|
||||
|
||||
DROP FLOW find_approx_rate;
|
||||
|
||||
Affected Rows: 0
|
||||
Error: 3001(EngineExecuteQuery), Datafusion error: Plan("Flow output schema does not match sink table schema: found 3 flow output columns and 4 sink table columns. flow output columns: [\"rate\", \"time_window\", \"update_at\"], sink table columns: [\"rate\", \"time_window\", \"update_at\", \"bb\"], extra flow columns not in sink: [], missing sink columns from flow output: [\"bb\"]") in context: Failed to rewrite plan
|
||||
|
||||
DROP TABLE bytes_log;
|
||||
|
||||
|
||||
@@ -84,6 +84,8 @@ CREATE TABLE approx_rate (
|
||||
TIME INDEX(time_window)
|
||||
);
|
||||
|
||||
-- Without merge_mode=last_non_null, this partial output is rejected at CREATE FLOW time.
|
||||
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
|
||||
CREATE FLOW find_approx_rate SINK TO approx_rate AS
|
||||
SELECT
|
||||
(max(byte) - min(byte)) / 30.0 as rate,
|
||||
@@ -93,16 +95,5 @@ from
|
||||
bytes_log
|
||||
GROUP BY
|
||||
time_window;
|
||||
|
||||
INSERT INTO
|
||||
bytes_log
|
||||
VALUES
|
||||
(NULL, '2023-01-01 00:00:01'),
|
||||
(300, '2023-01-01 00:00:31');
|
||||
|
||||
-- should return error
|
||||
ADMIN FLUSH_FLOW('find_approx_rate');
|
||||
|
||||
DROP FLOW find_approx_rate;
|
||||
DROP TABLE bytes_log;
|
||||
DROP TABLE approx_rate;
|
||||
|
||||
@@ -0,0 +1,123 @@
|
||||
-- Verify that batching flow rejects CREATE FLOW when the pre-existing sink
|
||||
-- table schema does not match the flow output (create-time validation, not runtime).
|
||||
CREATE TABLE source_mm (
|
||||
"number" INT,
|
||||
extra STRING,
|
||||
ts TIMESTAMP TIME INDEX
|
||||
);
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
-- Pre-create a sink table that is intentionally missing the "extra" column.
|
||||
-- This case validates batching mode at CREATE FLOW time, before any INSERT/FLUSH.
|
||||
CREATE TABLE sink_mm (
|
||||
"number" INT,
|
||||
time_window TIMESTAMP TIME INDEX,
|
||||
cnt BIGINT
|
||||
);
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
-- This CREATE FLOW should fail immediately: the flow outputs (number, extra, time_window, cnt)
|
||||
-- but sink_mm has only (number, time_window, cnt).
|
||||
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
|
||||
CREATE FLOW mismatch_flow SINK TO sink_mm AS
|
||||
SELECT
|
||||
"number",
|
||||
extra,
|
||||
date_bin(INTERVAL '1 second', ts) as time_window,
|
||||
count(*) as cnt
|
||||
FROM
|
||||
source_mm
|
||||
GROUP BY
|
||||
"number", extra, time_window;
|
||||
|
||||
Error: 3001(EngineExecuteQuery), Datafusion error: Plan("Flow output schema does not match sink table schema: found 4 flow output columns and 3 sink table columns. flow output columns: [\"number\", \"extra\", \"time_window\", \"cnt\"], sink table columns: [\"number\", \"time_window\", \"cnt\"], extra flow columns not in sink: [\"extra\"], missing sink columns from flow output: []") in context: Failed to rewrite plan
|
||||
|
||||
DROP TABLE source_mm;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
DROP TABLE sink_mm;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
-- TQL/PromQL flows use the same create-time sink schema validation path.
|
||||
CREATE TABLE tql_source_mm (
|
||||
`value` DOUBLE,
|
||||
ts TIMESTAMP TIME INDEX,
|
||||
sensor STRING,
|
||||
loc STRING,
|
||||
PRIMARY KEY (sensor, loc)
|
||||
);
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
-- Pre-create a TQL sink table that is intentionally missing the "sensor" tag column.
|
||||
CREATE TABLE tql_sink_mm (
|
||||
`value` DOUBLE,
|
||||
ts TIMESTAMP TIME INDEX
|
||||
);
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
-- This CREATE FLOW should fail immediately: the TQL output has (value, sensor, ts),
|
||||
-- but tql_sink_mm has only (value, ts).
|
||||
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
|
||||
CREATE FLOW tql_mismatch_flow
|
||||
SINK TO tql_sink_mm
|
||||
EVAL INTERVAL '1m' AS
|
||||
TQL EVAL (now() - '1m'::interval, now(), '1m')
|
||||
avg by(sensor) (tql_source_mm) AS value;
|
||||
|
||||
Error: 3001(EngineExecuteQuery), Datafusion error: Plan("Flow output schema does not match sink table schema: found 3 flow output columns and 2 sink table columns. flow output columns: [\"value\", \"sensor\", \"ts\"], sink table columns: [\"value\", \"ts\"], extra flow columns not in sink: [\"sensor\"], missing sink columns from flow output: []") in context: Failed to rewrite plan
|
||||
|
||||
DROP TABLE tql_source_mm;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
DROP TABLE tql_sink_mm;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
-- Real merge_mode=last_non_null sink options should enable partial schema validation.
|
||||
CREATE TABLE lnn_source_mm (
|
||||
device STRING,
|
||||
val DOUBLE,
|
||||
ts TIMESTAMP TIME INDEX
|
||||
);
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
CREATE TABLE lnn_sink_mm (
|
||||
device STRING,
|
||||
time_window TIMESTAMP TIME INDEX,
|
||||
cnt BIGINT,
|
||||
PRIMARY KEY (device)
|
||||
) WITH('merge_mode'='last_non_null');
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
-- This CREATE FLOW should fail through the last_non_null partial validator: the
|
||||
-- sink primary key "device" is required but absent from the flow output.
|
||||
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
|
||||
CREATE FLOW lnn_missing_pk_flow
|
||||
SINK TO lnn_sink_mm AS
|
||||
SELECT
|
||||
date_bin(INTERVAL '1 second', ts) as time_window,
|
||||
count(*) as cnt
|
||||
FROM
|
||||
lnn_source_mm
|
||||
GROUP BY
|
||||
time_window;
|
||||
|
||||
Error: 3001(EngineExecuteQuery), Datafusion error: Plan("Column(s) [\"device\"] required by sink table are missing from flow output when merge_mode=last_non_null. Flow output schema does not match sink table schema: found 2 flow output columns and 3 sink table columns. flow output columns: [\"time_window\", \"cnt\"], sink table columns: [\"device\", \"time_window\", \"cnt\"], extra flow columns not in sink: [], missing sink columns from flow output: [\"device\"]") in context: Failed to rewrite plan
|
||||
|
||||
DROP TABLE lnn_source_mm;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
DROP TABLE lnn_sink_mm;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
@@ -0,0 +1,89 @@
|
||||
-- Verify that batching flow rejects CREATE FLOW when the pre-existing sink
|
||||
-- table schema does not match the flow output (create-time validation, not runtime).
|
||||
CREATE TABLE source_mm (
|
||||
"number" INT,
|
||||
extra STRING,
|
||||
ts TIMESTAMP TIME INDEX
|
||||
);
|
||||
|
||||
-- Pre-create a sink table that is intentionally missing the "extra" column.
|
||||
-- This case validates batching mode at CREATE FLOW time, before any INSERT/FLUSH.
|
||||
CREATE TABLE sink_mm (
|
||||
"number" INT,
|
||||
time_window TIMESTAMP TIME INDEX,
|
||||
cnt BIGINT
|
||||
);
|
||||
|
||||
-- This CREATE FLOW should fail immediately: the flow outputs (number, extra, time_window, cnt)
|
||||
-- but sink_mm has only (number, time_window, cnt).
|
||||
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
|
||||
CREATE FLOW mismatch_flow SINK TO sink_mm AS
|
||||
SELECT
|
||||
"number",
|
||||
extra,
|
||||
date_bin(INTERVAL '1 second', ts) as time_window,
|
||||
count(*) as cnt
|
||||
FROM
|
||||
source_mm
|
||||
GROUP BY
|
||||
"number", extra, time_window;
|
||||
|
||||
DROP TABLE source_mm;
|
||||
DROP TABLE sink_mm;
|
||||
|
||||
-- TQL/PromQL flows use the same create-time sink schema validation path.
|
||||
CREATE TABLE tql_source_mm (
|
||||
`value` DOUBLE,
|
||||
ts TIMESTAMP TIME INDEX,
|
||||
sensor STRING,
|
||||
loc STRING,
|
||||
PRIMARY KEY (sensor, loc)
|
||||
);
|
||||
|
||||
-- Pre-create a TQL sink table that is intentionally missing the "sensor" tag column.
|
||||
CREATE TABLE tql_sink_mm (
|
||||
`value` DOUBLE,
|
||||
ts TIMESTAMP TIME INDEX
|
||||
);
|
||||
|
||||
-- This CREATE FLOW should fail immediately: the TQL output has (value, sensor, ts),
|
||||
-- but tql_sink_mm has only (value, ts).
|
||||
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
|
||||
CREATE FLOW tql_mismatch_flow
|
||||
SINK TO tql_sink_mm
|
||||
EVAL INTERVAL '1m' AS
|
||||
TQL EVAL (now() - '1m'::interval, now(), '1m')
|
||||
avg by(sensor) (tql_source_mm) AS value;
|
||||
|
||||
DROP TABLE tql_source_mm;
|
||||
DROP TABLE tql_sink_mm;
|
||||
|
||||
-- Real merge_mode=last_non_null sink options should enable partial schema validation.
|
||||
CREATE TABLE lnn_source_mm (
|
||||
device STRING,
|
||||
val DOUBLE,
|
||||
ts TIMESTAMP TIME INDEX
|
||||
);
|
||||
|
||||
CREATE TABLE lnn_sink_mm (
|
||||
device STRING,
|
||||
time_window TIMESTAMP TIME INDEX,
|
||||
cnt BIGINT,
|
||||
PRIMARY KEY (device)
|
||||
) WITH('merge_mode'='last_non_null');
|
||||
|
||||
-- This CREATE FLOW should fail through the last_non_null partial validator: the
|
||||
-- sink primary key "device" is required but absent from the flow output.
|
||||
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
|
||||
CREATE FLOW lnn_missing_pk_flow
|
||||
SINK TO lnn_sink_mm AS
|
||||
SELECT
|
||||
date_bin(INTERVAL '1 second', ts) as time_window,
|
||||
count(*) as cnt
|
||||
FROM
|
||||
lnn_source_mm
|
||||
GROUP BY
|
||||
time_window;
|
||||
|
||||
DROP TABLE lnn_source_mm;
|
||||
DROP TABLE lnn_sink_mm;
|
||||
@@ -0,0 +1,90 @@
|
||||
-- Regression for a TQL flow whose pre-created sink table is missing the value
|
||||
-- output column. The labels are intentionally minimal and anonymous.
|
||||
CREATE DATABASE source_schema;
|
||||
|
||||
Affected Rows: 1
|
||||
|
||||
CREATE DATABASE sink_schema;
|
||||
|
||||
Affected Rows: 1
|
||||
|
||||
USE source_schema;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
CREATE TABLE metric_input (
|
||||
namespace STRING NULL,
|
||||
app STRING NULL,
|
||||
greptime_timestamp TIMESTAMP(3) NOT NULL,
|
||||
greptime_value DOUBLE NULL,
|
||||
TIME INDEX (greptime_timestamp),
|
||||
PRIMARY KEY (namespace, app)
|
||||
);
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
INSERT INTO metric_input VALUES
|
||||
('ns', 'app-a', '2026-01-23T03:40:00Z', 10.0),
|
||||
('ns', 'app-a', '2026-01-23T03:50:00Z', 20.0);
|
||||
|
||||
Affected Rows: 2
|
||||
|
||||
USE sink_schema;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
-- Intentionally omit greptime_value DOUBLE from the pre-created sink table.
|
||||
CREATE TABLE missing_value_sink (
|
||||
namespace STRING NULL,
|
||||
app STRING NULL,
|
||||
greptime_timestamp TIMESTAMP(3) NOT NULL,
|
||||
TIME INDEX (greptime_timestamp),
|
||||
PRIMARY KEY (namespace, app)
|
||||
)
|
||||
ENGINE=mito;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
|
||||
CREATE FLOW missing_value_flow
|
||||
SINK TO sink_schema.missing_value_sink
|
||||
EVAL INTERVAL '3600 s'
|
||||
AS TQL EVAL (
|
||||
date_bin('2m'::interval, now() - '2m'::interval),
|
||||
date_bin('2m'::interval, now() - '2m'::interval),
|
||||
'1h'
|
||||
)
|
||||
avg by (namespace, app) (
|
||||
avg_over_time(metric_input{__schema__="source_schema"}[1h])
|
||||
);
|
||||
|
||||
Error: 3001(EngineExecuteQuery), Datafusion error: Plan("Flow output schema does not match sink table schema: found 4 flow output columns and 3 sink table columns. flow output columns: [\"namespace\", \"app\", \"greptime_timestamp\", \"avg(prom_avg_over_time(greptime_timestamp_range,greptime_value))\"], sink table columns: [\"namespace\", \"app\", \"greptime_timestamp\"], extra flow columns not in sink: [\"avg(prom_avg_over_time(greptime_timestamp_range,greptime_value))\"], missing sink columns from flow output: []") in context: Failed to rewrite plan
|
||||
|
||||
DROP FLOW IF EXISTS missing_value_flow;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
DROP TABLE missing_value_sink;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
USE source_schema;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
DROP TABLE metric_input;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
USE public;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
DROP DATABASE sink_schema;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
DROP DATABASE source_schema;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
@@ -0,0 +1,55 @@
|
||||
-- Regression for a TQL flow whose pre-created sink table is missing the value
|
||||
-- output column. The labels are intentionally minimal and anonymous.
|
||||
|
||||
CREATE DATABASE source_schema;
|
||||
CREATE DATABASE sink_schema;
|
||||
|
||||
USE source_schema;
|
||||
|
||||
CREATE TABLE metric_input (
|
||||
namespace STRING NULL,
|
||||
app STRING NULL,
|
||||
greptime_timestamp TIMESTAMP(3) NOT NULL,
|
||||
greptime_value DOUBLE NULL,
|
||||
TIME INDEX (greptime_timestamp),
|
||||
PRIMARY KEY (namespace, app)
|
||||
);
|
||||
|
||||
INSERT INTO metric_input VALUES
|
||||
('ns', 'app-a', '2026-01-23T03:40:00Z', 10.0),
|
||||
('ns', 'app-a', '2026-01-23T03:50:00Z', 20.0);
|
||||
|
||||
USE sink_schema;
|
||||
|
||||
-- Intentionally omit greptime_value DOUBLE from the pre-created sink table.
|
||||
CREATE TABLE missing_value_sink (
|
||||
namespace STRING NULL,
|
||||
app STRING NULL,
|
||||
greptime_timestamp TIMESTAMP(3) NOT NULL,
|
||||
TIME INDEX (greptime_timestamp),
|
||||
PRIMARY KEY (namespace, app)
|
||||
)
|
||||
ENGINE=mito;
|
||||
|
||||
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
|
||||
CREATE FLOW missing_value_flow
|
||||
SINK TO sink_schema.missing_value_sink
|
||||
EVAL INTERVAL '3600 s'
|
||||
AS TQL EVAL (
|
||||
date_bin('2m'::interval, now() - '2m'::interval),
|
||||
date_bin('2m'::interval, now() - '2m'::interval),
|
||||
'1h'
|
||||
)
|
||||
avg by (namespace, app) (
|
||||
avg_over_time(metric_input{__schema__="source_schema"}[1h])
|
||||
);
|
||||
|
||||
DROP FLOW IF EXISTS missing_value_flow;
|
||||
DROP TABLE missing_value_sink;
|
||||
|
||||
USE source_schema;
|
||||
DROP TABLE metric_input;
|
||||
|
||||
USE public;
|
||||
DROP DATABASE sink_schema;
|
||||
DROP DATABASE source_schema;
|
||||
4
tests/data/csv/skip_bad_records.csv
Normal file
4
tests/data/csv/skip_bad_records.csv
Normal file
@@ -0,0 +1,4 @@
|
||||
host_id,host_name,reading_value,ts
|
||||
1,Alice,10.5,2024-01-01T00:00:00
|
||||
bad,Bad,20.0,2024-01-01T00:00:01
|
||||
2,Bob,30.5,2024-01-01T00:00:02
|
||||
|
Reference in New Issue
Block a user