Merge commit 'd304df6e75' into feat/flush-hook-extension-point

This commit is contained in:
Ning Sun
2026-06-04 04:20:27 -07:00
72 changed files with 3536 additions and 423 deletions

View File

@@ -30,7 +30,7 @@ on:
linux_arm64_runner:
type: choice
description: The runner used to build linux-arm64 artifacts
default: ec2-c6g.4xlarge-arm64
default: ec2-c6g.8xlarge-arm64
options:
- ec2-c6g.xlarge-arm64 # 4C8G
- ec2-c6g.2xlarge-arm64 # 8C16G

View File

@@ -27,7 +27,7 @@ on:
linux_arm64_runner:
type: choice
description: The runner used to build linux-arm64 artifacts
default: ec2-c6g.4xlarge-arm64
default: ec2-c6g.8xlarge-arm64
options:
- ec2-c6g.xlarge-arm64 # 4C8G
- ec2-c6g.2xlarge-arm64 # 8C16G

View File

@@ -1,19 +1,81 @@
name: Nightly JSONBench
on:
schedule:
# Trigger at 00:00(Asia/Shanghai) on every weekday.
- cron: "0 16 * * 0-4"
workflow_run:
workflows: [ "GreptimeDB Nightly Build" ]
types: [ completed ]
workflow_dispatch:
inputs:
run_id:
description: The nightly build workflow run id to download GreptimeDB artifacts from
required: true
type: string
permissions:
actions: read
contents: read
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
resolve-artifact:
name: Resolve GreptimeDB nightly artifact
if: ${{ github.repository == 'GreptimeTeam/greptimedb' && (github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success') }}
runs-on: ubuntu-latest
outputs:
artifact-name: ${{ steps.find-artifact.outputs.artifact-name }}
run-id: ${{ steps.resolve-run-id.outputs.run-id }}
steps:
- name: Resolve nightly build run id
id: resolve-run-id
shell: bash
env:
EVENT_NAME: ${{ github.event_name }}
WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
INPUT_RUN_ID: ${{ inputs.run_id }}
run: |
set -euo pipefail
if [[ "${EVENT_NAME}" == "workflow_dispatch" ]]; then
run_id="${INPUT_RUN_ID}"
else
run_id="${WORKFLOW_RUN_ID}"
fi
if [[ ! "${run_id}" =~ ^[0-9]+$ ]]; then
echo "Invalid workflow run id: ${run_id}"
exit 1
fi
echo "run-id=${run_id}" >> "${GITHUB_OUTPUT}"
- name: Find GreptimeDB nightly artifact
  id: find-artifact
  shell: bash
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    RUN_ID: ${{ steps.resolve-run-id.outputs.run-id }}
  run: |
    set -euo pipefail
    # List every artifact of the nightly build run and keep the names that
    # look like a linux-arm64 nightly package. The complete list is captured
    # before the first entry is selected: piping `gh api` straight into
    # `head -n 1` under `pipefail` can fail the step with SIGPIPE when `head`
    # exits while `gh` is still writing further matches.
    artifact_names=$(gh api "repos/${GITHUB_REPOSITORY}/actions/runs/${RUN_ID}/artifacts" --paginate \
      --jq '.artifacts[] | select(.name | test("^greptime-linux-arm64-nightly-[0-9]{8}-[0-9a-f]+$")) | .name')
    # A here-string feeds `head` from the shell itself, so no producer process
    # can be killed by the early exit.
    artifact_name=$(head -n 1 <<< "${artifact_names}")
    if [[ -z "${artifact_name}" ]]; then
      echo "Cannot find linux arm64 nightly artifact in workflow run ${RUN_ID}."
      exit 1
    fi
    echo "Download GreptimeDB artifact: ${artifact_name}"
    # Expose the name to later jobs through the step output.
    echo "artifact-name=${artifact_name}" >> "${GITHUB_OUTPUT}"
allocate-runner:
name: Allocate runner
if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
if: ${{ github.repository == 'GreptimeTeam/greptimedb' && (github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success') }}
needs: [ resolve-artifact ]
runs-on: ubuntu-latest
outputs:
linux-arm64-runner: ${{ steps.start-linux-arm64-runner.outputs.label }}
@@ -43,55 +105,50 @@ jobs:
jsonbench:
name: Run JSONBench
if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
needs: [ allocate-runner ]
if: ${{ github.repository == 'GreptimeTeam/greptimedb' && (github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success') }}
needs: [ resolve-artifact, allocate-runner ]
runs-on: ${{ needs.allocate-runner.outputs.linux-arm64-runner }}
timeout-minutes: 120
env:
JSONBENCH_DATA_DIR: /home/runner/data/bluesky
JSONBENCH_OUTPUT_PREFIX: _ubuntu-latest
JSONBENCH_OUTPUT_PREFIX: _linux-arm64
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Download GreptimeDB nightly artifact
uses: actions/download-artifact@v4
with:
fetch-depth: 0
persist-credentials: false
name: ${{ needs.resolve-artifact.outputs.artifact-name }}
path: greptimedb-artifact
github-token: ${{ secrets.GITHUB_TOKEN }}
run-id: ${{ needs.resolve-artifact.outputs.run-id }}
- uses: arduino/setup-protoc@v3
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
- uses: actions-rust-lang/setup-rust-toolchain@v1
- name: Rust Cache
uses: Swatinem/rust-cache@v2
with:
shared-key: "nightly-jsonbench"
cache-all-crates: "true"
save-if: ${{ github.ref == 'refs/heads/main' }}
- name: Build GreptimeDB
run: cargo build --profile nightly --bin greptime
- name: Reclaim disk space
- name: Prepare GreptimeDB binary
shell: bash
run: |
set -euo pipefail
mkdir -p "${RUNNER_TEMP}/greptimedb-bin"
cp ./target/nightly/greptime "${RUNNER_TEMP}/greptimedb-bin/greptime"
chmod +x "${RUNNER_TEMP}/greptimedb-bin/greptime"
rm -rf ./target
tar -xzf "greptimedb-artifact/${{ needs.resolve-artifact.outputs.artifact-name }}.tar.gz"
cp "${{ needs.resolve-artifact.outputs.artifact-name }}/greptime" ./greptime
chmod +x ./greptime
rm -rf greptimedb-artifact "${{ needs.resolve-artifact.outputs.artifact-name }}"
- name: Run JSONBench
env:
# TODO(LFC): Change to "3" (100m) when JSON2 ingestion performance is optimized.
JSONBENCH_DATASET: 2
shell: bash
run: |
set -euo pipefail
cd "${RUNNER_TEMP}"
cp "${RUNNER_TEMP}/greptimedb-bin/greptime" ./greptime
chmod +x ./greptime
export JSONBENCH_DATA_DIR="/root/data/bluesky"
echo "Use JSONBench data directory ${JSONBENCH_DATA_DIR}"
echo "Cloning JSONBench"
git clone --branch greptimedb-new-json --depth 1 https://github.com/GreptimeTeam/JSONBench.git JSONBench
echo "Downloading JSONBench dataset choice ${JSONBENCH_DATASET} to ${JSONBENCH_DATA_DIR}"
mkdir -p "${JSONBENCH_DATA_DIR}"
printf "${JSONBENCH_DATASET}\n" | ./JSONBench/download_data.sh
downloaded_files=$(find "${JSONBENCH_DATA_DIR}" -type f | wc -l)
echo "Downloaded JSONBench dataset files: ${downloaded_files}"
export GREPTIMEDB_STANDALONE__WAL__DIR=greptimedb_data/wal
export GREPTIMEDB_STANDALONE__STORAGE__DATA_HOME=greptimedb_data
@@ -100,10 +157,12 @@ jobs:
export GREPTIMEDB_STANDALONE__HTTP__BODY_LIMIT=1GB
export GREPTIMEDB_STANDALONE__HTTP__TIMEOUT=500s
echo "Starting GreptimeDB standalone"
./greptime standalone start > greptimedb.log 2>&1 &
greptime_pid=$!
trap 'kill "${greptime_pid}" 2>/dev/null || true' EXIT
echo "Waiting for GreptimeDB health check"
until curl -s --fail -o /dev/null http://localhost:4000/health; do
if ! kill -0 "${greptime_pid}" 2>/dev/null; then
cat greptimedb.log
@@ -111,12 +170,14 @@ jobs:
fi
sleep 1
done
echo "GreptimeDB is ready"
git clone --branch greptimedb-new-json --depth 1 https://github.com/GreptimeTeam/JSONBench.git JSONBench
cp ./greptime JSONBench/greptimedb/greptime
cd JSONBench/greptimedb
./main.sh 3 "${JSONBENCH_DATA_DIR}" success.log error.log "${JSONBENCH_OUTPUT_PREFIX}" false
echo "Running JSONBench main.sh with dataset choice ${JSONBENCH_DATASET} and install=false"
./main.sh ${JSONBENCH_DATASET} "${JSONBENCH_DATA_DIR}" success.log error.log "${JSONBENCH_OUTPUT_PREFIX}" false
echo "JSONBench finished"
- name: Upload JSONBench results
if: always()
@@ -124,21 +185,21 @@ jobs:
with:
name: jsonbench-results
path: |
${{ runner.temp }}/greptimedb.log
${{ runner.temp }}/JSONBench/greptimedb/*.log
${{ runner.temp }}/JSONBench/greptimedb/*.total_size
${{ runner.temp }}/JSONBench/greptimedb/*.data_size
${{ runner.temp }}/JSONBench/greptimedb/*.index_size
${{ runner.temp }}/JSONBench/greptimedb/*.count
${{ runner.temp }}/JSONBench/greptimedb/*.results_runtime
${{ runner.temp }}/JSONBench/greptimedb/*.query_results
./greptimedb.log
./JSONBench/greptimedb/*.log
./JSONBench/greptimedb/*.total_size
./JSONBench/greptimedb/*.data_size
./JSONBench/greptimedb/*.index_size
./JSONBench/greptimedb/*.count
./JSONBench/greptimedb/*.results_runtime
./JSONBench/greptimedb/*.query_results
if-no-files-found: ignore
retention-days: 7
stop-linux-arm64-runner:
name: Stop Linux ARM64 runner
# It's always run as the last job in the workflow to make sure that the runner is released.
if: ${{ always() }}
if: ${{ always() && needs.allocate-runner.outputs.linux-arm64-ec2-runner-instance-id != '' }}
runs-on: ubuntu-latest
needs: [
allocate-runner,

683
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -259,7 +259,7 @@ tracing-opentelemetry = "0.31.0"
tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "fmt"] }
typetag = "0.2"
uuid = { version = "1.17", features = ["serde", "v4", "v7", "fast-rng"] }
vrl = "0.25"
vrl = "0.33"
zstd = "0.13"
# DO_NOT_REMOVE_THIS: END_OF_EXTERNAL_DEPENDENCIES

View File

@@ -451,6 +451,7 @@
| `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.<br/>By default, it provides services after all regions have been initialized. |
| `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. |
| `max_concurrent_queries` | Integer | `0` | The maximum concurrent queries allowed to be executed. Zero means unlimited. |
| `concurrent_query_limiter_timeout` | String | `100ms` | Timeout to acquire a permit from the concurrent query limiter when `max_concurrent_queries` is reached. |
| `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. Enabled by default. |
| `http` | -- | -- | The HTTP server options. |
| `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |

View File

@@ -20,6 +20,9 @@ init_regions_parallelism = 16
## The maximum concurrent queries allowed to be executed. Zero means unlimited.
max_concurrent_queries = 0
## Timeout to acquire a permit from the concurrent query limiter when `max_concurrent_queries` is reached.
concurrent_query_limiter_timeout = "100ms"
## Enable telemetry to collect anonymous usage data. Enabled by default.
#+ enable_telemetry = true

View File

@@ -1077,7 +1077,9 @@ async fn verify_snapshot(storage: &OpenDalStorage) -> Result<VerifyReport> {
));
}
let data_files = storage.list_files_recursive("data/").await?;
if let Some(path) = data_files.first() {
// Report the lexicographically smallest path so the message is stable
// regardless of listing order across backends.
if let Some(path) = data_files.iter().min() {
report.push_error(format!(
"Schema-only snapshot should not contain data files (found '{}')",
path
@@ -1103,75 +1105,113 @@ fn summarize_chunks(manifest: &Manifest) -> VerifyChunkSummary {
}
}
/// A data file declared by a completed chunk that is expected to exist in storage.
///
/// Entries are created while planning from the manifest and are later looked
/// up in the set of files actually found under `data/`.
#[derive(Debug)]
struct ChunkFile {
/// Id of the manifest chunk that lists this file; echoed in the
/// "missing file" problem message.
chunk_id: u32,
/// Manifest path of the file, recorded only after it passed path validation.
path: String,
}
/// Expected snapshot contents derived purely from the manifest (no object-store IO).
///
/// Separating planning from scanning makes it obvious which problems come from
/// the manifest alone and which require comparing against actual storage.
///
/// Produced by `build_verify_plan` and consumed by `reconcile_plan_with_scan`.
#[derive(Debug, Default)]
struct VerifyPlan {
/// Valid data files declared by completed chunks; each must exist in storage.
files_to_check: Vec<ChunkFile>,
/// All syntactically-safe data paths declared by any chunk, regardless of
/// status. Used as the orphan-detection baseline so a listed-but-invalid
/// file is not also reported as unexpected.
claimed_data_files: HashSet<String>,
/// Total data-file references in completed chunks (valid + invalid).
/// Counted once per listed file, before the path is validated.
data_files_total: usize,
/// Problems detectable from the manifest alone.
problems: Vec<VerifyProblem>,
}
/// Actual data files discovered under `data/` (the only object-store IO in
/// chunk/data-file verification).
#[derive(Debug)]
struct VerifyDataScan {
/// Every path returned by recursively listing `data/`, stored as a set so
/// membership checks do not depend on listing order.
existing_data_files: HashSet<String>,
}
/// Result of reconciling the manifest plan against the storage scan.
///
/// `verify_chunks_and_data_files` transfers these values into the
/// `VerifyReport`.
#[derive(Debug, Default)]
struct VerifyOutcome {
/// Data-file references in completed chunks, carried over unchanged from the plan.
data_files_total: usize,
/// Number of planned files whose path was present in the storage scan.
data_files_verified: usize,
/// Manifest-only problems, followed by missing-file problems and then
/// unexpected-file problems.
problems: Vec<VerifyProblem>,
}
async fn verify_chunks_and_data_files(
storage: &OpenDalStorage,
report: &mut VerifyReport,
) -> Result<()> {
let existing_files: HashSet<_> = storage
.list_files_recursive("data/")
.await?
.into_iter()
.collect();
let mut data_files_total = 0;
let mut data_files_verified = 0;
let mut problems = Vec::new();
let mut seen_chunk_ids = HashSet::new();
let mut claimed_data_files = HashSet::new();
let plan = build_verify_plan(&report.manifest);
let scan = scan_data_files(storage).await?;
let outcome = reconcile_plan_with_scan(plan, &scan);
for chunk in &report.manifest.chunks {
report.data_files_total = outcome.data_files_total;
report.data_files_verified = outcome.data_files_verified;
report.problems.extend(outcome.problems);
Ok(())
}
/// Builds the expected-state plan from the manifest. Pure; performs no IO.
fn build_verify_plan(manifest: &Manifest) -> VerifyPlan {
let mut plan = VerifyPlan::default();
let mut seen_chunk_ids = HashSet::new();
for chunk in &manifest.chunks {
if !seen_chunk_ids.insert(chunk.id) {
problems.push(VerifyProblem {
plan.problems.push(VerifyProblem {
severity: VerifySeverity::Error,
message: format!("Chunk {}: duplicate chunk id", chunk.id),
});
}
for file in &chunk.files {
if let Some(path) = safe_manifest_data_file_path(file) {
claimed_data_files.insert(path.to_string());
plan.claimed_data_files.insert(path.to_string());
}
}
match chunk.status {
ChunkStatus::Completed => {
if chunk.files.is_empty() {
problems.push(VerifyProblem {
plan.problems.push(VerifyProblem {
severity: VerifySeverity::Error,
message: format!("Chunk {}: completed chunk has no data files", chunk.id),
});
continue;
}
let allowed_prefixes = report
.manifest
let allowed_prefixes = manifest
.schemas
.iter()
.map(|schema| data_dir_for_schema_chunk(schema, chunk.id))
.collect::<Vec<_>>();
for file in &chunk.files {
data_files_total += 1;
let Some(path) = valid_manifest_data_file_path(file, &allowed_prefixes) else {
problems.push(VerifyProblem {
plan.data_files_total += 1;
match valid_manifest_data_file_path(file, &allowed_prefixes) {
Some(path) => plan.files_to_check.push(ChunkFile {
chunk_id: chunk.id,
path: path.to_string(),
}),
None => plan.problems.push(VerifyProblem {
severity: VerifySeverity::Error,
message: format!(
"Chunk {}: invalid data file path '{}'",
chunk.id, file
),
});
continue;
};
if existing_files.contains(path) {
data_files_verified += 1;
} else {
problems.push(VerifyProblem {
severity: VerifySeverity::Error,
message: format!("Chunk {}: missing file '{}'", chunk.id, path),
});
}),
}
}
}
ChunkStatus::Skipped => {
if !chunk.files.is_empty() {
problems.push(VerifyProblem {
plan.problems.push(VerifyProblem {
severity: VerifySeverity::Error,
message: format!(
"Chunk {}: skipped chunk should not list data files",
@@ -1181,20 +1221,20 @@ async fn verify_chunks_and_data_files(
}
}
ChunkStatus::Pending => {
problems.push(VerifyProblem {
plan.problems.push(VerifyProblem {
severity: VerifySeverity::Error,
message: format!("Chunk {}: status is 'pending'", chunk.id),
});
}
ChunkStatus::InProgress => {
problems.push(VerifyProblem {
plan.problems.push(VerifyProblem {
severity: VerifySeverity::Error,
message: format!("Chunk {}: status is 'in_progress'", chunk.id),
});
}
ChunkStatus::Failed => {
let reason = chunk.error.as_deref().unwrap_or("unknown error");
problems.push(VerifyProblem {
plan.problems.push(VerifyProblem {
severity: VerifySeverity::Error,
message: format!("Chunk {}: status is 'failed' (error: {})", chunk.id, reason),
});
@@ -1202,20 +1242,60 @@ async fn verify_chunks_and_data_files(
}
}
for path in &existing_files {
if !claimed_data_files.contains(path) {
plan
}
/// Collects every file path found beneath `data/` into a [`VerifyDataScan`].
///
/// This listing is the single point where chunk/data-file verification
/// touches the object store; listing failures are returned to the caller.
async fn scan_data_files(storage: &OpenDalStorage) -> Result<VerifyDataScan> {
    let listed_paths = storage.list_files_recursive("data/").await?;

    Ok(VerifyDataScan {
        // Deduplicate into a set; ordering of the listing is irrelevant here.
        existing_data_files: listed_paths.into_iter().collect(),
    })
}
/// Reconciles the manifest plan against the storage scan. Pure; performs no IO.
///
/// Emits missing-file problems for expected files absent from storage and
/// unexpected-file problems for storage files no chunk claims. Unexpected files
/// are sorted by path so output is deterministic regardless of listing order.
fn reconcile_plan_with_scan(plan: VerifyPlan, scan: &VerifyDataScan) -> VerifyOutcome {
let mut problems = plan.problems;
let mut data_files_verified = 0;
for file in &plan.files_to_check {
if scan.existing_data_files.contains(&file.path) {
data_files_verified += 1;
} else {
problems.push(VerifyProblem {
severity: VerifySeverity::Error,
message: format!("Unexpected data file '{}' is not listed in manifest", path),
message: format!("Chunk {}: missing file '{}'", file.chunk_id, file.path),
});
}
}
report.data_files_total = data_files_total;
report.data_files_verified = data_files_verified;
report.problems.extend(problems);
let mut orphans: Vec<&String> = scan
.existing_data_files
.iter()
.filter(|path| !plan.claimed_data_files.contains(*path))
.collect();
orphans.sort();
for path in orphans {
problems.push(VerifyProblem {
severity: VerifySeverity::Error,
message: format!("Unexpected data file '{}' is not listed in manifest", path),
});
}
Ok(())
VerifyOutcome {
data_files_total: plan.data_files_total,
data_files_verified,
problems,
}
}
fn valid_manifest_data_file_path<'a>(
@@ -2294,6 +2374,90 @@ mod tests {
);
}
#[test]
fn test_build_verify_plan_classifies_chunks_without_io() {
    // `test_manifest` with these flags yields chunk 1 (completed, one file)
    // and chunk 2 (skipped).
    let created_at = chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap();
    let mut manifest = test_manifest(created_at, false, true);

    // Chunk 3 is explicitly failed; chunk 4 is left exactly as constructed,
    // which this test expects to be reported as 'pending'.
    let mut failed_chunk = ChunkMeta::new(3, TimeRange::unbounded());
    failed_chunk.mark_failed("boom".to_string());
    manifest.chunks.push(failed_chunk);
    let untouched_chunk = ChunkMeta::new(4, TimeRange::unbounded());
    manifest.chunks.push(untouched_chunk);

    let plan = build_verify_plan(&manifest);

    // Only the completed chunk contributes a file to check.
    assert_eq!(plan.files_to_check.len(), 1);
    let expected_file = &plan.files_to_check[0];
    assert_eq!(expected_file.chunk_id, 1);
    assert_eq!(expected_file.path, "data/public/1/file.parquet");
    assert_eq!(plan.data_files_total, 1);
    assert!(
        plan.claimed_data_files
            .contains("data/public/1/file.parquet")
    );

    // Exactly the failed and the pending chunk are flagged.
    let reports = |needle: &str| {
        plan.problems
            .iter()
            .any(|problem| problem.message.contains(needle))
    };
    assert_eq!(plan.problems.len(), 2);
    assert!(reports("status is 'failed'"));
    assert!(reports("status is 'pending'"));
}
#[tokio::test]
async fn test_verify_snapshot_produces_deterministic_problem_output() {
let dir = tempdir().unwrap();
let manifest = test_manifest(
chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
false,
true,
);
write_root_manifest(dir.path(), manifest);
write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
write_default_ddl_files(dir.path());
write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
// Many orphan files under a known chunk prefix to stress ordering.
for i in 0..50 {
write_snapshot_file(
dir.path(),
&format!("data/public/1/orphan_{:02}.parquet", i),
b"x",
);
}
let storage = file_storage_for_dir(dir.path());
let messages = |report: &VerifyReport| {
report
.problems
.iter()
.map(|problem| problem.message.clone())
.collect::<Vec<_>>()
};
let first = messages(&verify_snapshot(&storage).await.unwrap());
let second = messages(&verify_snapshot(&storage).await.unwrap());
// Output is identical across runs despite HashSet-based scanning.
assert_eq!(first, second);
let orphans = first
.iter()
.filter(|message| message.contains("Unexpected data file"))
.cloned()
.collect::<Vec<_>>();
assert_eq!(orphans.len(), 50);
let mut sorted = orphans.clone();
sorted.sort();
assert_eq!(orphans, sorted);
}
fn write_test_manifest(root: &std::path::Path, dir: &str, manifest: Manifest) {
let snapshot_dir = root.join(dir);
std::fs::create_dir_all(&snapshot_dir).unwrap();

View File

@@ -524,6 +524,7 @@ impl ScanbenchCommand {
options: HashMap::default(),
skip_wal_replay: !self.enable_wal,
checkpoint: None,
requirements: Default::default(),
};
engine

View File

@@ -61,6 +61,7 @@ pub const FORMAT_COMPRESSION_TYPE: &str = "compression_type";
pub const FORMAT_DELIMITER: &str = "delimiter";
pub const FORMAT_SCHEMA_INFER_MAX_RECORD: &str = "schema_infer_max_record";
pub const FORMAT_HAS_HEADER: &str = "has_header";
pub const FORMAT_SKIP_BAD_RECORDS: &str = "skip_bad_records";
pub const FORMAT_TYPE: &str = "format";
pub const FILE_PATTERN: &str = "pattern";
pub const TIMESTAMP_FORMAT: &str = "timestamp_format";

View File

@@ -13,15 +13,24 @@
// limitations under the License.
use std::collections::HashMap;
use std::io;
use std::str::FromStr;
use std::sync::Arc;
use std::task::Poll;
use arrow::csv::reader::Format;
use arrow::csv::{self, WriterBuilder};
use arrow::error::ArrowError;
use arrow::record_batch::RecordBatch;
use arrow_schema::Schema;
use arrow_schema::{Schema, SchemaRef};
use async_trait::async_trait;
use bytes::{Buf, Bytes};
use common_runtime;
use common_telemetry::warn;
use datafusion::physical_plan::SendableRecordBatchStream;
use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
use futures::StreamExt;
use futures::stream::BoxStream;
use object_store::ObjectStore;
use snafu::ResultExt;
use tokio_util::compat::FuturesAsyncReadCompatExt;
@@ -34,9 +43,12 @@ use crate::file_format::{self, FileFormat, stream_to_file};
use crate::share_buffer::SharedBuffer;
use crate::util::normalize_infer_schema;
const SKIP_BAD_RECORDS_BATCH_SIZE: usize = 1;
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CsvFormat {
pub has_header: bool,
pub skip_bad_records: bool,
pub delimiter: u8,
pub schema_infer_max_record: Option<usize>,
pub compression_type: CompressionType,
@@ -76,13 +88,11 @@ impl TryFrom<&HashMap<String, String>> for CsvFormat {
})?);
};
if let Some(has_header) = value.get(file_format::FORMAT_HAS_HEADER) {
format.has_header = has_header.parse().map_err(|_| {
error::ParseFormatSnafu {
key: file_format::FORMAT_HAS_HEADER,
value: has_header,
}
.build()
})?;
format.has_header = parse_bool(file_format::FORMAT_HAS_HEADER, has_header)?;
};
if let Some(skip_bad_records) = value.get(file_format::FORMAT_SKIP_BAD_RECORDS) {
format.skip_bad_records =
parse_bool(file_format::FORMAT_SKIP_BAD_RECORDS, skip_bad_records)?;
};
if let Some(timestamp_format) = value.get(file_format::TIMESTAMP_FORMAT) {
format.timestamp_format = Some(timestamp_format.clone());
@@ -97,10 +107,17 @@ impl TryFrom<&HashMap<String, String>> for CsvFormat {
}
}
fn parse_bool(key: &'static str, value: &str) -> Result<bool> {
value
.parse()
.map_err(|_| error::ParseFormatSnafu { key, value }.build())
}
impl Default for CsvFormat {
fn default() -> Self {
Self {
has_header: true,
skip_bad_records: false,
delimiter: b',',
schema_infer_max_record: Some(file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD),
compression_type: CompressionType::Uncompressed,
@@ -189,10 +206,136 @@ impl DfRecordBatchEncoder for csv::Writer<SharedBuffer> {
}
}
/// Opens `path` in `store` and returns a CSV record-batch stream that skips
/// selected record-level parse/cast errors instead of failing.
///
/// The object is read in full, decompressed according to
/// `format.compression_type`, and decoded with `format.has_header` and
/// `format.delimiter`.
///
/// This recovery path deliberately decodes one record per batch. That is
/// slower than regular CSV scanning, but it confines every parse/cast failure
/// to a single record. Arrow's CSV decoder clears buffered rows before type
/// parsing, so a failed multi-row flush cannot be safely retried row by row
/// without replaying input bytes.
pub async fn tolerant_csv_stream(
    store: &ObjectStore,
    path: &str,
    schema: SchemaRef,
    projection: Vec<usize>,
    format: &CsvFormat,
) -> Result<SendableRecordBatchStream> {
    // The object size bounds the byte range requested from the reader below.
    let object_size = store
        .stat(path)
        .await
        .context(error::ReadObjectSnafu { path })?
        .content_length();

    let object_reader = store
        .reader(path)
        .await
        .context(error::ReadObjectSnafu { path })?;
    let raw_bytes = object_reader
        .into_bytes_stream(0..object_size)
        .await
        .context(error::ReadObjectSnafu { path })?;

    // Apply the configured decompression before handing bytes to the decoder.
    let decompressed = format.compression_type.convert_stream(raw_bytes).boxed();

    tolerant_csv_stream_from_reader(
        decompressed,
        path,
        schema,
        projection,
        format.has_header,
        format.delimiter,
    )
}
/// Turns a stream of raw CSV bytes into a record-batch stream that drops
/// records whose flush fails with a skippable Arrow error.
///
/// `path` is used only in the warning logged for each skipped record.
/// `projection` selects the columns of `schema` that are decoded; the returned
/// stream advertises the projected schema. Returns an error up front when the
/// projection cannot be applied to `schema`.
fn tolerant_csv_stream_from_reader(
reader: BoxStream<'static, io::Result<Bytes>>,
path: &str,
schema: SchemaRef,
projection: Vec<usize>,
has_header: bool,
delimiter: u8,
) -> Result<SendableRecordBatchStream> {
// Schema reported by the output stream: only the projected columns.
let projected_schema = Arc::new(
schema
.project(&projection)
.context(error::InferSchemaSnafu)?,
);
// Push-style decoder limited to `SKIP_BAD_RECORDS_BATCH_SIZE` records per
// batch, so a failing flush discards no more than that many records.
let mut decoder = csv::ReaderBuilder::new(schema)
.with_header(has_header)
.with_delimiter(delimiter)
.with_batch_size(SKIP_BAD_RECORDS_BATCH_SIZE)
.with_projection(projection)
.build_decoder();
// State owned by the polling closure below.
let path = path.to_string();
let mut upstream = reader.fuse();
// Bytes received from upstream that the decoder has not consumed yet.
let mut buffered = Bytes::new();
// Set once upstream has yielded `None`.
let mut input_finished = false;
let stream = futures::stream::poll_fn(move |cx| {
loop {
// Phase 1: feed bytes to the decoder until it is ready to flush.
while !input_finished {
if buffered.is_empty() {
// Returns `Poll::Pending` from the closure when upstream is not ready.
match futures::ready!(upstream.poll_next_unpin(cx)) {
// Empty chunks carry no data; poll upstream again.
Some(Ok(bytes)) if bytes.is_empty() => continue,
Some(Ok(bytes)) => buffered = bytes,
Some(Err(error)) => return Poll::Ready(Some(Err(error.into()))),
// End of input: fall through so the decoder sees one last,
// empty `decode` call before flushing.
// NOTE(review): presumably this lets the decoder finish a trailing
// record without a final newline — confirm against arrow-csv docs.
None => input_finished = true,
}
}
// Errors raised by `decode` itself are never skipped; `?` propagates them.
let decoded = decoder.decode(buffered.as_ref())?;
if decoded > 0 {
buffered.advance(decoded);
continue;
}
// Nothing was consumed: flush when the decoder reports no remaining
// capacity or when there is no more input to offer.
if decoder.capacity() == 0 || input_finished {
break;
}
// Nothing consumed because nothing was buffered; fetch more input.
if buffered.is_empty() {
continue;
}
// Bytes are available and capacity remains, yet nothing was consumed:
// fail instead of spinning forever.
return Poll::Ready(Some(Err(ArrowError::ParseError(
"CSV decoder made no progress while input bytes remain".to_string(),
))));
}
// Phase 2: materialize whatever the decoder has buffered.
match decoder.flush() {
Ok(Some(batch)) => return Poll::Ready(Some(Ok(batch))),
// Nothing buffered and no more input: the stream is complete.
Ok(None) if input_finished => return Poll::Ready(None),
Ok(None) => continue,
// Skippable failure: log it and go around the outer loop again,
// which resumes decoding from the remaining input.
Err(error) if is_skippable_arrow_error(&error) => {
warn!(
"Skipping bad CSV record while copying from {}: {}",
path, error
);
}
// Any other flush error is surfaced to the consumer.
Err(error) => return Poll::Ready(Some(Err(error))),
}
}
})
// Convert Arrow errors into the error type expected by the stream adapter.
.map(|result: std::result::Result<RecordBatch, ArrowError>| result.map_err(Into::into));
Ok(Box::pin(RecordBatchStreamAdapter::new(
projected_schema,
stream,
)))
}
/// Returns `true` when `error` is one of the Arrow error kinds that the
/// tolerant CSV path treats as a bad record rather than a fatal failure:
/// parse, cast, compute and invalid-argument errors.
///
/// Every other `ArrowError` variant yields `false`.
pub fn is_skippable_arrow_error(error: &ArrowError) -> bool {
matches!(
error,
ArrowError::ParseError(_)
| ArrowError::CastError(_)
| ArrowError::ComputeError(_)
| ArrowError::InvalidArgumentError(_)
)
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use arrow_schema::{DataType, Field};
use common_recordbatch::adapter::DfRecordBatchStreamAdapter;
use common_recordbatch::{RecordBatch, RecordBatches};
use common_test_util::find_workspace_path;
@@ -205,7 +348,7 @@ mod tests {
use super::*;
use crate::file_format::{
FORMAT_COMPRESSION_TYPE, FORMAT_DELIMITER, FORMAT_HAS_HEADER,
FORMAT_SCHEMA_INFER_MAX_RECORD, FileFormat, file_to_stream,
FORMAT_SCHEMA_INFER_MAX_RECORD, FORMAT_SKIP_BAD_RECORDS, FileFormat, file_to_stream,
};
use crate::test_util::{format_schema, test_store};
@@ -331,11 +474,29 @@ mod tests {
schema_infer_max_record: Some(2000),
delimiter: b'\t',
has_header: false,
skip_bad_records: false,
timestamp_format: None,
time_format: None,
date_format: None
}
);
let map = HashMap::from([(FORMAT_SKIP_BAD_RECORDS.to_string(), "true".to_string())]);
let format = CsvFormat::try_from(&map).unwrap();
assert_eq!(
format,
CsvFormat {
skip_bad_records: true,
..CsvFormat::default()
}
);
}
#[test]
fn test_try_from_rejects_invalid_bool_options() {
    // "yes" is not a valid boolean spelling for the option parser.
    let mut options = HashMap::new();
    options.insert(FORMAT_SKIP_BAD_RECORDS.to_string(), "yes".to_string());

    let parsed = CsvFormat::try_from(&options);
    assert!(parsed.is_err());
}
#[tokio::test]
@@ -496,4 +657,63 @@ mod tests {
assert_eq!(expected, pretty_print);
}
}
#[tokio::test]
async fn test_tolerant_csv_stream_continues_after_parse_error() {
    // Rows 2 and 3 carry a non-numeric `id`; the tolerant stream must drop
    // both and still deliver the rows before and after them. The last row has
    // no trailing newline on purpose.
    let temp_dir = common_test_util::temp_dir::create_temp_dir("test_tolerant_csv_stream");
    let csv_file_path = temp_dir.path().join("input.csv");
    std::fs::write(
        &csv_file_path,
        "id,name,value\n1,Alice,10.5\nbad,Bad,20.0\nworse,Bad,21.0\n2,Bob,30.5",
    )
    .unwrap();

    let store = test_store("/");
    let schema = Arc::new(arrow_schema::Schema::new(vec![
        Field::new("id", DataType::UInt32, false),
        Field::new("name", DataType::Utf8, false),
        Field::new("value", DataType::Float64, false),
    ]));
    let path = csv_file_path.to_str().unwrap();
    let stream =
        tolerant_csv_stream(&store, path, schema, vec![0, 1, 2], &CsvFormat::default())
            .await
            .unwrap();
    let batches = stream.try_collect::<Vec<_>>().await.unwrap();
    let pretty_print = arrow::util::pretty::pretty_format_batches(&batches)
        .unwrap()
        .to_string();

    // Cells are padded to the column width implied by the border rows
    // (2, 5 and 5 characters), so every line has the same length.
    let expected = r#"+----+-------+-------+
| id | name  | value |
+----+-------+-------+
| 1  | Alice | 10.5  |
| 2  | Bob   | 30.5  |
+----+-------+-------+"#;
    assert_eq!(expected, pretty_print);
}
#[tokio::test]
async fn test_tolerant_csv_stream_fails_on_structural_csv_error() {
    // The second data row has two fields instead of three; this must surface
    // as an error instead of being skipped.
    let workdir =
        common_test_util::temp_dir::create_temp_dir("test_tolerant_csv_stream_csv_error");
    let input_path = workdir.path().join("input.csv");
    std::fs::write(&input_path, "id,name,value\n1,Alice,10.5\n2,Bob\n").unwrap();

    let fields = vec![
        Field::new("id", DataType::UInt32, false),
        Field::new("name", DataType::Utf8, false),
        Field::new("value", DataType::Float64, false),
    ];
    let schema = Arc::new(arrow_schema::Schema::new(fields));
    let store = test_store("/");
    let format = CsvFormat::default();

    let outcome = tolerant_csv_stream(
        &store,
        input_path.to_str().unwrap(),
        schema,
        vec![0, 1, 2],
        &format,
    )
    .await
    .unwrap()
    .try_collect::<Vec<_>>()
    .await;

    let error = outcome.unwrap_err();
    assert!(error.to_string().contains("incorrect number of fields"));
}
}

View File

@@ -18,7 +18,7 @@ use std::time::Duration;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use store_api::region_engine::SyncRegionFromRequest;
use store_api::region_request::RegionFlushReason;
use store_api::region_request::{RegionFlushReason, RegionRequirements};
use store_api::storage::{FileRefsManifest, GcReport, RegionId, RegionNumber};
use strum::Display;
use table::metadata::TableId;
@@ -179,12 +179,24 @@ impl Display for OpenRegion {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(
f,
"OpenRegion(region_ident={}, region_storage_path={})",
self.region_ident, self.region_storage_path
"OpenRegion(region_ident={}, region_storage_path={}, reason={:?})",
self.region_ident, self.region_storage_path, self.reason
)
}
}
/// The reason why an open region instruction is triggered.
///
/// Carried as the optional `reason` field of `OpenRegion`. With the derived
/// serde implementation each variant is written as its bare name, e.g.
/// `"reason":"RegionMigration"`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub enum OpenRegionReason {
/// Open triggered before region migration.
RegionMigration,
/// Open triggered by region failover.
RegionFailover,
/// Open triggered when adding a follower region.
///
/// Only compiled in when the `enterprise` feature is enabled.
#[cfg(feature = "enterprise")]
RegionFollower,
}
#[serde_with::serde_as]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct OpenRegion {
@@ -196,6 +208,10 @@ pub struct OpenRegion {
pub region_wal_options: HashMap<RegionNumber, String>,
#[serde(default)]
pub skip_wal_replay: bool,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub reason: Option<OpenRegionReason>,
#[serde(default)]
pub requirements: RegionRequirements,
}
impl OpenRegion {
@@ -205,6 +221,8 @@ impl OpenRegion {
region_options: HashMap<String, String>,
region_wal_options: HashMap<RegionNumber, String>,
skip_wal_replay: bool,
reason: Option<OpenRegionReason>,
requirements: RegionRequirements,
) -> Self {
Self {
region_ident,
@@ -212,6 +230,8 @@ impl OpenRegion {
region_options,
region_wal_options,
skip_wal_replay,
reason,
requirements,
}
}
}
@@ -1126,11 +1146,13 @@ mod tests {
HashMap::new(),
HashMap::new(),
false,
None,
RegionRequirements::empty(),
)]);
let serialized = serde_json::to_string(&open_region).unwrap();
assert_eq!(
r#"{"OpenRegions":[{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false}]}"#,
r#"{"OpenRegions":[{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false,"requirements":{"object_storage":false}}]}"#,
serialized
);
@@ -1213,6 +1235,8 @@ mod tests {
HashMap::new(),
HashMap::new(),
false,
None,
RegionRequirements::empty(),
)]);
assert_eq!(open_region_instruction, open_region);
@@ -1368,10 +1392,41 @@ mod tests {
region_options,
region_wal_options: HashMap::new(),
skip_wal_replay: false,
reason: None,
requirements: RegionRequirements::empty(),
};
assert_eq!(expected, deserialized);
}
#[test]
fn test_serialize_open_region_with_reason_and_requirements() {
    let reason = OpenRegionReason::RegionMigration;
    let region_ident = RegionIdent {
        datanode_id: 2,
        table_id: 1024,
        region_number: 1,
        engine: "mito2".to_string(),
    };
    let original = OpenRegion::new(
        region_ident,
        "test/foo",
        HashMap::new(),
        HashMap::new(),
        false,
        Some(reason),
        RegionRequirements::object_storage(),
    );

    // Both new fields must appear in the JSON form.
    let json = serde_json::to_string(&original).unwrap();
    for fragment in [r#""reason":"RegionMigration""#, r#""object_storage":true"#] {
        assert!(json.contains(fragment));
    }

    // And both must survive a round trip through deserialization.
    let round_tripped: OpenRegion = serde_json::from_str(&json).unwrap();
    assert_eq!(Some(reason), round_tripped.reason);
    assert_eq!(
        RegionRequirements::object_storage(),
        round_tripped.requirements
    );
}
#[test]
fn test_flush_regions_creation() {
let region_id = RegionId::new(1024, 1);

View File

@@ -14,6 +14,8 @@
//! Datanode configurations
use std::time::Duration;
use common_base::readable_size::ReadableSize;
use common_config::{Configurable, DEFAULT_DATA_HOME};
use common_options::memory::MemoryOptions;
@@ -75,6 +77,10 @@ pub struct DatanodeOptions {
pub wal: DatanodeWalConfig,
pub storage: StorageConfig,
pub max_concurrent_queries: usize,
/// Timeout to acquire a permit from the concurrent query limiter when
/// `max_concurrent_queries` is reached. Only effective when the limiter is enabled.
#[serde(with = "humantime_serde")]
pub concurrent_query_limiter_timeout: Duration,
/// Options for different store engines.
pub region_engine: Vec<RegionEngineConfig>,
pub logging: LoggingOptions,
@@ -131,6 +137,7 @@ impl Default for DatanodeOptions {
wal: DatanodeWalConfig::default(),
storage: StorageConfig::default(),
max_concurrent_queries: 0,
concurrent_query_limiter_timeout: Duration::from_millis(100),
region_engine: vec![
RegionEngineConfig::Mito(MitoConfig::default()),
RegionEngineConfig::File(FileEngineConfig::default()),

View File

@@ -445,8 +445,7 @@ impl DatanodeBuilder {
event_listener,
table_provider_factory,
opts.max_concurrent_queries,
//TODO: revaluate the hardcoded timeout on the next version of datanode concurrency limiter.
Duration::from_millis(100),
opts.concurrent_query_limiter_timeout,
opts.grpc.flight_compression,
);

View File

@@ -313,7 +313,7 @@ mod tests {
use mito2::test_util::{CreateRequestBuilder, TestEnv};
use store_api::path_utils::table_dir;
use store_api::region_engine::RegionRole;
use store_api::region_request::{RegionCloseRequest, RegionRequest};
use store_api::region_request::{RegionCloseRequest, RegionRequest, RegionRequirements};
use store_api::storage::RegionId;
use tokio::sync::mpsc::{self, Receiver};
@@ -442,6 +442,8 @@ mod tests {
HashMap::new(),
HashMap::new(),
false,
None,
RegionRequirements::empty(),
)])
}

View File

@@ -14,6 +14,7 @@
use common_meta::instruction::{InstructionReply, OpenRegion, SimpleReply};
use common_meta::wal_provider::prepare_wal_options;
use common_telemetry::info;
use store_api::path_utils::table_dir;
use store_api::region_request::{PathType, RegionOpenRequest};
use store_api::storage::RegionId;
@@ -41,8 +42,13 @@ impl InstructionHandler for OpenRegionsHandler {
mut region_options,
region_wal_options,
skip_wal_replay,
reason,
requirements,
} = open_region;
let region_id = RegionId::new(region_ident.table_id, region_ident.region_number);
info!(
"Received open region instruction, region_id: {region_id}, reason: {reason:?}"
);
prepare_wal_options(&mut region_options, region_id, &region_wal_options);
let request = RegionOpenRequest {
engine: region_ident.engine,
@@ -51,6 +57,7 @@ impl InstructionHandler for OpenRegionsHandler {
options: region_options,
skip_wal_replay,
checkpoint: None,
requirements,
};
(region_id, request)
})
@@ -85,7 +92,7 @@ mod tests {
use mito2::engine::MITO_ENGINE_NAME;
use mito2::test_util::{CreateRequestBuilder, TestEnv};
use store_api::path_utils::table_dir;
use store_api::region_request::{RegionCloseRequest, RegionRequest};
use store_api::region_request::{RegionCloseRequest, RegionRequest, RegionRequirements};
use store_api::storage::RegionId;
use crate::heartbeat::handler::RegionHeartbeatResponseHandler;
@@ -98,17 +105,21 @@ mod tests {
) -> Instruction {
let region_idents = region_ids
.into_iter()
.map(|region_id| OpenRegion {
region_ident: RegionIdent {
datanode_id: 0,
table_id: region_id.table_id(),
region_number: region_id.region_number(),
engine: MITO_ENGINE_NAME.to_string(),
},
region_storage_path: storage_path.to_string(),
region_options: HashMap::new(),
region_wal_options: HashMap::new(),
skip_wal_replay: false,
.map(|region_id| {
OpenRegion::new(
RegionIdent {
datanode_id: 0,
table_id: region_id.table_id(),
region_number: region_id.region_number(),
engine: MITO_ENGINE_NAME.to_string(),
},
storage_path,
HashMap::new(),
HashMap::new(),
false,
None,
RegionRequirements::empty(),
)
})
.collect();

View File

@@ -49,6 +49,7 @@ use common_telemetry::{debug, error, info, warn};
use dashmap::DashMap;
use datafusion::datasource::TableProvider;
use datafusion_common::tree_node::TreeNode;
use datatypes::schema::SchemaRef;
use either::Either;
use futures_util::Stream;
use futures_util::future::try_join_all;
@@ -82,7 +83,7 @@ use store_api::region_request::{
RegionOpenRequest, RegionRequest,
};
use store_api::storage::RegionId;
use tokio::sync::{Semaphore, SemaphorePermit};
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
use tokio::time::timeout;
use tonic::{Request, Response, Result as TonicResult};
@@ -257,7 +258,7 @@ impl RegionServer {
request: api::v1::region::QueryRequest,
query_ctx: QueryContextRef,
) -> Result<SendableRecordBatchStream> {
let _permit = if let Some(p) = &self.inner.parallelism {
let permit = if let Some(p) = &self.inner.parallelism {
Some(p.acquire().await?)
} else {
None
@@ -298,14 +299,13 @@ impl RegionServer {
)
.await?;
Ok(wrap_flow_region_watermark_stream(
stream, region_id, &query_ctx,
))
let stream = wrap_flow_region_watermark_stream(stream, region_id, &query_ctx);
Ok(maybe_guard_stream(stream, permit))
}
#[tracing::instrument(skip_all)]
pub async fn handle_read(&self, request: QueryRequest) -> Result<SendableRecordBatchStream> {
let _permit = if let Some(p) = &self.inner.parallelism {
let permit = if let Some(p) = &self.inner.parallelism {
Some(p.acquire().await?)
} else {
None
@@ -332,9 +332,8 @@ impl RegionServer {
.handle_read(QueryRequest { plan, ..request }, query_ctx.clone())
.await?;
Ok(wrap_flow_region_watermark_stream(
stream, region_id, &query_ctx,
))
let stream = wrap_flow_region_watermark_stream(stream, region_id, &query_ctx);
Ok(maybe_guard_stream(stream, permit))
}
/// Returns all opened and reportable regions.
@@ -1058,7 +1057,7 @@ struct RegionServerInner {
}
struct RegionServerParallelism {
semaphore: Semaphore,
semaphore: Arc<Semaphore>,
timeout: Duration,
}
@@ -1071,19 +1070,68 @@ impl RegionServerParallelism {
return None;
}
Some(RegionServerParallelism {
semaphore: Semaphore::new(max_concurrent_queries),
semaphore: Arc::new(Semaphore::new(max_concurrent_queries)),
timeout: concurrent_query_limiter_timeout,
})
}
pub async fn acquire(&self) -> Result<SemaphorePermit<'_>> {
timeout(self.timeout, self.semaphore.acquire())
pub async fn acquire(&self) -> Result<OwnedSemaphorePermit> {
timeout(self.timeout, self.semaphore.clone().acquire_owned())
.await
.context(ConcurrentQueryLimiterTimeoutSnafu)?
.context(ConcurrentQueryLimiterClosedSnafu)
}
}
/// A record batch stream adapter that owns a concurrency permit for as long as
/// the adapter itself is alive.
///
/// The permit is stored next to the wrapped stream and is never handed out, so
/// it is only given up when this value is dropped. That makes
/// `max_concurrent_queries` bound the number of in-flight read streams, not
/// just query planning.
struct PermitGuardedStream {
    /// The stream every call is forwarded to.
    inner: SendableRecordBatchStream,
    /// Kept only for its lifetime; never read.
    _permit: OwnedSemaphorePermit,
}

/// Pure delegation: the guard reports exactly what the wrapped stream reports.
impl RecordBatchStream for PermitGuardedStream {
    fn name(&self) -> &str {
        self.inner.name()
    }

    fn schema(&self) -> SchemaRef {
        self.inner.schema()
    }

    fn output_ordering(&self) -> Option<&[OrderOption]> {
        self.inner.output_ordering()
    }

    fn metrics(&self) -> Option<RecordBatchMetrics> {
        self.inner.metrics()
    }
}

impl Stream for PermitGuardedStream {
    type Item = common_recordbatch::error::Result<RecordBatch>;

    /// Polls the wrapped stream and returns its result untouched.
    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        let this = self.get_mut();
        this.inner.as_mut().poll_next(cx)
    }
}
/// Attaches `permit` to `stream` so the permit lives exactly as long as the
/// returned stream does.
///
/// When `permit` is `None` (no permit was acquired), `stream` is returned as-is
/// without any wrapping.
fn maybe_guard_stream(
    stream: SendableRecordBatchStream,
    permit: Option<OwnedSemaphorePermit>,
) -> SendableRecordBatchStream {
    let Some(permit) = permit else {
        return stream;
    };

    Box::pin(PermitGuardedStream {
        inner: stream,
        _permit: permit,
    })
}
enum CurrentEngine {
Engine(RegionEngineRef),
EarlyReturn(AffectedRows),
@@ -2057,6 +2105,7 @@ mod tests {
options: Default::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -2235,6 +2284,7 @@ mod tests {
options: Default::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
},
),
(
@@ -2246,6 +2296,7 @@ mod tests {
options: Default::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
},
),
],
@@ -2268,6 +2319,7 @@ mod tests {
options: Default::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
},
),
(
@@ -2279,6 +2331,7 @@ mod tests {
options: Default::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
},
),
],

View File

@@ -175,6 +175,7 @@ pub async fn build_region_open_requests(
options,
skip_wal_replay: false,
checkpoint,
requirements: Default::default(),
},
));
}
@@ -193,6 +194,7 @@ pub async fn build_region_open_requests(
options,
skip_wal_replay: true,
checkpoint: None,
requirements: Default::default(),
},
));
}

View File

@@ -32,7 +32,7 @@ use store_api::region_engine::{
};
use store_api::region_request::{
AffectedRows, RegionCloseRequest, RegionCreateRequest, RegionDropRequest, RegionOpenRequest,
RegionRequest,
RegionRequest, RegionRequirements,
};
use store_api::storage::{RegionId, ScanRequest, SequenceNumber};
use tokio::sync::Mutex;
@@ -186,6 +186,24 @@ struct EngineInner {
type EngineInnerRef = Arc<EngineInner>;
fn ensure_open_requirements(
requirements: RegionRequirements,
object_store: &ObjectStore,
) -> EngineResult<()> {
if !requirements.object_storage {
return Ok(());
}
ensure!(
object_store::util::is_object_storage(object_store),
UnsupportedSnafu {
operation: "open region with object storage requirement on non-object storage"
}
);
Ok(())
}
impl EngineInner {
fn new(object_store: ObjectStore) -> Self {
Self {
@@ -289,6 +307,8 @@ impl EngineInner {
return Ok(0);
}
ensure_open_requirements(request.requirements, &self.object_store)?;
let res = FileRegion::open(region_id, request, &self.object_store).await;
let region = res.inspect_err(|err| {
error!(
@@ -356,3 +376,53 @@ impl EngineInner {
self.regions.read().unwrap().contains_key(&region_id)
}
}
// Unit tests for `ensure_open_requirements` against different object store backends.
#[cfg(test)]
mod tests {
use object_store::services::{Fs, S3};
use super::*;
use crate::error::Error;
// Builds a local-filesystem object store rooted at `/tmp`.
fn build_fs_object_store() -> ObjectStore {
ObjectStore::new(Fs::default().root("/tmp"))
.unwrap()
.finish()
}
// Builds an S3 object store pointing at a placeholder bucket/region.
// NOTE(review): presumably construction alone performs no network I/O — confirm.
fn build_s3_object_store() -> ObjectStore {
ObjectStore::new(
S3::default()
.bucket("test-bucket")
.region("us-east-1")
.disable_ec2_metadata(),
)
.unwrap()
.finish()
}
// With no requirements set, a filesystem store is accepted.
#[test]
fn test_empty_open_requirements_are_supported() {
ensure_open_requirements(RegionRequirements::empty(), &build_fs_object_store()).unwrap();
}
// An object-storage requirement on a filesystem store yields `Error::Unsupported`.
#[test]
fn test_object_storage_open_requirement_rejects_fs_object_store() {
let err = ensure_open_requirements(
RegionRequirements::object_storage(),
&build_fs_object_store(),
)
.unwrap_err();
assert!(matches!(err, Error::Unsupported { .. }));
}
// An object-storage requirement on an S3 store is accepted.
#[test]
fn test_object_storage_open_requirement_accepts_s3_object_store() {
ensure_open_requirements(
RegionRequirements::object_storage(),
&build_s3_object_store(),
)
.unwrap();
}
}

View File

@@ -181,6 +181,7 @@ mod tests {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
};
let region = FileRegion::open(region_id, request, &object_store)
@@ -238,6 +239,7 @@ mod tests {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
};
let err = FileRegion::open(region_id, request, &object_store)
.await

View File

@@ -630,8 +630,11 @@ impl BatchingEngine {
let engine = self.query_engine.clone();
let frontend = self.frontend_client.clone();
// check execute once first to detect any error early
// Create sink table if needed, then validate an existing/created sink schema before
// spawning the background task. This catches user-created sink schema mismatches at
// CREATE FLOW time instead of surfacing them later in the execution loop.
task.check_or_create_sink_table(&engine, &frontend).await?;
task.validate_sink_table_schema(&engine).await?;
let (start_tx, start_rx) = oneshot::channel();

View File

@@ -265,6 +265,36 @@ impl BatchingTask {
Ok(None)
}
/// Checks that this flow's query output can be matched against the sink table schema.
///
/// Looks up the sink table, then runs `gen_plan_with_matching_schema` with the sink's
/// schema, primary key indices and merge mode. The resulting plan is discarded; only a
/// failure is reported. Nothing is executed and no dirty-window filter is added, which
/// makes this suitable for catching sink table mismatches at CREATE FLOW time.
pub async fn validate_sink_table_schema(&self, engine: &QueryEngineRef) -> Result<(), Error> {
    let catalog_manager = self.config.catalog_manager.clone();
    let sink_table_name = self.config.sink_table_name.clone();
    let (table, _) = get_table_info_df_schema(catalog_manager, sink_table_name).await?;

    let sink_meta = &table.table_info().meta;
    let last_non_null = is_merge_mode_last_non_null(&sink_meta.options.extra_options);
    let pk_indices = sink_meta.primary_key_indices.clone();

    // The read guard is a temporary and is released at the end of this statement,
    // so it is not held across the `.await` below.
    let query_ctx = self.state.read().unwrap().query_ctx.clone();

    gen_plan_with_matching_schema(
        &self.config.query,
        query_ctx,
        engine.clone(),
        sink_meta.schema.clone(),
        &pk_indices,
        last_non_null,
    )
    .await?;

    Ok(())
}
async fn is_table_exist(&self, table_name: &[String; 3]) -> Result<bool, Error> {
self.config
.catalog_manager
@@ -929,7 +959,7 @@ impl BatchingTask {
let (expire_lower_bound, expire_upper_bound) =
match (expire_time_window_bound, &self.config.query_type) {
(Some((Some(l), Some(u))), QueryType::Sql) => (l, u),
(None, QueryType::Sql) => {
(None, QueryType::Sql) if self.config.flow_eval_interval.is_none() => {
// if it's sql query and no time window lower/upper bound is found, just return the original query(with auto columns)
// use sink_table_meta to add to query the `update_at` and `__ts_placeholder` column's value too for compatibility reason
debug!(
@@ -950,7 +980,8 @@ impl BatchingTask {
}
_ => {
// Clean dirty windows for full-query/non-scoped paths,
// such as TQL, that cannot use a time-window filter.
// such as TQL or evaluation-interval SQL without a recognized
// time-window expression, that cannot use a time-window filter.
let (_, dirty_windows_to_restore) = self.drain_dirty_windows_signal();
let plan_info = self

View File

@@ -974,6 +974,38 @@ async fn test_non_scoped_path_generates_plan_with_empty_dirty_signal() {
assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
}
// A SQL flow with no time-window expression but with an evaluation interval configured must
// still produce a plan when there are no dirty windows, and that plan may advance checkpoints.
#[tokio::test]
async fn test_no_time_window_sql_with_eval_interval_generates_plan_without_dirty_signal() {
let TestTaskParts {
mut task,
query_engine,
..
} = new_test_task_engine_and_plan_with_query(
"SELECT number, ts FROM numbers_with_ts",
"missing_sink",
)
.await;
// Mutate the shared config in place; this requires the `Arc` to have no other owners.
Arc::get_mut(&mut task.config)
.expect("test task config should be uniquely owned")
.flow_eval_interval = Some(Duration::from_secs(60));
// Start from an empty dirty-window set so no dirty signal is available.
task.state.write().unwrap().dirty_time_windows.clean();
let sink_schema = Arc::new(Schema::new(vec![
ColumnSchema::new("number", CDT::uint32_datatype(), false),
ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false).with_time_index(true),
]));
let plan = task
.gen_query_with_time_window(query_engine, &sink_schema, &[], false, None)
.await
.unwrap()
.expect(
"eval-interval SQL without a time-window expr should run by interval, not dirty signal",
);
assert!(plan.can_advance_checkpoints);
// Generating the plan must not have introduced any dirty windows.
assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
}
#[tokio::test]
async fn test_executed_query_failure_restores_scoped_dirty_windows_for_flush_path() {
let (task, plan) = new_test_task_and_plan_with_missing_sink().await;

View File

@@ -33,9 +33,10 @@ use datafusion_common::{
};
use datafusion_expr::logical_plan::{Aggregate, TableScan};
use datafusion_expr::{
Distinct, JoinType, LogicalPlan, LogicalPlanBuilder, Operator, Projection, and, binary_expr,
bitwise_and, bitwise_or, bitwise_xor, is_null, or, when,
Distinct, ExprSchemable, JoinType, LogicalPlan, LogicalPlanBuilder, Operator, Projection, and,
binary_expr, bitwise_and, bitwise_or, bitwise_xor, is_null, or, when,
};
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::{ColumnSchema, SchemaRef};
use query::QueryEngineRef;
use query::parser::{DEFAULT_LOOKBACK_STRING, PromQuery, QueryLanguageParser, QueryStatement};
@@ -955,7 +956,7 @@ pub(crate) async fn gen_plan_with_matching_schema(
.clone()
.rewrite(&mut add_auto_column)
.with_context(|_| DatafusionSnafu {
context: format!("Failed to rewrite plan:\n {}\n", plan),
context: "Failed to rewrite plan".to_string(),
})?
.data;
Ok(plan)
@@ -1090,33 +1091,23 @@ impl ColumnMatcherRewriter {
}
/// modify the exprs in place so that it matches the schema and some auto columns are added
fn modify_project_exprs(&mut self, mut exprs: Vec<Expr>) -> DfResult<Vec<Expr>> {
fn modify_project_exprs(
&mut self,
mut exprs: Vec<Expr>,
input_schema: &DFSchema,
) -> DfResult<Vec<Expr>> {
if self.allow_partial {
return self.modify_project_exprs_with_partial(exprs);
}
let original_exprs = exprs.clone();
let all_names = self
.schema
.column_schemas()
.iter()
.map(|c| c.name.clone())
.collect::<BTreeSet<_>>();
// first match by position
for (idx, expr) in exprs.iter_mut().enumerate() {
if !all_names.contains(&expr.qualified_name().1)
&& let Some(col_name) = self
.schema
.column_schemas()
.get(idx)
.map(|c| c.name.clone())
{
// if the data type mismatched, later check_execute will error out
// hence no need to check it here, beside, optimize pass might be able to cast it
// so checking here is not necessary
*expr = expr.clone().alias(col_name);
}
}
// add columns if have different column count
let query_col_cnt = exprs.len();
let table_col_cnt = self.schema.column_schemas().len();
@@ -1140,10 +1131,9 @@ impl ColumnMatcherRewriter {
// is the update at column
exprs.push(datafusion::prelude::now().alias(&last_col_schema.name));
} else {
// helpful error message
return Err(DataFusionError::Plan(format!(
"Expect the last column in table to be timestamp column, found column {} with type {:?}",
last_col_schema.name, last_col_schema.data_type
return Err(DataFusionError::Plan(format_flow_sink_schema_mismatch(
&original_exprs,
self.schema.as_ref(),
)));
}
} else if query_col_cnt + 2 == table_col_cnt {
@@ -1170,14 +1160,110 @@ impl ColumnMatcherRewriter {
)));
}
} else {
return Err(DataFusionError::Plan(format!(
"Expect table have 0,1 or 2 columns more than query columns, found {} query columns {:?}, {} table columns {:?}",
query_col_cnt,
exprs,
table_col_cnt,
self.schema.column_schemas()
return Err(DataFusionError::Plan(format_flow_sink_schema_mismatch(
&original_exprs,
self.schema.as_ref(),
)));
}
self.match_extra_output_columns(exprs, input_schema, &original_exprs, &all_names)
}
/// Match flow output columns whose names are not in the sink schema by the same position only.
///
/// This keeps the legacy "omit output aliases and map by position" behavior, but only when the
/// sink column at the same index is actually missing from the flow output. If the extra output
/// would be aliased to a sink column that already exists elsewhere, report a schema mismatch
/// instead of guessing another sink column by type.
///
/// In particular, this intentionally rejects cross-position remaps like
/// `record_time_window2 -> record_time_window`: they are easy to confuse with real schema
/// mismatches and should be fixed by giving the flow output the sink column name explicitly.
///
/// Arguments:
/// - `exprs`: the projection expressions to adjust; returned with positional aliases applied.
/// - `input_schema`: schema used to resolve the data type of each expression.
/// - `original_exprs`: the expressions as first received, used for most error messages.
/// - `all_names`: the set of sink column names.
///
/// Returns a `DataFusionError::Plan` when the extra/missing columns cannot be paired up by
/// index, when a pair has obviously incompatible types, or when the final output names
/// contain duplicates. Resolving an expression's type may also fail.
fn match_extra_output_columns(
&self,
mut exprs: Vec<Expr>,
input_schema: &DFSchema,
original_exprs: &[Expr],
all_names: &BTreeSet<String>,
) -> DfResult<Vec<Expr>> {
// Unqualified output name of every expression, kept in sync with `exprs` as aliases are applied.
let mut output_names = exprs
.iter()
.map(|expr| expr.qualified_name().1)
.collect::<Vec<_>>();
let output_name_set = output_names.iter().cloned().collect::<BTreeSet<_>>();
// Indices of output expressions whose name is not a sink column name.
let extra_expr_indices = output_names
.iter()
.enumerate()
.filter_map(|(idx, name)| (!all_names.contains(name)).then_some(idx))
.collect::<Vec<_>>();
// Indices of sink columns that no output expression is named after.
let missing_sink_indices = self
.schema
.column_schemas()
.iter()
.enumerate()
.filter_map(|(idx, column)| (!output_name_set.contains(&column.name)).then_some(idx))
.collect::<Vec<_>>();
// Nothing to reconcile: every name is accounted for on both sides.
if extra_expr_indices.is_empty() && missing_sink_indices.is_empty() {
return Ok(exprs);
}
// Extras and missing columns must pair up one-to-one.
if extra_expr_indices.len() != missing_sink_indices.len() {
return Err(DataFusionError::Plan(format_flow_sink_schema_mismatch(
original_exprs,
self.schema.as_ref(),
)));
}
// Human-readable record of each alias applied, only used for the debug log below.
let mut positional_matches = Vec::new();
for expr_idx in extra_expr_indices {
// Only a same-index pairing is accepted; a cross-position remap is a mismatch.
if !missing_sink_indices.contains(&expr_idx) {
return Err(DataFusionError::Plan(format_flow_sink_schema_mismatch(
original_exprs,
self.schema.as_ref(),
)));
}
let target_col_schema = &self.schema.column_schemas()[expr_idx];
let expr_type =
ConcreteDataType::from_arrow_type(&exprs[expr_idx].get_type(input_schema)?);
// Reject pairings whose type families clearly differ before aliasing.
if is_obviously_incompatible_positional_match(&expr_type, &target_col_schema.data_type)
{
return Err(DataFusionError::Plan(format!(
"Cannot match flow output column '{}' to sink column '{}' by position: incompatible data types, flow output type is {:?}, sink column type is {:?}. {}",
output_names[expr_idx],
target_col_schema.name,
expr_type,
target_col_schema.data_type,
format_flow_sink_schema_mismatch(original_exprs, self.schema.as_ref())
)));
}
let target_name = target_col_schema.name.clone();
positional_matches.push(format!(
"{} -> {} (flow output type: {:?}, sink column type: {:?})",
output_names[expr_idx], target_name, expr_type, target_col_schema.data_type
));
// Alias the expression to the sink column name and record the new output name.
exprs[expr_idx] = exprs[expr_idx].clone().alias(target_name.clone());
output_names[expr_idx] = target_name;
}
if !positional_matches.is_empty() {
debug!(
"Matched flow output columns to sink columns by position: {:?}",
positional_matches
);
}
// Final guard: the output names (after aliasing) must be unique.
let duplicated_output_names = duplicate_names(&output_names);
if !duplicated_output_names.is_empty() {
return Err(DataFusionError::Plan(format!(
"Flow output schema contains duplicate column(s) after schema matching {:?}. {}",
duplicated_output_names,
format_flow_sink_schema_mismatch(&exprs, self.schema.as_ref())
)));
}
Ok(exprs)
}
@@ -1186,12 +1272,9 @@ impl ColumnMatcherRewriter {
let query_col_cnt = exprs.len();
if query_col_cnt > table_col_cnt {
return Err(DataFusionError::Plan(format!(
"Expect query column count <= table column count, found {} query columns {:?}, {} table columns {:?}",
query_col_cnt,
exprs,
table_col_cnt,
self.schema.column_schemas()
return Err(DataFusionError::Plan(format_flow_sink_schema_mismatch(
&exprs,
self.schema.as_ref(),
)));
}
@@ -1209,8 +1292,9 @@ impl ColumnMatcherRewriter {
.collect();
if !missing.is_empty() {
return Err(DataFusionError::Plan(format!(
"Column(s) {:?} required by sink table are missing from flow output when merge_mode=last_non_null",
missing
"Column(s) {:?} required by sink table are missing from flow output when merge_mode=last_non_null. {}",
missing,
format_flow_sink_schema_mismatch(&exprs, self.schema.as_ref())
)));
}
@@ -1250,8 +1334,9 @@ impl ColumnMatcherRewriter {
if !remap.is_empty() {
let extra: Vec<_> = remap.keys().cloned().collect();
return Err(DataFusionError::Plan(format!(
"Flow output has extra column(s) {:?} not found in sink schema when merge_mode=last_non_null",
extra
"Flow output has extra column(s) {:?} not found in sink schema when merge_mode=last_non_null. {}",
extra,
format_flow_sink_schema_mismatch(&exprs, self.schema.as_ref())
)));
}
@@ -1281,6 +1366,80 @@ impl ColumnMatcherRewriter {
}
}
/// Returns `true` when aliasing a flow output of `expr_type` onto a sink column of
/// `sink_type` is clearly wrong.
///
/// This is deliberately a coarse type-family comparison for legacy positional aliasing
/// rather than strict equality: an untyped NULL or an identical type is always accepted,
/// and two types are only rejected when they disagree on being a timestamp, string,
/// boolean, JSON or vector type. Differences inside one family (for example numeric
/// width or sign) are not flagged here and are left to downstream coercion.
fn is_obviously_incompatible_positional_match(
    expr_type: &ConcreteDataType,
    sink_type: &ConcreteDataType,
) -> bool {
    if expr_type.is_null() || expr_type == sink_type {
        return false;
    }

    // One flag per type family; the types are incompatible iff any flag differs.
    let family_flags = |ty: &ConcreteDataType| {
        [
            ty.is_timestamp(),
            ty.is_string(),
            ty.is_boolean(),
            ty.is_json(),
            ty.is_vector(),
        ]
    };

    family_flags(expr_type) != family_flags(sink_type)
}
/// Returns every name that occurs more than once in `names`.
///
/// Each duplicated name is reported once, and the result is in ascending order.
fn duplicate_names(names: &[String]) -> Vec<String> {
    let mut first_occurrences = HashSet::new();

    names
        .iter()
        .map(String::as_str)
        // `insert` returns `false` for a name that was already seen, i.e. a repeat.
        .filter(|name| !first_occurrences.insert(*name))
        // The ordered set both de-duplicates the repeats and sorts them.
        .collect::<BTreeSet<&str>>()
        .into_iter()
        .map(str::to_string)
        .collect()
}
/// Builds the shared "flow output does not match sink table" diagnostic.
///
/// The message lists both column counts, both column name lists in their original order,
/// and the two differences between them (flow columns absent from the sink, and sink
/// columns absent from the flow output), each sorted and de-duplicated.
fn format_flow_sink_schema_mismatch(
    query_exprs: &[Expr],
    sink_schema: &datatypes::schema::Schema,
) -> String {
    /// Names from `candidates` that do not appear in `reference`, sorted and unique.
    fn names_only_in(candidates: &[String], reference: &[String]) -> Vec<String> {
        let known: HashSet<&str> = reference.iter().map(String::as_str).collect();
        let mut only_here: Vec<String> = candidates
            .iter()
            .filter(|name| !known.contains(name.as_str()))
            .cloned()
            .collect();
        only_here.sort();
        only_here.dedup();
        only_here
    }

    let flow_output_columns: Vec<String> = query_exprs
        .iter()
        .map(|expr| expr.qualified_name().1)
        .collect();
    let sink_table_columns: Vec<String> = sink_schema
        .column_schemas()
        .iter()
        .map(|col| col.name.clone())
        .collect();

    let extra_flow_columns = names_only_in(&flow_output_columns, &sink_table_columns);
    let missing_sink_columns = names_only_in(&sink_table_columns, &flow_output_columns);

    format!(
        "Flow output schema does not match sink table schema: found {} flow output columns and {} sink table columns. flow output columns: {:?}, sink table columns: {:?}, extra flow columns not in sink: {:?}, missing sink columns from flow output: {:?}",
        flow_output_columns.len(),
        sink_table_columns.len(),
        flow_output_columns,
        sink_table_columns,
        extra_flow_columns,
        missing_sink_columns
    )
}
impl TreeNodeRewriter for ColumnMatcherRewriter {
type Node = LogicalPlan;
fn f_down(&mut self, mut node: Self::Node) -> DfResult<Transformed<Self::Node>> {
@@ -1327,7 +1486,7 @@ impl TreeNodeRewriter for ColumnMatcherRewriter {
// if not, wrap it in a projection
if let LogicalPlan::Projection(project) = &node {
let exprs = project.expr.clone();
let exprs = self.modify_project_exprs(exprs)?;
let exprs = self.modify_project_exprs(exprs, project.input.schema())?;
self.is_rewritten = true;
let new_plan =
@@ -1341,7 +1500,7 @@ impl TreeNodeRewriter for ColumnMatcherRewriter {
field.name(),
)));
}
let exprs = self.modify_project_exprs(exprs)?;
let exprs = self.modify_project_exprs(exprs, node.schema())?;
self.is_rewritten = true;
let new_plan =
LogicalPlan::Projection(Projection::try_new(exprs, Arc::new(node.clone()))?);

View File

@@ -14,6 +14,7 @@
use std::sync::Arc;
use catalog::RegisterTableRequest;
use common_recordbatch::RecordBatch;
use common_time::Timestamp;
use datafusion_common::tree_node::TreeNode as _;
@@ -29,7 +30,9 @@ use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan};
use table::test_util::MemTable;
use super::*;
use crate::batching_mode::BatchingModeOptions;
use crate::batching_mode::state::FilterExprInfo;
use crate::batching_mode::task::{BatchingTask, TaskArgs};
use crate::test_utils::create_test_query_engine;
fn u32_table(table_name: &str, columns: Vec<&str>, rows: usize) -> TableRef {
@@ -432,9 +435,7 @@ async fn test_add_auto_column_rewriter() {
// error datatype mismatch
(
"SELECT number, ts FROM numbers_with_ts",
Err(
"Expect the last column in table to be timestamp column, found column atat with type Int8",
),
Err("missing sink columns from flow output: [\"atat\"]"),
vec![
ColumnSchema::new("number", ConcreteDataType::int32_datatype(), true),
ColumnSchema::new(
@@ -498,6 +499,383 @@ async fn test_add_auto_column_rewriter() {
}
}
// The query yields four output columns (`number`, `extra`, `ts`, `max(number)`) while the
// sink has three. The error must be the generic schema-mismatch report naming the extra
// column, and must not mention a positional alias such as `extra AS ts`.
#[tokio::test]
async fn test_gen_plan_with_matching_schema_reports_extra_flow_columns_before_positional_alias() {
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
let sink_schema = Arc::new(Schema::new(vec![
ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
ColumnSchema::new(
"ts",
ConcreteDataType::timestamp_millisecond_datatype(),
false,
)
.with_time_index(true),
ColumnSchema::new(
"max(numbers_with_ts.number)",
ConcreteDataType::uint32_datatype(),
true,
),
]));
let err = gen_plan_with_matching_schema(
"SELECT number, number AS extra, ts, max(number) FROM numbers_with_ts GROUP BY number, ts",
ctx,
query_engine,
sink_schema,
&[],
false,
)
.await
.unwrap_err()
.to_string();
// Each assertion prints the full error on failure to ease debugging.
assert!(
err.contains("Flow output schema does not match sink table schema"),
"{err}"
);
assert!(err.contains("flow output columns"), "{err}");
assert!(err.contains("sink table columns"), "{err}");
assert!(err.contains("extra flow columns not in sink"), "{err}");
assert!(err.contains("extra"), "{err}");
assert!(
!err.contains("extra AS ts"),
"schema error should not primarily expose positional alias: {err}"
);
}
// The second output column `not_time` is an alias of `number`, while the sink column at the
// same position, `event_time`, is a timestamp. Positional matching must fail with an
// "incompatible data types" error rather than aliasing `not_time` to `event_time`.
#[tokio::test]
async fn test_gen_plan_with_matching_schema_rejects_positional_alias_type_mismatch() {
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
let sink_schema = Arc::new(Schema::new(vec![
ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
ColumnSchema::new(
"event_time",
ConcreteDataType::timestamp_millisecond_datatype(),
false,
)
.with_time_index(true),
ColumnSchema::new(
"max(numbers_with_ts.number)",
ConcreteDataType::uint32_datatype(),
true,
),
]));
let err = gen_plan_with_matching_schema(
"SELECT number, number AS not_time, max(number) FROM numbers_with_ts GROUP BY number",
ctx,
query_engine,
sink_schema,
&[],
false,
)
.await
.unwrap_err()
.to_string();
assert!(
err.contains(
"Cannot match flow output column 'not_time' to sink column 'event_time' by position"
),
"{err}"
);
assert!(err.contains("incompatible data types"), "{err}");
assert!(
!err.contains("not_time AS event_time"),
"schema error should not expose an incompatible positional alias: {err}"
);
}
// The extra output column `time_window2` sits at index 2, but the sink column missing from
// the output, `time_window`, sits at index 1. Such a cross-position remap must be reported
// as a schema mismatch, and must not surface as a `DuplicateUnqualifiedField` error.
#[tokio::test]
async fn test_gen_plan_with_matching_schema_rejects_cross_position_extra_column_match() {
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
let sink_schema = Arc::new(Schema::new(vec![
ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
ColumnSchema::new(
"time_window",
ConcreteDataType::timestamp_millisecond_datatype(),
false,
)
.with_time_index(true),
ColumnSchema::new(
"ts",
ConcreteDataType::timestamp_millisecond_datatype(),
true,
),
]));
let err = gen_plan_with_matching_schema(
"SELECT number, ts, date_bin('5 minutes', ts) AS time_window2 FROM numbers_with_ts GROUP BY number, ts, time_window2",
ctx,
query_engine,
sink_schema,
&[],
false,
)
.await
.unwrap_err()
.to_string();
assert!(
err.contains("Flow output schema does not match sink table schema"),
"{err}"
);
assert!(err.contains("time_window2"), "{err}");
assert!(err.contains("time_window"), "{err}");
assert!(!err.contains("DuplicateUnqualifiedField"), "{err}");
}
// Every output column name exists in the sink schema, only in a different order. Planning
// must succeed, keep the query's column order, and produce no duplicate output names.
#[tokio::test]
async fn test_gen_plan_with_matching_schema_accepts_out_of_order_matching_names() {
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
let sink_schema = Arc::new(Schema::new(vec![
ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
ColumnSchema::new(
"time_window",
ConcreteDataType::timestamp_millisecond_datatype(),
false,
)
.with_time_index(true),
ColumnSchema::new(
"ts",
ConcreteDataType::timestamp_millisecond_datatype(),
true,
),
]));
let plan = gen_plan_with_matching_schema(
"SELECT number, ts, date_bin('5 minutes', ts) AS time_window FROM numbers_with_ts GROUP BY number, ts, time_window",
ctx,
query_engine,
sink_schema,
&[],
false,
)
.await
.unwrap();
let output_names = plan
.schema()
.fields()
.iter()
.map(|field| field.name().clone())
.collect::<Vec<_>>();
// Output order follows the SELECT list, not the sink schema.
assert_eq!(
output_names,
vec![
"number".to_string(),
"ts".to_string(),
"time_window".to_string()
]
);
assert!(duplicate_names(&output_names).is_empty());
}
// The first output column `number` has no same-named sink column, and the sink column at
// the same position is `renamed_number` (int64). The positional alias must be applied, as
// shown by the `AS renamed_number` in the generated SQL.
#[tokio::test]
async fn test_gen_plan_with_matching_schema_allows_numeric_positional_alias() {
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
let sink_schema = Arc::new(Schema::new(vec![
ColumnSchema::new("renamed_number", ConcreteDataType::int64_datatype(), true),
ColumnSchema::new(
"ts",
ConcreteDataType::timestamp_millisecond_datatype(),
false,
)
.with_time_index(true),
]));
let plan = gen_plan_with_matching_schema(
"SELECT number, ts FROM numbers_with_ts",
ctx,
query_engine,
sink_schema,
&[],
false,
)
.await
.unwrap();
let sql = df_plan_to_sql(&plan).unwrap();
assert_eq!(
"SELECT numbers_with_ts.number AS renamed_number, numbers_with_ts.ts FROM numbers_with_ts",
sql
);
}
#[tokio::test]
async fn test_gen_plan_with_matching_schema_allows_null_positional_alias() {
    // The flow emits `NULL AS label_placeholder` in the position of the
    // sink's nullable string column `label`; the resulting plan must expose
    // the sink's column names.
    let engine = create_test_query_engine();
    let query_ctx = QueryContext::arc();

    let sink_columns = vec![
        ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
        ColumnSchema::new("label", ConcreteDataType::string_datatype(), true),
    ];
    let sink_schema = Arc::new(Schema::new(sink_columns));

    let plan = gen_plan_with_matching_schema(
        "SELECT number, NULL AS label_placeholder FROM numbers_with_ts",
        query_ctx,
        engine,
        sink_schema,
        &[],
        false,
    )
    .await
    .unwrap();

    let mut output_names = Vec::new();
    for field in plan.schema().fields().iter() {
        output_names.push(field.name().clone());
    }
    let rendered = df_plan_to_sql(&plan).unwrap();

    let expected_names = vec!["number".to_string(), "label".to_string()];
    assert_eq!(output_names, expected_names);
    // NOTE(review): "NULL AS label" is also a prefix of the original
    // "NULL AS label_placeholder"; the name assertion above is what proves
    // the rename happened.
    assert!(rendered.contains("NULL AS label"), "{rendered}");
}
#[tokio::test]
async fn test_gen_plan_with_matching_schema_accepts_matching_flow_schema() {
    // Flow output and sink schema list the same four columns in the same
    // order, so the plan should round-trip to SQL without added aliases.
    let engine = create_test_query_engine();
    let query_ctx = QueryContext::arc();

    let number_column = ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true);
    let extra_column = ColumnSchema::new("extra", ConcreteDataType::uint32_datatype(), true);
    let ts_column = ColumnSchema::new(
        "ts",
        ConcreteDataType::timestamp_millisecond_datatype(),
        false,
    )
    .with_time_index(true);
    let max_column = ColumnSchema::new(
        "max(numbers_with_ts.number)",
        ConcreteDataType::uint32_datatype(),
        true,
    );
    let sink_schema = Arc::new(Schema::new(vec![
        number_column,
        extra_column,
        ts_column,
        max_column,
    ]));

    let flow_sql = "SELECT number, number AS extra, ts, max(number) FROM numbers_with_ts GROUP BY number, ts";
    let plan = gen_plan_with_matching_schema(flow_sql, query_ctx, engine, sink_schema, &[], false)
        .await
        .unwrap();

    let rendered = df_plan_to_sql(&plan).unwrap();
    let expected = "SELECT numbers_with_ts.number, numbers_with_ts.number AS extra, numbers_with_ts.ts, max(numbers_with_ts.number) FROM numbers_with_ts GROUP BY numbers_with_ts.number, numbers_with_ts.ts";
    assert_eq!(expected, rendered);
}
#[tokio::test]
async fn test_validate_sink_table_schema_rejects_existing_sink_missing_flow_column() {
    // The flow emits `number`, `extra` and `max(number)`, but the registered
    // sink table only has `number` and `max(numbers_with_ts.number)`.
    // Validation must fail and name the `extra` column.
    let query_engine = create_test_query_engine();
    let query_ctx = QueryContext::arc();
    let sql = "SELECT number, number AS extra, max(number) FROM numbers_with_ts GROUP BY number";
    let flow_plan = sql_to_df_plan(query_ctx.clone(), query_engine.clone(), sql, true)
        .await
        .unwrap();

    // Register the pre-existing sink table in an in-memory catalog.
    let catalog_manager = catalog::memory::new_memory_catalog_manager().unwrap();
    let sink_table_name = ["greptime", "public", "existing_sink"].map(String::from);
    let existing_sink = u32_table(
        "existing_sink",
        vec!["number", "max(numbers_with_ts.number)"],
        0,
    );
    let register_request = RegisterTableRequest {
        catalog: sink_table_name[0].clone(),
        schema: sink_table_name[1].clone(),
        table_name: sink_table_name[2].clone(),
        table_id: 4096,
        table: existing_sink,
    };
    catalog_manager
        .register_table_sync(register_request)
        .unwrap();

    // The sender half is bound to a named variable so it stays alive until
    // the end of the test.
    let (_shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel();
    let source_table_name = ["greptime", "public", "numbers_with_ts"].map(String::from);
    let task_args = TaskArgs {
        flow_id: 1,
        query: sql,
        plan: flow_plan,
        time_window_expr: None,
        expire_after: None,
        sink_table_name,
        source_table_names: vec![source_table_name],
        query_ctx,
        catalog_manager,
        shutdown_rx,
        batch_opts: Arc::new(BatchingModeOptions::default()),
        flow_eval_interval: None,
    };
    let task = BatchingTask::try_new(task_args).unwrap();

    let validation = task.validate_sink_table_schema(&query_engine).await;
    let message = validation.unwrap_err().to_string();
    assert!(
        message.contains("Flow output schema does not match sink table schema"),
        "{message}"
    );
    assert!(message.contains("extra"), "{message}");
}
#[tokio::test]
async fn test_gen_plan_with_matching_schema_allow_partial_fills_nullable_columns() {
    // With the final flag set to `true` and `number` (index 0) passed as the
    // primary key, a flow that omits the nullable `optional_value` column is
    // expected to get it appended as `NULL AS optional_value`.
    let engine = create_test_query_engine();
    let query_ctx = QueryContext::arc();

    let number_column = ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), false);
    let ts_column = ColumnSchema::new(
        "ts",
        ConcreteDataType::timestamp_millisecond_datatype(),
        false,
    )
    .with_time_index(true);
    let optional_column =
        ColumnSchema::new("optional_value", ConcreteDataType::uint32_datatype(), true);
    let sink_schema = Arc::new(Schema::new(vec![number_column, ts_column, optional_column]));

    let plan = gen_plan_with_matching_schema(
        "SELECT number, ts FROM numbers_with_ts",
        query_ctx,
        engine,
        sink_schema,
        &[0],
        true,
    )
    .await
    .unwrap();

    let rendered = df_plan_to_sql(&plan).unwrap();
    let expected = "SELECT numbers_with_ts.number, numbers_with_ts.ts, NULL AS optional_value FROM numbers_with_ts";
    assert_eq!(expected, rendered);
}
#[tokio::test]
async fn test_find_group_by_exprs() {
let testcases = vec![
@@ -1491,3 +1869,118 @@ async fn test_analyze_incremental_aggregate_plan_rejects_cast_wrapped_alias() {
);
}
}
#[tokio::test]
async fn test_gen_plan_with_matching_schema_last_non_null_rejects_missing_primary_key_column() {
    let engine = create_test_query_engine();
    let query_ctx = QueryContext::arc();

    // Sink table: primary key index 0 (`number`), time index `ts`, and
    // merge_mode=last_non_null. The flow query below leaves out `number`,
    // which the sink requires as a primary-key column.
    let number_column = ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true);
    let ts_column = ColumnSchema::new(
        "ts",
        ConcreteDataType::timestamp_millisecond_datatype(),
        false,
    )
    .with_time_index(true);
    let optional_column =
        ColumnSchema::new("optional_value", ConcreteDataType::uint32_datatype(), true);
    let sink_schema = Arc::new(Schema::new(vec![number_column, ts_column, optional_column]));

    let outcome = gen_plan_with_matching_schema(
        "SELECT ts FROM numbers_with_ts",
        query_ctx,
        engine,
        sink_schema,
        &[0],
        true,
    )
    .await;
    let message = outcome.unwrap_err().to_string();

    let expected_fragment =
        "required by sink table are missing from flow output when merge_mode=last_non_null";
    assert!(message.contains(expected_fragment), "{message}");
    // NOTE(review): "number" is also a prefix of the source table name
    // `numbers_with_ts`; confirm the message cannot match through that.
    assert!(message.contains("number"), "{message}");
}
#[tokio::test]
async fn test_gen_plan_with_matching_schema_last_non_null_rejects_missing_time_index_column() {
    let engine = create_test_query_engine();
    let query_ctx = QueryContext::arc();

    // Sink table: primary key index 0 (`number`), time index `ts`, and
    // merge_mode=last_non_null. The flow query below leaves out `ts`,
    // which the sink requires as its time-index column.
    let number_column = ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true);
    let ts_column = ColumnSchema::new(
        "ts",
        ConcreteDataType::timestamp_millisecond_datatype(),
        false,
    )
    .with_time_index(true);
    let optional_column =
        ColumnSchema::new("optional_value", ConcreteDataType::uint32_datatype(), true);
    let sink_schema = Arc::new(Schema::new(vec![number_column, ts_column, optional_column]));

    let outcome = gen_plan_with_matching_schema(
        "SELECT number FROM numbers_with_ts",
        query_ctx,
        engine,
        sink_schema,
        &[0],
        true,
    )
    .await;
    let message = outcome.unwrap_err().to_string();

    let expected_fragment =
        "required by sink table are missing from flow output when merge_mode=last_non_null";
    assert!(message.contains(expected_fragment), "{message}");
    // NOTE(review): "ts" is a very short needle and also the suffix of
    // `numbers_with_ts`; confirm the message cannot match through that.
    assert!(message.contains("ts"), "{message}");
}
#[tokio::test]
async fn test_gen_plan_with_matching_schema_last_non_null_rejects_extra_flow_column() {
    let engine = create_test_query_engine();
    let query_ctx = QueryContext::arc();

    // Sink table with merge_mode=last_non_null and three columns:
    // `number` (primary key), `ts` (time index), `optional_value` (nullable).
    // The flow emits `number`, `number AS extra`, `ts`, so `extra` has no
    // counterpart in the sink. Both sides have three columns, i.e.
    // query_col_cnt(3) <= table_col_cnt(3), so the extra-column branch is
    // the one exercised.
    let number_column = ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true);
    let ts_column = ColumnSchema::new(
        "ts",
        ConcreteDataType::timestamp_millisecond_datatype(),
        false,
    )
    .with_time_index(true);
    let optional_column =
        ColumnSchema::new("optional_value", ConcreteDataType::uint32_datatype(), true);
    let sink_schema = Arc::new(Schema::new(vec![number_column, ts_column, optional_column]));

    let outcome = gen_plan_with_matching_schema(
        "SELECT number, number AS extra, ts FROM numbers_with_ts",
        query_ctx,
        engine,
        sink_schema,
        &[0],
        true,
    )
    .await;
    let message = outcome.unwrap_err().to_string();

    assert!(message.contains("extra column(s)"), "{message}");
    // NOTE(review): implied by the assertion above, because
    // "extra column(s)" already contains "extra".
    assert!(message.contains("extra"), "{message}");
    assert!(
        message.contains("Flow output schema does not match sink table schema"),
        "{message}"
    );
}

View File

@@ -288,7 +288,6 @@ where
let http_server = builder
.with_metrics_handler(MetricsHandler)
.with_plugins(self.plugins.clone())
.with_greptime_config_options(toml)
.build();
Ok(http_server)

View File

@@ -1344,7 +1344,7 @@ mod tests {
// Generates rough 10MB data, which is larger than the default grpc message size limit.
for i in 0..10 {
let data: Vec<u8> = (0..1024 * 1024).map(|_| rng.random()).collect();
let data: Vec<u8> = (0..1024 * 1024).map(|_| rng.random::<u8>()).collect();
in_memory
.put(
PutRequest::new()

View File

@@ -18,7 +18,9 @@ use std::ops::Div;
use api::v1::meta::MailboxMessage;
use common_meta::RegionIdent;
use common_meta::distributed_time_constants::default_distributed_time_constants;
use common_meta::instruction::{Instruction, InstructionReply, OpenRegion, SimpleReply};
use common_meta::instruction::{
Instruction, InstructionReply, OpenRegion, OpenRegionReason, SimpleReply,
};
use common_meta::key::datanode_table::RegionInfo;
use common_procedure::{Context as ProcedureContext, Status};
use common_telemetry::info;
@@ -26,12 +28,13 @@ use common_telemetry::tracing_context::TracingContext;
use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt};
use store_api::region_engine::RegionRole;
use store_api::region_request::RegionRequirements;
use tokio::time::Instant;
use crate::error::{self, Result};
use crate::handler::HeartbeatMailbox;
use crate::procedure::region_migration::flush_leader_region::PreFlushRegion;
use crate::procedure::region_migration::{Context, State};
use crate::procedure::region_migration::{Context, RegionMigrationTriggerReason, State};
use crate::service::mailbox::Channel;
#[derive(Debug, Serialize, Deserialize)]
@@ -67,6 +70,10 @@ impl OpenCandidateRegion {
let region_ids = ctx.persistent_ctx.region_ids.clone();
let from_peer_id = ctx.persistent_ctx.from_peer.id;
let to_peer_id = ctx.persistent_ctx.to_peer.id;
let reason = match ctx.persistent_ctx.trigger_reason {
RegionMigrationTriggerReason::Failover => OpenRegionReason::RegionFailover,
_ => OpenRegionReason::RegionMigration,
};
let datanode_table_values = ctx.get_from_peer_datanode_table_values().await?;
let mut open_regions = Vec::with_capacity(region_ids.len());
@@ -97,6 +104,8 @@ impl OpenCandidateRegion {
region_options,
region_wal_options,
true,
Some(reason),
RegionRequirements::object_storage(),
));
}
@@ -233,18 +242,20 @@ mod tests {
}
fn new_mock_open_instruction(datanode_id: DatanodeId, region_id: RegionId) -> Instruction {
Instruction::OpenRegions(vec![OpenRegion {
region_ident: RegionIdent {
Instruction::OpenRegions(vec![OpenRegion::new(
RegionIdent {
datanode_id,
table_id: region_id.table_id(),
region_number: region_id.region_number(),
engine: MITO2_ENGINE.to_string(),
},
region_storage_path: "/bar/foo/region/".to_string(),
region_options: Default::default(),
region_wal_options: Default::default(),
skip_wal_replay: true,
}])
"/bar/foo/region/",
Default::default(),
Default::default(),
true,
Some(OpenRegionReason::RegionMigration),
RegionRequirements::object_storage(),
)])
}
#[tokio::test]
@@ -263,6 +274,57 @@ mod tests {
assert!(!err.is_retryable());
}
#[tokio::test]
async fn test_build_open_region_instruction_reason() {
    // Builds the open-region instruction twice: once with the persistent
    // context as returned by `new_persistent_context()`, then with the
    // trigger reason switched to `Failover`. It checks the `reason` and
    // `requirements` carried by the first `OpenRegion` each time.
    let state = OpenCandidateRegion;
    let mut persistent_ctx = new_persistent_context();
    let source_peer_id = persistent_ctx.from_peer.id;
    let target_region = persistent_ctx.region_ids[0];
    let env = TestingEnv::new();

    // Seed table metadata so the source peer's datanode table values resolve.
    let table_info = new_test_table_info(1024);
    let route = RegionRoute {
        region: Region::new_test(target_region),
        leader_peer: Some(Peer::empty(source_peer_id)),
        ..Default::default()
    };
    env.table_metadata_manager()
        .create_table_metadata(
            table_info,
            TableRouteValue::physical(vec![route]),
            HashMap::default(),
        )
        .await
        .unwrap();

    // First pass: unmodified context, expecting the migration reason.
    let mut migration_ctx = env.context_factory().new_context(persistent_ctx.clone());
    let migration_instruction = state
        .build_open_region_instruction(&mut migration_ctx)
        .await
        .unwrap();
    let migration_regions = migration_instruction.into_open_regions().unwrap();
    let first_migration = &migration_regions[0];
    assert_eq!(
        Some(OpenRegionReason::RegionMigration),
        first_migration.reason
    );
    assert_eq!(
        RegionRequirements::object_storage(),
        first_migration.requirements
    );

    // Second pass: failover-triggered migration, expecting the failover reason.
    persistent_ctx.trigger_reason = RegionMigrationTriggerReason::Failover;
    let mut failover_ctx = env.context_factory().new_context(persistent_ctx);
    let failover_instruction = state
        .build_open_region_instruction(&mut failover_ctx)
        .await
        .unwrap();
    let failover_regions = failover_instruction.into_open_regions().unwrap();
    let first_failover = &failover_regions[0];
    assert_eq!(
        Some(OpenRegionReason::RegionFailover),
        first_failover.reason
    );
    assert_eq!(
        RegionRequirements::object_storage(),
        first_failover.requirements
    );
}
#[tokio::test]
async fn test_datanode_is_unreachable() {
let state = OpenCandidateRegion;

View File

@@ -620,6 +620,7 @@ mod test {
options: physical_region_option,
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
};
engine
.handle_request(physical_region_id, RegionRequest::Open(open_request))
@@ -644,6 +645,7 @@ mod test {
options: HashMap::new(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
};
engine
.handle_request(
@@ -721,6 +723,7 @@ mod test {
options: physical_region_option,
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
};
// Opening an already opened region should succeed.
// Since the region is already open, no metadata recovery operations will be performed.
@@ -749,6 +752,7 @@ mod test {
options: physical_region_option,
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
};
let err = metric_engine
.handle_request(physical_region_id, RegionRequest::Open(open_request))
@@ -854,6 +858,7 @@ mod test {
options: options.clone(),
skip_wal_replay: true,
checkpoint: None,
requirements: Default::default(),
},
)
})

View File

@@ -222,6 +222,7 @@ impl MetricEngineInner {
entry_id: checkpoint.metadata_entry_id.unwrap_or_default(),
metadata_entry_id: None,
}),
requirements: request.requirements,
};
let mut data_region_options = request.options;
@@ -239,6 +240,7 @@ impl MetricEngineInner {
entry_id: checkpoint.entry_id,
metadata_entry_id: None,
}),
requirements: request.requirements,
};
(open_metadata_region_request, open_data_region_request)

View File

@@ -321,6 +321,7 @@ mod tests {
options: physical_region_option,
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await

View File

@@ -144,6 +144,7 @@ impl TestEnv {
options: physical_region_option,
skip_wal_replay: true,
checkpoint: None,
requirements: Default::default(),
}),
)
.await

View File

@@ -8,6 +8,7 @@ license.workspace = true
default = []
test = ["common-test-util", "rstest", "rstest_reuse", "rskafka"]
testing = ["test"]
test-shared-fs-region-migration = []
enterprise = []
vector_index = ["dep:roaring", "index/vector_index"]

View File

@@ -277,6 +277,7 @@ async fn test_alter_region_with_format(flat_format: bool) {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -481,6 +482,7 @@ async fn test_put_after_alter_with_format(flat_format: bool) {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -844,6 +846,7 @@ async fn test_alter_column_fulltext_options_with_format(flat_format: bool) {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -979,6 +982,7 @@ async fn test_alter_column_set_inverted_index_with_format(flat_format: bool) {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -1248,6 +1252,7 @@ async fn test_alter_region_sst_format_with_flush() {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -1366,6 +1371,7 @@ async fn test_alter_region_sst_format_without_flush() {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -1492,6 +1498,7 @@ async fn test_alter_region_sst_format_flat_to_pk_with_flush() {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -1610,6 +1617,7 @@ async fn test_alter_region_sst_format_flat_to_pk_without_flush() {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -1725,6 +1733,7 @@ async fn test_alter_region_append_mode_with_flush() {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -1843,6 +1852,7 @@ async fn test_alter_region_append_mode_without_flush() {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await

View File

@@ -348,6 +348,7 @@ async fn test_alter_append_mode_clears_merge_mode_with_format(flat_format: bool)
options,
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await

View File

@@ -196,6 +196,7 @@ async fn test_region_replay_with_format(factory: Option<LogStoreFactory>, flat_f
options,
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await

View File

@@ -160,6 +160,7 @@ async fn test_batch_catchup_with_format(factory: Option<LogStoreFactory>, flat_f
skip_wal_replay: true,
path_type: PathType::Bare,
checkpoint: None,
requirements: Default::default(),
},
)
})

View File

@@ -136,6 +136,7 @@ async fn test_batch_open_with_format(factory: Option<LogStoreFactory>, flat_form
skip_wal_replay: false,
path_type: PathType::Bare,
checkpoint: None,
requirements: Default::default(),
},
)
})
@@ -149,6 +150,7 @@ async fn test_batch_open_with_format(factory: Option<LogStoreFactory>, flat_form
skip_wal_replay: false,
path_type: PathType::Bare,
checkpoint: None,
requirements: Default::default(),
},
));
@@ -221,6 +223,7 @@ async fn test_batch_open_err_with_format(factory: Option<LogStoreFactory>, flat_
skip_wal_replay: false,
path_type: PathType::Bare,
checkpoint: None,
requirements: Default::default(),
},
)
})

View File

@@ -112,6 +112,7 @@ async fn test_bump_committed_sequence_with_format(flat_format: bool) {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -151,6 +152,7 @@ async fn test_bump_committed_sequence_with_format(flat_format: bool) {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await

View File

@@ -97,6 +97,7 @@ async fn test_catchup_with_last_entry_id(factory: Option<LogStoreFactory>) {
options,
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -218,6 +219,7 @@ async fn test_catchup_with_incorrect_last_entry_id(factory: Option<LogStoreFacto
options,
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -321,6 +323,7 @@ async fn test_catchup_without_last_entry_id(factory: Option<LogStoreFactory>) {
options,
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -423,6 +426,7 @@ async fn test_catchup_with_manifest_update(factory: Option<LogStoreFactory>) {
options,
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -527,6 +531,7 @@ async fn open_region(
skip_wal_replay,
path_type: PathType::Bare,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -622,6 +627,7 @@ async fn test_local_catchup(factory: Option<LogStoreFactory>) {
skip_wal_replay: true,
path_type: PathType::Bare,
checkpoint: None,
requirements: Default::default(),
}),
)
.await

View File

@@ -1023,6 +1023,7 @@ async fn test_change_region_compaction_window_with_format(flat_format: bool) {
options: Default::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -1125,6 +1126,7 @@ async fn test_open_overwrite_compaction_window_with_format(flat_format: bool) {
options,
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await

View File

@@ -64,6 +64,7 @@ async fn test_engine_open_empty_with_format(flat_format: bool) {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -110,6 +111,7 @@ async fn test_engine_open_existing_with_format(flat_format: bool) {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -237,6 +239,7 @@ async fn test_engine_region_open_with_options_with_format(flat_format: bool) {
options: HashMap::from([("ttl".to_string(), "4d".to_string())]),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -297,6 +300,7 @@ async fn test_engine_region_open_with_custom_store_with_format(flat_format: bool
options: HashMap::from([("storage".to_string(), "Gcs".to_string())]),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -392,6 +396,7 @@ async fn test_open_region_skip_wal_replay_with_format(flat_format: bool) {
options: Default::default(),
skip_wal_replay: true,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -431,6 +436,7 @@ async fn test_open_region_skip_wal_replay_with_format(flat_format: bool) {
options: Default::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -484,6 +490,7 @@ async fn test_open_region_wait_for_opening_region_ok_with_format(flat_format: bo
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -535,6 +542,7 @@ async fn test_open_region_wait_for_opening_region_err_with_format(flat_format: b
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -691,6 +699,7 @@ async fn test_open_backfills_partition_expr_with_fetcher() {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -725,6 +734,7 @@ async fn test_open_backfills_partition_expr_with_fetcher() {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -766,6 +776,7 @@ async fn test_open_keeps_none_without_fetcher() {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await

View File

@@ -52,6 +52,7 @@ async fn scan_in_parallel(
skip_wal_replay: false,
path_type: PathType::Bare,
checkpoint: None,
requirements: Default::default(),
}),
)
.await

View File

@@ -87,6 +87,7 @@ async fn test_close_region_skip_wal(insert: bool) {
options: request.options.clone(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -154,6 +155,7 @@ async fn test_close_follower_region_skip_wal() {
options: request.options.clone(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -271,6 +273,7 @@ async fn test_close_region_after_truncate_skip_wal() {
options: request.options,
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await

View File

@@ -127,6 +127,7 @@ async fn test_sync_after_flush_region_with_format(flat_format: bool) {
// Ensure the region is not replayed from the WAL.
skip_wal_replay: true,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -239,6 +240,7 @@ async fn test_sync_after_alter_region_with_format(flat_format: bool) {
// Ensure the region is not replayed from the WAL.
skip_wal_replay: true,
checkpoint: None,
requirements: Default::default(),
}),
)
.await

View File

@@ -323,6 +323,7 @@ async fn test_engine_truncate_reopen_with_format(flat_format: bool) {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await
@@ -447,6 +448,7 @@ async fn test_engine_truncate_during_flush_with_format(flat_format: bool) {
options: HashMap::default(),
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)
.await

View File

@@ -916,6 +916,20 @@ pub enum Error {
source: Arc<Error>,
},
// Reported when a region open request carries a requirement the region does
// not satisfy. The display message renders, in order, the region id, the
// name of the unmet requirement and the reason it is unmet.
#[snafu(display(
"Region {} does not satisfy open requirement '{}': {}",
region_id,
requirement,
reason
))]
OpenRegionRequirement {
// Region whose open request was rejected.
region_id: RegionId,
// Short static name of the requirement that was not met.
requirement: &'static str,
// Static explanation of why the requirement is not met.
reason: &'static str,
// Captured implicitly by snafu at the construction site.
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to parse job id"))]
ParseJobId {
#[snafu(implicit)]
@@ -1376,6 +1390,7 @@ impl ErrorExt for Error {
PrimaryKeyLengthMismatch { .. } => StatusCode::InvalidArguments,
InvalidSender { .. } => StatusCode::InvalidArguments,
InvalidSchedulerState { .. } => StatusCode::InvalidArguments,
OpenRegionRequirement { .. } => StatusCode::InvalidArguments,
DeleteSsts { .. } | DeleteIndex { .. } | DeleteIndexes { .. } => {
StatusCode::StorageUnavailable
}

View File

@@ -27,8 +27,9 @@ use futures::future::BoxFuture;
use log_store::kafka::log_store::KafkaLogStore;
use log_store::noop::log_store::NoopLogStore;
use log_store::raft_engine::log_store::RaftEngineLogStore;
use object_store::ObjectStore;
use object_store::manager::ObjectStoreManagerRef;
use object_store::util::normalize_dir;
use object_store::util::{is_object_storage, normalize_dir};
use snafu::{OptionExt, ResultExt, ensure};
use store_api::logstore::LogStore;
use store_api::logstore::provider::Provider;
@@ -36,7 +37,7 @@ use store_api::metadata::{
ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
};
use store_api::region_engine::RegionRole;
use store_api::region_request::PathType;
use store_api::region_request::{PathType, RegionRequirements};
use store_api::storage::{ColumnId, RegionId};
use tokio::sync::Semaphore;
@@ -46,8 +47,8 @@ use crate::cache::file_cache::{FileCache, FileType, IndexKey};
use crate::config::MitoConfig;
use crate::error;
use crate::error::{
EmptyRegionDirSnafu, InvalidMetadataSnafu, ObjectStoreNotFoundSnafu, RegionCorruptedSnafu,
Result, StaleLogEntrySnafu,
EmptyRegionDirSnafu, InvalidMetadataSnafu, InvalidRegionOptionsSnafu, ObjectStoreNotFoundSnafu,
RegionCorruptedSnafu, Result, StaleLogEntrySnafu,
};
use crate::manifest::action::RegionManifest;
use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions};
@@ -206,6 +207,29 @@ impl RegionOpener {
Ok(self)
}
/// Ensures the current region open request satisfies its requirements.
pub(crate) fn ensure_open_requirements(&self, requirements: RegionRequirements) -> Result<()> {
if !requirements.object_storage {
return Ok(());
}
let options = self.options.as_ref().context(InvalidRegionOptionsSnafu {
reason: "missing region options before requirement check".to_string(),
})?;
let object_store = get_object_store(&options.storage, &self.object_store_manager)?;
ensure!(
supports_open_region_object_storage_requirement(&object_store),
error::OpenRegionRequirementSnafu {
region_id: self.region_id,
requirement: "object storage",
reason: "region data must be accessible from another datanode",
}
);
Ok(())
}
/// Sets the cache manager for the region.
pub(crate) fn cache(mut self, cache_manager: Option<CacheManagerRef>) -> Self {
self.cache_manager = cache_manager;
@@ -597,6 +621,21 @@ impl RegionOpener {
}
}
/// Default build: the requirement is met exactly when `is_object_storage`
/// accepts the store.
#[cfg(not(feature = "test-shared-fs-region-migration"))]
fn supports_open_region_object_storage_requirement(object_store: &ObjectStore) -> bool {
    is_object_storage(object_store)
}

/// Build with `test-shared-fs-region-migration`: stores with the FS scheme
/// are accepted as well.
///
/// Integration tests may point several datanodes at one shared temporary
/// home dir, which makes file storage reachable from every test datanode.
/// Production file storage still does not meet this requirement.
#[cfg(feature = "test-shared-fs-region-migration")]
fn supports_open_region_object_storage_requirement(object_store: &ObjectStore) -> bool {
    if is_object_storage(object_store) {
        return true;
    }
    // NOTE(review): `is_object_storage` is `scheme != FS_SCHEME`, so this
    // variant accepts every store when the feature is enabled.
    object_store.info().scheme() == object_store::services::FS_SCHEME
}
/// Creates a version builder from a region manifest.
pub(crate) fn version_builder_from_manifest(
manifest: &RegionManifest,
@@ -1172,14 +1211,17 @@ mod tests {
use datatypes::arrow::array::{ArrayRef, BinaryArray, Int64Array};
use datatypes::arrow::record_batch::RecordBatch;
use object_store::ObjectStore;
use object_store::services::{Fs, Memory};
use object_store::services::{Fs, Memory, S3};
use parquet::arrow::ArrowWriter;
use parquet::file::metadata::KeyValue;
use parquet::file::properties::WriterProperties;
use store_api::region_request::PathType;
use store_api::storage::{FileId, RegionId};
use super::{preload_parquet_meta_cache_for_files, sanitize_region_options};
use super::{
preload_parquet_meta_cache_for_files, sanitize_region_options,
supports_open_region_object_storage_requirement,
};
use crate::cache::CacheManager;
use crate::cache::file_cache::{FileType, IndexKey};
use crate::manifest::action::{RegionManifest, RemovedFilesRecord};
@@ -1207,6 +1249,48 @@ mod tests {
}
}
/// Builds an object store from the `Fs` service rooted at `/tmp`.
fn build_fs_object_store() -> ObjectStore {
    let fs_builder = Fs::default().root("/tmp");
    let store_builder = ObjectStore::new(fs_builder).unwrap();
    store_builder.finish()
}
#[test]
#[cfg(not(feature = "test-shared-fs-region-migration"))]
fn test_open_requirement_rejects_fs_object_store() {
    // Without the shared-fs test feature an FS-backed store must not satisfy
    // the object storage requirement.
    let fs_store = build_fs_object_store();
    let supported = supports_open_region_object_storage_requirement(&fs_store);
    assert!(!supported);
}
#[test]
#[cfg(feature = "test-shared-fs-region-migration")]
fn test_open_requirement_accepts_shared_fs_object_store_for_tests() {
    // With the shared-fs test feature enabled an FS-backed store is accepted.
    let fs_store = build_fs_object_store();
    let supported = supports_open_region_object_storage_requirement(&fs_store);
    assert!(supported);
}
#[test]
fn test_open_requirement_accepts_s3_object_store() {
    // An S3-backed store satisfies the requirement in both feature
    // configurations. EC2 metadata lookup is disabled on the builder.
    let s3_builder = S3::default()
        .bucket("test-bucket")
        .region("us-east-1")
        .disable_ec2_metadata();
    let s3_store = ObjectStore::new(s3_builder).unwrap().finish();

    let supported = supports_open_region_object_storage_requirement(&s3_store);
    assert!(supported);
}
#[test]
fn test_sanitize_region_options_options_format_wins() {
// Manifest persisted PrimaryKey, but the re-parsed options now request Flat

View File

@@ -1307,6 +1307,7 @@ pub async fn reopen_region(
skip_wal_replay: false,
path_type: PathType::Bare,
checkpoint: None,
requirements: Default::default(),
}),
)
.await

View File

@@ -87,14 +87,11 @@ impl<S: LogStore> RegionWorkerLoop<S> {
else {
return;
};
if let Err(err) = self.check_and_cleanup_region(region_id, &request).await {
sender.send(Err(err));
return;
}
info!("Try to open region {}, worker: {}", region_id, self.id);
sanitize_open_request_options(&mut request.options);
// Open region from specific region dir.
let requirements = request.requirements;
let opener = match RegionOpener::new(
region_id,
&request.table_dir,
@@ -112,7 +109,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
.cache(Some(self.cache_manager.clone()))
.wal_entry_reader(wal_entry_receiver.map(|receiver| Box::new(receiver) as _))
.replay_checkpoint(request.checkpoint.map(|checkpoint| checkpoint.entry_id))
.parse_options(request.options)
.parse_options(request.options.clone())
{
Ok(opener) => opener,
Err(err) => {
@@ -121,6 +118,16 @@ impl<S: LogStore> RegionWorkerLoop<S> {
}
};
if let Err(err) = opener.ensure_open_requirements(requirements) {
sender.send(Err(err));
return;
}
if let Err(err) = self.check_and_cleanup_region(region_id, &request).await {
sender.send(Err(err));
return;
}
let now = Instant::now();
let regions = self.regions.clone();
let wal = self.wal.clone();

View File

@@ -22,11 +22,17 @@ use opendal::layers::{
LoggingInterceptor, LoggingLayer, RetryEvent, RetryInterceptor, RetryLayer, TracingLayer,
};
use opendal::raw::{AccessorInfo, HttpClient, Operation};
use opendal::services::FS_SCHEME;
use snafu::ResultExt;
use crate::config::HttpClientConfig;
use crate::{ObjectStore, error};
/// Reports whether `object_store` uses any scheme other than `FS_SCHEME`,
/// i.e. whether it is not backed by the local filesystem.
pub fn is_object_storage(object_store: &ObjectStore) -> bool {
    let info = object_store.info();
    info.scheme() != FS_SCHEME
}
/// Join two paths and normalize the output dir.
///
/// The output dir always ends with `/`, e.g.
@@ -249,7 +255,11 @@ impl RetryInterceptor for PrintDetailedError {
#[cfg(test)]
mod tests {
use opendal::services::Fs;
use super::*;
use crate::ObjectStore;
use crate::util::is_object_storage;
#[test]
fn test_normalize_dir() {
@@ -289,4 +299,14 @@ mod tests {
assert_eq!("/abc", join_path("//", "/abc"));
assert_eq!("abc/def", join_path("abc/", "//def"));
}
#[test]
fn test_fs_is_not_object_storage() {
    // A store built on the local `Fs` service reports the filesystem scheme,
    // so it must not be classified as object storage.
    let builder = Fs::default().root("/tmp");
    let fs_store = ObjectStore::new(builder).unwrap().finish();

    assert_eq!(FS_SCHEME, fs_store.info().scheme());
    assert!(!is_object_storage(&fs_store));
}
}

View File

@@ -15,11 +15,15 @@
use std::collections::HashMap;
use std::future::Future;
use std::path::Path;
use std::pin::Pin;
use std::sync::Arc;
use std::task::{Context, Poll};
use client::{Output, OutputData, OutputMeta};
use common_base::readable_size::ReadableSize;
use common_datasource::file_format::csv::CsvFormat;
use common_datasource::file_format::csv::{
CsvFormat, is_skippable_arrow_error, tolerant_csv_stream,
};
use common_datasource::file_format::json::JsonFormat;
use common_datasource::file_format::orc::{ReaderAdapter, infer_orc_schema, new_orc_stream_reader};
use common_datasource::file_format::{FileFormat, Format, file_to_stream};
@@ -33,10 +37,13 @@ use common_telemetry::{debug, tracing};
use datafusion::datasource::physical_plan::{CsvSource, FileSource, JsonSource};
use datafusion::parquet::arrow::ParquetRecordBatchStreamBuilder;
use datafusion::parquet::arrow::arrow_reader::ArrowReaderMetadata;
use datafusion_common::DataFusionError;
use datafusion_common::arrow::error::ArrowError;
use datafusion_common::config::CsvOptions;
use datafusion_expr::Expr;
use datatypes::arrow::compute::can_cast_types;
use datatypes::arrow::datatypes::{DataType as ArrowDataType, Schema, SchemaRef};
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::vectors::Helper;
use futures_util::StreamExt;
use object_store::{Entry, EntryMode, ObjectStore};
@@ -221,23 +228,42 @@ impl StatementExecutor {
let csv_source = CsvSource::new(schema.clone())
.with_csv_options(options)
.with_batch_size(DEFAULT_BATCH_SIZE);
let stream = file_to_stream(
object_store,
path,
csv_source,
Some(projection),
format.compression_type,
)
.await
.context(error::BuildFileStreamSnafu)?;
let stream = if format.skip_bad_records {
let reader_schema =
csv_reader_schema_for_skip_bad_records(schema, &compat_schema);
tolerant_csv_stream(
object_store,
path,
Arc::new(reader_schema),
projection.clone(),
format,
)
.await
.context(error::BuildFileStreamSnafu)?
} else {
file_to_stream(
object_store,
path,
csv_source,
Some(projection),
format.compression_type,
)
.await
.context(error::BuildFileStreamSnafu)?
};
Ok(Box::pin(
let stream = Box::pin(
// The projection is already applied in the CSV reader when we created the stream,
// so we pass None here to avoid double projection which would cause schema mismatch errors.
RecordBatchStreamTypeAdapter::new(output_schema, stream, None)
.with_filter(filters)
.context(error::PhysicalExprSnafu)?,
))
);
if format.skip_bad_records {
Ok(Box::pin(SkipBadRecordsStream::new(stream, path)))
} else {
Ok(stream)
}
}
FileMetadata::Json {
path,
@@ -469,6 +495,58 @@ fn gen_insert_output(rows_inserted: usize, insert_cost: usize) -> Output {
)
}
/// Record batch stream wrapper used when copying from a file with bad-record
/// skipping enabled.
///
/// It forwards everything produced by `inner`, except errors classified as
/// skippable, which are logged together with `path` and then dropped.
struct SkipBadRecordsStream {
    /// The wrapped stream that actually produces record batches.
    inner: DfSendableRecordBatchStream,
    /// Source path, only used in the warning emitted for each skipped error.
    path: String,
}

impl SkipBadRecordsStream {
    /// Wraps `inner`, remembering `path` for log messages.
    fn new(inner: DfSendableRecordBatchStream, path: impl Into<String>) -> Self {
        let path = path.into();
        Self { inner, path }
    }
}
// Skipping errors never alters the batches themselves, so the wrapper simply
// exposes the schema of the stream it wraps.
impl datafusion::physical_plan::RecordBatchStream for SkipBadRecordsStream {
/// Returns the schema reported by the inner stream.
fn schema(&self) -> SchemaRef {
self.inner.schema()
}
}
impl futures::Stream for SkipBadRecordsStream {
type Item = datafusion_common::Result<RecordBatch>;
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
let this = self.get_mut();
loop {
match this.inner.as_mut().poll_next(cx) {
Poll::Ready(Some(Err(error))) if is_skippable_record_error(&error) => {
common_telemetry::warn!(
"Skipping bad record while copying from {}: {}",
this.path,
error
);
continue;
}
other => return other,
}
}
}
}
/// Decides whether a DataFusion error stands for a bad record that may be skipped.
///
/// `Context` wrappers are peeled off first. The remaining error is skippable
/// only when it is an Arrow error (held directly or boxed as an external
/// error) that `is_skippable_arrow_error` accepts.
fn is_skippable_record_error(error: &DataFusionError) -> bool {
    let mut current = error;
    // Unwrap nested context layers until the underlying error is reached.
    while let DataFusionError::Context(_, wrapped) = current {
        current = &**wrapped;
    }

    match current {
        DataFusionError::ArrowError(arrow_error, _) => is_skippable_arrow_error(arrow_error),
        DataFusionError::External(boxed) => match boxed.downcast_ref::<ArrowError>() {
            Some(arrow_error) => is_skippable_arrow_error(arrow_error),
            None => false,
        },
        _ => false,
    }
}
/// Executes all pending inserts all at once, drain pending requests and reset pending bytes.
async fn batch_insert(
pending: &mut Vec<impl Future<Output = Result<Output>>>,
@@ -498,6 +576,59 @@ fn can_cast_types_for_greptime(from: &ArrowDataType, to: &ArrowDataType) -> bool
can_cast_types(from, to)
}
/// Builds the schema handed to the CSV reader when bad records are skipped.
///
/// The result has one field per field of `file`, in the same order, and
/// carries `file`'s schema metadata. For each file field, the field with the
/// same name in `compat` is used instead when one exists and its data type is
/// accepted by `can_csv_reader_parse_type`; otherwise the file field is kept
/// as is.
fn csv_reader_schema_for_skip_bad_records(file: &SchemaRef, compat: &SchemaRef) -> Schema {
    let fields = file
        .fields()
        .iter()
        .map(|file_field| {
            compat
                .fields()
                .find(file_field.name())
                .map(|(_, compat_field)| compat_field)
                // Only take the target type when the CSV reader can parse it directly.
                .filter(|compat_field| can_csv_reader_parse_type(compat_field.data_type()))
                // Fall back to the field already in hand instead of re-indexing `file`.
                .unwrap_or(file_field)
                .clone()
        })
        .collect::<Vec<_>>();

    Schema::new_with_metadata(fields, file.metadata().clone())
}
/// Returns whether `data_type` is on the list of types this module lets the
/// CSV reader parse directly.
///
/// Dictionary types qualify only when their value type is `Utf8`; every type
/// not listed below yields `false`.
fn can_csv_reader_parse_type(data_type: &ArrowDataType) -> bool {
    // Dictionaries depend on their value type, so settle them first.
    if let ArrowDataType::Dictionary(_, value_type) = data_type {
        return matches!(value_type.as_ref(), ArrowDataType::Utf8);
    }

    matches!(
        data_type,
        ArrowDataType::Boolean
            | ArrowDataType::Decimal32(_, _)
            | ArrowDataType::Decimal64(_, _)
            | ArrowDataType::Decimal128(_, _)
            | ArrowDataType::Decimal256(_, _)
            | ArrowDataType::Int8
            | ArrowDataType::Int16
            | ArrowDataType::Int32
            | ArrowDataType::Int64
            | ArrowDataType::UInt8
            | ArrowDataType::UInt16
            | ArrowDataType::UInt32
            | ArrowDataType::UInt64
            | ArrowDataType::Float32
            | ArrowDataType::Float64
            | ArrowDataType::Date32
            | ArrowDataType::Date64
            | ArrowDataType::Time32(_)
            | ArrowDataType::Time64(_)
            | ArrowDataType::Timestamp(_, _)
            | ArrowDataType::Null
            | ArrowDataType::Utf8
            | ArrowDataType::Utf8View
    )
}
fn ensure_schema_compatible(from: &SchemaRef, to: &SchemaRef) -> Result<()> {
let not_match = from
.fields
@@ -780,4 +911,31 @@ mod tests {
assert_eq!(test.0.project(&fp).unwrap(), test.1.project(&tp).unwrap());
}
}
#[test]
fn test_csv_reader_schema_for_skip_bad_records() {
    // Every file column is a string. The target table wants an integer, a
    // binary and a timestamp column: the integer and the timestamp are taken
    // over, while the binary column keeps the file's string type.
    let ts_type = DataType::Timestamp(datatypes::arrow::datatypes::TimeUnit::Millisecond, None);
    let from_file = make_test_schema(&[
        Field::new("id", DataType::Utf8, true),
        Field::new("jsons", DataType::Utf8, true),
        Field::new("ts", DataType::Utf8, true),
    ]);
    let target = make_test_schema(&[
        Field::new("id", DataType::UInt32, true),
        Field::new("jsons", DataType::Binary, true),
        Field::new("ts", ts_type.clone(), true),
    ]);

    let reader_schema = csv_reader_schema_for_skip_bad_records(&from_file, &target);

    let expected_types = [DataType::UInt32, DataType::Utf8, ts_type];
    for (idx, expected) in expected_types.iter().enumerate() {
        assert_eq!(reader_schema.field(idx).data_type(), expected);
    }
}
}

View File

@@ -233,6 +233,36 @@ transform:
parse(&Content::Yaml(pipeline_yaml)).unwrap()
}
/// Builds the pipeline exercised by the VRL processor benchmark.
///
/// The YAML declares a single `vrl` processor that copies `service` and `host`
/// into alias fields, deletes `unused` and sets `processed`, followed by a
/// transform typing the five resulting fields.
///
/// Panics if the YAML fails to parse.
fn prepare_vrl_pipeline() -> Pipeline {
let pipeline_yaml = r#"
---
description: Minimal VRL processor benchmark
processors:
- vrl:
source: |
.service_alias = .service
.host_alias = .host
del(.unused)
.processed = true
.
transform:
- field: service
type: string
- field: host
type: string
- field: service_alias
type: string
- field: host_alias
type: string
- field: processed
type: boolean
"#;
parse(&Content::Yaml(pipeline_yaml)).unwrap()
}
fn criterion_benchmark(c: &mut Criterion) {
let input_value_str = include_str!("./data.log");
let input_value = Deserializer::from_str(input_value_str)
@@ -262,6 +292,41 @@ fn criterion_benchmark(c: &mut Criterion) {
})
});
group.finish();
let vrl_input_value = (0..128)
.map(|i| {
serde_json::json!({
"service": "frontend",
"host": format!("host-{i}"),
"unused": "drop-me"
})
.into()
})
.collect::<Vec<VrlValue>>();
let vrl_pipeline = prepare_vrl_pipeline();
let (vrl_pipeline, mut vrl_schema_info, vrl_pipeline_def, vrl_pipeline_param) =
setup_pipeline!(vrl_pipeline);
let vrl_pipeline_ctx = PipelineContext::new(
&vrl_pipeline_def,
&vrl_pipeline_param,
session::context::Channel::Unknown,
);
let mut group = c.benchmark_group("vrl processor");
group.sample_size(50);
group.bench_function("processor mut", |b| {
b.iter(|| {
processor_mut(
black_box(vrl_pipeline.clone()),
black_box(&vrl_pipeline_ctx),
black_box(&mut vrl_schema_info),
black_box(vrl_input_value.clone()),
)
.unwrap();
})
});
group.finish();
}
// Testing the pipeline's performance in converting Json to Rows

View File

@@ -12,9 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::cell::RefCell;
use std::collections::BTreeMap;
use chrono_tz::Tz;
use once_cell::sync::Lazy;
use snafu::{OptionExt, ensure};
use vrl::compiler::runtime::Runtime;
use vrl::compiler::{Program, TargetValue, compile};
@@ -31,6 +33,12 @@ use crate::etl::processor::yaml_string;
pub(crate) const PROCESSOR_VRL: &str = "vrl";
const SOURCE: &str = "source";
/// UTC timezone passed to the VRL runtime when resolving a program; built
/// lazily on first access.
static UTC_TIMEZONE: Lazy<TimeZone> = Lazy::new(|| TimeZone::Named(Tz::UTC));
thread_local! {
/// VRL runtime owned by the current thread. The processor borrows it mutably
/// for each resolve call and clears it before and after use.
// NOTE(review): presumably shared per thread to avoid constructing a new
// `Runtime` on every call; confirm with the benchmark results.
static VRL_RUNTIME: RefCell<Runtime> = RefCell::new(Runtime::default());
}
#[derive(Debug)]
pub struct VrlProcessor {
source: String,
@@ -74,10 +82,14 @@ impl VrlProcessor {
secrets: Secrets::default(),
};
let timezone = TimeZone::Named(Tz::UTC);
let mut runtime = Runtime::default();
let re = runtime
.resolve(&mut target, &self.program, &timezone)
let re = VRL_RUNTIME
.with(|runtime| {
let mut runtime = runtime.borrow_mut();
runtime.clear();
let result = runtime.resolve(&mut target, &self.program, &UTC_TIMEZONE);
runtime.clear();
result
})
.map_err(|e| {
ExecuteVrlSnafu {
msg: e.get_expression_error().to_string(),

View File

@@ -14,25 +14,11 @@
use std::sync::Arc;
use axum::Router as HttpRouter;
use common_error::ext::BoxedError;
use tonic::transport::server::Router as GrpcRouter;
use crate::grpc::builder::GrpcServerBuilder;
/// A configurator that customizes or enhances an HTTP router.
#[async_trait::async_trait]
pub trait HttpConfigurator<C>: Send + Sync {
/// Configures the given HTTP router using the provided context.
async fn configure_http(
&self,
route: HttpRouter,
ctx: C,
) -> std::result::Result<HttpRouter, BoxedError>;
}
pub type HttpConfiguratorRef<C> = Arc<dyn HttpConfigurator<C>>;
/// A configurator that customizes or enhances a gRPC router.
#[async_trait::async_trait]
pub trait GrpcRouterConfigurator<C>: Send + Sync {

View File

@@ -24,7 +24,7 @@ pub mod prom_query_gateway;
pub mod region_server;
use std::any::Any;
use std::net::SocketAddr;
use std::net::{IpAddr, SocketAddr};
use std::time::Duration;
use api::v1::health_check_server::{HealthCheck, HealthCheckServer};
@@ -95,14 +95,8 @@ impl GrpcOptions {
if self.server_addr.is_empty() {
match local_ip_address::local_ip() {
Ok(ip) => {
let detected_addr = format!(
"{}:{}",
ip,
self.bind_addr
.split(':')
.nth(1)
.unwrap_or(DEFAULT_GRPC_ADDR_PORT)
);
let port = port_from_bind_addr(&self.bind_addr);
let detected_addr = format_server_addr(ip, port);
info!("Using detected: {} as server address", detected_addr);
self.server_addr = detected_addr;
}
@@ -131,7 +125,18 @@ impl GrpcOptions {
}
}
const DEFAULT_GRPC_ADDR_PORT: &str = "4001";
const DEFAULT_GRPC_ADDR_PORT: u16 = 4001;
fn port_from_bind_addr(bind_addr: &str) -> u16 {
bind_addr
.rsplit_once(':')
.and_then(|(_, port)| port.parse().ok())
.unwrap_or(DEFAULT_GRPC_ADDR_PORT)
}
/// Renders `ip` and `port` as a socket address string.
///
/// Formatting is delegated to `SocketAddr`, so IPv6 addresses come out
/// bracketed (for example `[::1]:3002`).
fn format_server_addr(ip: IpAddr, port: u16) -> String {
    let addr = SocketAddr::from((ip, port));
    addr.to_string()
}
const DEFAULT_INTERNAL_GRPC_ADDR_PORT: &str = "4010";
@@ -415,3 +420,36 @@ impl Server for GrpcServer {
self
}
}
#[cfg(test)]
mod tests {
    use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};

    use super::{DEFAULT_GRPC_ADDR_PORT, format_server_addr, port_from_bind_addr};

    /// The port is read from IPv4, bracketed IPv6 and hostname bind addresses,
    /// and the default is used when no port can be found.
    #[test]
    fn test_port_from_bind_addr() {
        let cases = [
            ("127.0.0.1:3002", 3002),
            ("[::]:3002", 3002),
            ("greptimedb-metasrv.default.svc.cluster.local:3002", 3002),
            ("invalid-bind-addr", DEFAULT_GRPC_ADDR_PORT),
        ];
        for (bind_addr, expected_port) in cases {
            assert_eq!(expected_port, port_from_bind_addr(bind_addr));
        }
    }

    /// IPv4 addresses are printed plainly, IPv6 addresses in brackets.
    #[test]
    fn test_format_server_addr() {
        let cases = [
            ("127.0.0.1:3002", IpAddr::V4(Ipv4Addr::LOCALHOST)),
            ("[::1]:3002", IpAddr::V6(Ipv6Addr::LOCALHOST)),
        ];
        for (expected, ip) in cases {
            assert_eq!(expected, format_server_addr(ip, 3002));
        }
    }
}

View File

@@ -27,7 +27,6 @@ use axum::response::{IntoResponse, Response};
use axum::routing::Route;
use axum::serve::ListenerExt;
use axum::{Router, middleware, routing};
use common_base::Plugins;
use common_base::readable_size::ReadableSize;
use common_recordbatch::RecordBatch;
use common_telemetry::{error, info};
@@ -52,11 +51,9 @@ use tower_http::trace::TraceLayer;
use self::authorize::AuthState;
use self::result::table_result::TableResponse;
use crate::configurator::HttpConfiguratorRef;
use crate::elasticsearch;
use crate::error::{
AddressBindSnafu, AlreadyStartedSnafu, Error, InternalIoSnafu, InvalidHeaderValueSnafu,
OtherSnafu, Result,
AddressBindSnafu, AlreadyStartedSnafu, Error, InternalIoSnafu, InvalidHeaderValueSnafu, Result,
};
use crate::http::influxdb::{influxdb_health, influxdb_ping, influxdb_write_v1, influxdb_write_v2};
use crate::http::otlp::OtlpState;
@@ -139,9 +136,6 @@ pub struct HttpServer {
user_provider: Option<UserProviderRef>,
memory_limiter: ServerMemoryLimiter,
// plugins
plugins: Plugins,
// server configs
options: HttpOptions,
bind_addr: Option<SocketAddr>,
@@ -516,7 +510,6 @@ pub struct DashboardState {
pub struct HttpServerBuilder {
options: HttpOptions,
plugins: Plugins,
user_provider: Option<UserProviderRef>,
router: Router,
memory_limiter: ServerMemoryLimiter,
@@ -526,7 +519,6 @@ impl HttpServerBuilder {
pub fn new(options: HttpOptions) -> Self {
Self {
options,
plugins: Plugins::default(),
user_provider: None,
router: Router::new(),
memory_limiter: ServerMemoryLimiter::default(),
@@ -687,10 +679,6 @@ impl HttpServerBuilder {
Self { router, ..self }
}
pub fn with_plugins(self, plugins: Plugins) -> Self {
Self { plugins, ..self }
}
pub fn with_greptime_config_options(self, opts: String) -> Self {
let config_router = HttpServer::route_config(GreptimeOptionsConfigState {
greptime_config_options: opts,
@@ -748,7 +736,6 @@ impl HttpServerBuilder {
options: self.options,
user_provider: self.user_provider,
shutdown_tx: Mutex::new(None),
plugins: self.plugins,
router: StdMutex::new(self.router),
bind_addr: None,
memory_limiter: self.memory_limiter,
@@ -1237,14 +1224,7 @@ impl Server for HttpServer {
AlreadyStartedSnafu { server: "HTTP" }
);
let mut app = self.make_app();
if let Some(configurator) = self.plugins.get::<HttpConfiguratorRef<()>>() {
app = configurator
.configure_http(app, ())
.await
.context(OtherSnafu)?;
}
let app = self.build(app)?;
let app = self.build(self.make_app())?;
let listener = tokio::net::TcpListener::bind(listening)
.await
.context(AddressBindSnafu { addr: listening })?

View File

@@ -401,6 +401,28 @@ mod tests {
}
}
#[test]
fn test_parse_copy_table_from_csv_options() {
    // CSV options given in the WITH clause must be readable, under lowercase
    // keys, from the parsed `COPY ... FROM` statement.
    let sql =
        "COPY my_table FROM '/tmp/test.csv' WITH (FORMAT = 'CSV', SKIP_BAD_RECORDS = 'false')";
    let mut statements =
        ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
            .unwrap();
    assert_eq!(1, statements.len());

    let parsed = statements.remove(0);
    assert_matches!(parsed, Statement::Copy { .. });
    let Statement::Copy(crate::statements::copy::Copy::CopyTable(CopyTable::From(copy_table))) =
        parsed
    else {
        unreachable!()
    };

    for (key, expected) in [("format", "CSV"), ("skip_bad_records", "false")] {
        assert_eq!(copy_table.with.get(key), Some(expected));
    }
}
#[test]
fn test_parse_copy_table_to() {
struct Test<'a> {

View File

@@ -27,7 +27,7 @@ use serde::Serialize;
use snafu::ensure;
use sqlparser::ast::{
Array, Expr, Ident, ObjectName, ObjectNamePart, SetExpr, SqlOption, StructField, TableFactor,
Value, ValueWithSpan,
TableWithJoins, Value, ValueWithSpan,
};
use sqlparser_derive::{Visit, VisitMut};
@@ -195,7 +195,7 @@ pub fn extract_tables_from_query(query: &SqlOrTql) -> impl Iterator<Item = Objec
match query {
SqlOrTql::Sql(query, _) => {
extract_tables_from_set_expr(&query.inner.body, &mut names);
extract_tables_from_sql_query(&query.inner, &mut names);
extract_tables_from_hybrid_cte_query(query, &mut names);
}
SqlOrTql::Tql(tql, _) => extract_tables_from_tql(tql, &mut names),
@@ -205,26 +205,34 @@ pub fn extract_tables_from_query(query: &SqlOrTql) -> impl Iterator<Item = Objec
}
fn extract_tables_from_hybrid_cte_query(query: &Query, sql_names: &mut HashSet<ObjectName>) {
let mut tql_names = HashSet::new();
let mut cte_names: HashSet<String> = HashSet::new();
if let Some(hybrid_cte) = &query.hybrid_cte {
let mut cte_names: HashSet<String> = hybrid_cte
.cte_tables
.iter()
.map(|cte| ParserContext::canonicalize_identifier(cte.name.clone()).value)
.collect();
remove_cte_names(sql_names, &cte_names);
cte_names.clear();
for cte in &hybrid_cte.cte_tables {
cte_names.insert(ParserContext::canonicalize_identifier(cte.name.clone()).value);
if let CteContent::Tql(tql) = &cte.content {
extract_tables_from_tql(tql, &mut tql_names);
let cte_name = ParserContext::canonicalize_identifier(cte.name.clone()).value;
let mut cte_query_names = HashSet::new();
match &cte.content {
CteContent::Sql(cte_query) => {
extract_tables_from_sql_query(cte_query, &mut cte_query_names)
}
CteContent::Tql(tql) => extract_tables_from_tql(tql, &mut cte_query_names),
}
if hybrid_cte.recursive {
cte_names.insert(cte_name.clone());
}
remove_cte_names(&mut cte_query_names, &cte_names);
sql_names.extend(cte_query_names);
if !hybrid_cte.recursive {
cte_names.insert(cte_name);
}
}
}
if let Some(with) = &query.inner.with {
for cte in &with.cte_tables {
cte_names.insert(ParserContext::canonicalize_identifier(cte.alias.name.clone()).value);
}
}
remove_cte_names(sql_names, &cte_names);
sql_names.extend(tql_names);
}
fn remove_cte_names(names: &mut HashSet<ObjectName>, cte_names: &HashSet<String>) {
@@ -339,6 +347,33 @@ pub fn location_to_index(sql: &str, location: &sqlparser::tokenizer::Location) -
index - 1
}
/// Helper function for [extract_tables_from_query].
///
/// Collects into `names` the tables referenced by a [sqlparser::ast::Query],
/// leaving out references that resolve to the query's own CTEs.
///
/// CTEs are visited in declaration order. A CTE body only has the previously
/// declared CTEs hidden from its table list; with `WITH RECURSIVE` the CTE's
/// own name is hidden as well. The query body has every CTE name hidden.
fn extract_tables_from_sql_query(query: &sqlparser::ast::Query, names: &mut HashSet<ObjectName>) {
    let mut visible_ctes = HashSet::new();

    if let Some(with) = &query.with {
        for cte in &with.cte_tables {
            let alias = ParserContext::canonicalize_identifier(cte.alias.name.clone()).value;
            let mut referenced = HashSet::new();
            extract_tables_from_sql_query(&cte.query, &mut referenced);

            if with.recursive {
                // A recursive CTE may refer to itself, so hide its name first.
                visible_ctes.insert(alias);
                remove_cte_names(&mut referenced, &visible_ctes);
            } else {
                // A plain CTE only sees the CTEs declared before it.
                remove_cte_names(&mut referenced, &visible_ctes);
                visible_ctes.insert(alias);
            }
            names.extend(referenced);
        }
    }

    let mut from_body = HashSet::new();
    extract_tables_from_set_expr(&query.body, &mut from_body);
    remove_cte_names(&mut from_body, &visible_ctes);
    names.extend(from_body);
}
/// Helper function for [extract_tables_from_query].
///
/// Handle [SetExpr].
@@ -346,14 +381,11 @@ fn extract_tables_from_set_expr(set_expr: &SetExpr, names: &mut HashSet<ObjectNa
match set_expr {
SetExpr::Select(select) => {
for from in &select.from {
table_factor_to_object_name(&from.relation, names);
for join in &from.joins {
table_factor_to_object_name(&join.relation, names);
}
extract_tables_from_table_with_joins(from, names);
}
}
SetExpr::Query(query) => {
extract_tables_from_set_expr(&query.body, names);
extract_tables_from_sql_query(query, names);
}
SetExpr::SetOperation { left, right, .. } => {
extract_tables_from_set_expr(left, names);
@@ -363,12 +395,47 @@ fn extract_tables_from_set_expr(set_expr: &SetExpr, names: &mut HashSet<ObjectNa
};
}
/// Helper function for [extract_tables_from_query].
///
/// Handle [TableWithJoins]: visits the leading relation first, then the
/// relation of each join, in order.
fn extract_tables_from_table_with_joins(
    table_with_joins: &TableWithJoins,
    names: &mut HashSet<ObjectName>,
) {
    let joined = table_with_joins.joins.iter().map(|join| &join.relation);
    std::iter::once(&table_with_joins.relation)
        .chain(joined)
        .for_each(|factor| table_factor_to_object_name(factor, names));
}
/// Helper function for [extract_tables_from_query].
///
/// Handle [TableFactor].
fn table_factor_to_object_name(table_factor: &TableFactor, names: &mut HashSet<ObjectName>) {
if let TableFactor::Table { name, .. } = table_factor {
names.insert(name.to_owned());
match table_factor {
TableFactor::Table { name, .. } => {
names.insert(name.to_owned());
}
TableFactor::Derived { subquery, .. } => {
extract_tables_from_sql_query(subquery, names);
}
TableFactor::NestedJoin {
table_with_joins, ..
} => {
extract_tables_from_table_with_joins(table_with_joins, names);
}
TableFactor::Pivot { table, .. }
| TableFactor::Unpivot { table, .. }
| TableFactor::MatchRecognize { table, .. } => {
table_factor_to_object_name(table, names);
}
TableFactor::TableFunction { .. }
| TableFactor::Function { .. }
| TableFactor::UNNEST { .. }
| TableFactor::JsonTable { .. }
| TableFactor::OpenJsonTable { .. }
| TableFactor::XmlTable { .. }
| TableFactor::SemanticView { .. } => {}
}
}
@@ -458,6 +525,91 @@ TQL EVAL (now() - '15s'::interval, now(), '5s') count_values("status_code", {__n
}
}
#[test]
fn test_extract_tables_from_sql_query_with_derived_join() {
    // Both sides of the join are derived tables; the tables they read from
    // must be reported as the sources of the flow.
    let sql = r#"
CREATE FLOW flow_batch_join_subquery SINK TO flow_batch_join_sink
EVAL INTERVAL '1m' AS
SELECT a.symbol, b.mark_price
FROM (
SELECT inst_id AS symbol, max(ts) AS mark_iv_ts
FROM flow_batch_join_opt_summary
GROUP BY inst_id
) a
LEFT JOIN (
SELECT symbol, max(mark_price) AS mark_price
FROM flow_batch_join_market_v5
WHERE "type" = 'OPTION_MARK'
GROUP BY symbol
) b ON a.symbol = b.symbol;
"#;
    let mut parsed =
        ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
            .unwrap();
    let Statement::CreateFlow(flow) = parsed.pop().unwrap() else {
        unreachable!()
    };

    let source_tables = extract_tables_from_query(&flow.query)
        .map(|table| format_raw_object_name(&table))
        .sorted()
        .collect_vec();

    let expected = vec![
        "flow_batch_join_market_v5".to_string(),
        "flow_batch_join_opt_summary".to_string(),
    ];
    assert_eq!(expected, source_tables);
}
#[test]
fn test_extract_tables_from_sql_query_with_cte_scopes() {
    // A CTE that selects from a table sharing its own name still reads the
    // physical table.
    let shadowing_cte = r#"
WITH source AS (
SELECT * FROM source
)
SELECT * FROM source;
"#;
    // A later CTE reading an earlier one adds no table of its own.
    let chained_ctes = r#"
WITH first_cte AS (
SELECT * FROM physical_source
), second_cte AS (
SELECT * FROM first_cte
)
SELECT * FROM second_cte;
"#;
    let testcases = [
        (shadowing_cte, vec!["source".to_string()]),
        (chained_ctes, vec!["physical_source".to_string()]),
    ];

    for (sql, expected_tables) in testcases {
        let mut parsed = ParserContext::create_with_dialect(
            sql,
            &GreptimeDbDialect {},
            ParseOptions::default(),
        )
        .unwrap();
        let Statement::Query(query) = parsed.pop().unwrap() else {
            unreachable!()
        };

        let mut referenced = HashSet::new();
        extract_tables_from_sql_query(&query.inner, &mut referenced);
        let actual_tables = referenced
            .iter()
            .map(format_raw_object_name)
            .sorted()
            .collect_vec();

        assert_eq!(expected_tables, actual_tables);
    }
}
#[test]
fn test_extract_tables_from_tql_query_with_schema_matcher() {
let sql = r#"

View File

@@ -315,6 +315,7 @@ fn make_region_open(open: OpenRequest) -> Result<Vec<(RegionId, RegionRequest)>>
options: open.options,
skip_wal_replay: false,
checkpoint: None,
requirements: Default::default(),
}),
)])
}
@@ -566,6 +567,28 @@ pub struct RegionDropRequest {
pub partial_drop: bool,
}
/// Requirements for a region request.
///
/// Every requirement is off in the `Default` value. Because of
/// `#[serde(default)]`, fields missing from serialized input take their
/// default when deserializing.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(default)]
pub struct RegionRequirements {
/// Whether the region data must be backed by object storage.
pub object_storage: bool,
}
impl RegionRequirements {
/// Returns empty requirements, i.e. the `Default` value with nothing required.
pub fn empty() -> Self {
Self::default()
}
/// Returns requirements that demand object storage (`object_storage` set to `true`).
pub fn object_storage() -> Self {
Self {
object_storage: true,
}
}
}
/// Open region request.
#[derive(Debug, Clone)]
pub struct RegionOpenRequest {
@@ -581,6 +604,8 @@ pub struct RegionOpenRequest {
pub skip_wal_replay: bool,
/// Replay checkpoint.
pub checkpoint: Option<ReplayCheckpoint>,
/// Requirements for opening the region.
pub requirements: RegionRequirements,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]

View File

@@ -63,7 +63,7 @@ log-query = { workspace = true }
loki-proto.workspace = true
meta-client.workspace = true
meta-srv = { workspace = true, features = ["mock"] }
mito2.workspace = true
mito2 = { workspace = true, features = ["test-shared-fs-region-migration"] }
object-store.workspace = true
operator = { workspace = true, features = ["testing"] }
plugins.workspace = true

View File

@@ -183,6 +183,24 @@ select * from csv_null_prefix_import;
| final | 2023-11-14T22:13:23 |
+-------+---------------------+
CREATE TABLE csv_skip_bad_records(host_id int, host_name string, reading_value double, ts timestamp time index);
Affected Rows: 0
-- SQLNESS ENV PWD
Copy csv_skip_bad_records FROM '$PWD/tests/data/csv/skip_bad_records.csv' WITH (format='csv', skip_bad_records='true');
Affected Rows: 2
select * from csv_skip_bad_records order by ts;
+---------+-----------+---------------+---------------------+
| host_id | host_name | reading_value | ts |
+---------+-----------+---------------+---------------------+
| 1 | Alice | 10.5 | 2024-01-01T00:00:00 |
| 2 | Bob | 30.5 | 2024-01-01T00:00:02 |
+---------+-----------+---------------+---------------------+
drop table demo;
Affected Rows: 0
@@ -219,3 +237,7 @@ drop table csv_null_prefix_import;
Affected Rows: 0
drop table csv_skip_bad_records;
Affected Rows: 0

View File

@@ -73,6 +73,13 @@ Copy csv_null_prefix_import FROM '${SQLNESS_HOME}/demo/export/csv_null_prefix.cs
select * from csv_null_prefix_import;
CREATE TABLE csv_skip_bad_records(host_id int, host_name string, reading_value double, ts timestamp time index);
-- SQLNESS ENV PWD
Copy csv_skip_bad_records FROM '$PWD/tests/data/csv/skip_bad_records.csv' WITH (format='csv', skip_bad_records='true');
select * from csv_skip_bad_records order by ts;
drop table demo;
drop table with_filename;
@@ -90,3 +97,5 @@ drop table demo_with_less_columns;
drop table csv_null_prefix;
drop table csv_null_prefix_import;
drop table csv_skip_bad_records;

View File

@@ -0,0 +1,130 @@
CREATE DATABASE flow_join_fixture;
Affected Rows: 1
CREATE TABLE flow_join_fixture."left_samples" (
source_id STRING,
left_value DOUBLE,
event_ts TIMESTAMP,
observed_at TIMESTAMP TIME INDEX
);
Affected Rows: 0
CREATE TABLE flow_join_fixture."right_samples" (
source_id STRING,
right_value DOUBLE,
sample_kind STRING,
event_ts TIMESTAMP,
observed_at TIMESTAMP TIME INDEX
);
Affected Rows: 0
-- Verify batching flow creation accepts aggregate subqueries joined by LEFT JOIN.
CREATE FLOW flow_batch_join_subquery SINK TO flow_batch_join_sink
EVAL INTERVAL '5m' AS
SELECT
l.source_id,
l.measure_name,
l.bucket_time,
l.left_event_ts,
l.left_value,
r.right_event_ts,
r.right_value
FROM (
SELECT
source_id,
'sample' AS measure_name,
date_trunc('minute', now()) AS bucket_time,
max(event_ts) AS left_event_ts,
last_value(left_value ORDER BY observed_at) AS left_value
FROM
flow_join_fixture."left_samples"
WHERE
observed_at BETWEEN date_trunc('minute', now()) - INTERVAL '5 minutes'
AND date_trunc('minute', now())
GROUP BY
source_id
) l
LEFT JOIN (
SELECT
source_id,
'sample' AS measure_name,
date_trunc('minute', now()) AS bucket_time,
max(event_ts) AS right_event_ts,
last_value(right_value ORDER BY observed_at) AS right_value
FROM
flow_join_fixture."right_samples"
WHERE
observed_at BETWEEN date_trunc('minute', now()) - INTERVAL '5 minutes'
AND date_trunc('minute', now())
AND sample_kind = 'primary'
GROUP BY
source_id
) r ON l.source_id = r.source_id AND l.bucket_time = r.bucket_time;
Affected Rows: 0
SELECT
source_table_names LIKE '%left_samples%' AS has_left_source,
source_table_names LIKE '%right_samples%' AS has_right_source,
options LIKE '%"flow_type":"batching"%' AS is_batching_flow
FROM
INFORMATION_SCHEMA.FLOWS
WHERE
flow_name = 'flow_batch_join_subquery';
+-----------------+------------------+------------------+
| has_left_source | has_right_source | is_batching_flow |
+-----------------+------------------+------------------+
| true | true | true |
+-----------------+------------------+------------------+
INSERT INTO flow_join_fixture."left_samples" VALUES
('source-a', 0.12, date_trunc('minute', now()), date_trunc('minute', now()));
Affected Rows: 1
INSERT INTO flow_join_fixture."right_samples" VALUES
('source-a', 100.5, 'primary', date_trunc('minute', now()), date_trunc('minute', now()));
Affected Rows: 1
-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED |
ADMIN FLUSH_FLOW('flow_batch_join_subquery');
+----------------------------------------------+
| ADMIN FLUSH_FLOW('flow_batch_join_subquery') |
+----------------------------------------------+
| FLOW_FLUSHED |
+----------------------------------------------+
SELECT source_id, measure_name, left_value, right_value FROM flow_batch_join_sink ORDER BY source_id;
+-----------+--------------+------------+-------------+
| source_id | measure_name | left_value | right_value |
+-----------+--------------+------------+-------------+
| source-a | sample | 0.12 | 100.5 |
+-----------+--------------+------------+-------------+
DROP FLOW flow_batch_join_subquery;
Affected Rows: 0
DROP TABLE flow_batch_join_sink;
Affected Rows: 0
DROP TABLE flow_join_fixture."left_samples";
Affected Rows: 0
DROP TABLE flow_join_fixture."right_samples";
Affected Rows: 0
DROP DATABASE flow_join_fixture;
Affected Rows: 0

View File

@@ -0,0 +1,85 @@
CREATE DATABASE flow_join_fixture;
CREATE TABLE flow_join_fixture."left_samples" (
source_id STRING,
left_value DOUBLE,
event_ts TIMESTAMP,
observed_at TIMESTAMP TIME INDEX
);
CREATE TABLE flow_join_fixture."right_samples" (
source_id STRING,
right_value DOUBLE,
sample_kind STRING,
event_ts TIMESTAMP,
observed_at TIMESTAMP TIME INDEX
);
-- Verify batching flow creation accepts aggregate subqueries joined by LEFT JOIN.
CREATE FLOW flow_batch_join_subquery SINK TO flow_batch_join_sink
EVAL INTERVAL '5m' AS
SELECT
l.source_id,
l.measure_name,
l.bucket_time,
l.left_event_ts,
l.left_value,
r.right_event_ts,
r.right_value
FROM (
SELECT
source_id,
'sample' AS measure_name,
date_trunc('minute', now()) AS bucket_time,
max(event_ts) AS left_event_ts,
last_value(left_value ORDER BY observed_at) AS left_value
FROM
flow_join_fixture."left_samples"
WHERE
observed_at BETWEEN date_trunc('minute', now()) - INTERVAL '5 minutes'
AND date_trunc('minute', now())
GROUP BY
source_id
) l
LEFT JOIN (
SELECT
source_id,
'sample' AS measure_name,
date_trunc('minute', now()) AS bucket_time,
max(event_ts) AS right_event_ts,
last_value(right_value ORDER BY observed_at) AS right_value
FROM
flow_join_fixture."right_samples"
WHERE
observed_at BETWEEN date_trunc('minute', now()) - INTERVAL '5 minutes'
AND date_trunc('minute', now())
AND sample_kind = 'primary'
GROUP BY
source_id
) r ON l.source_id = r.source_id AND l.bucket_time = r.bucket_time;
SELECT
source_table_names LIKE '%left_samples%' AS has_left_source,
source_table_names LIKE '%right_samples%' AS has_right_source,
options LIKE '%"flow_type":"batching"%' AS is_batching_flow
FROM
INFORMATION_SCHEMA.FLOWS
WHERE
flow_name = 'flow_batch_join_subquery';
INSERT INTO flow_join_fixture."left_samples" VALUES
('source-a', 0.12, date_trunc('minute', now()), date_trunc('minute', now()));
INSERT INTO flow_join_fixture."right_samples" VALUES
('source-a', 100.5, 'primary', date_trunc('minute', now()), date_trunc('minute', now()));
-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED |
ADMIN FLUSH_FLOW('flow_batch_join_subquery');
SELECT source_id, measure_name, left_value, right_value FROM flow_batch_join_sink ORDER BY source_id;
DROP FLOW flow_batch_join_subquery;
DROP TABLE flow_batch_join_sink;
DROP TABLE flow_join_fixture."left_samples";
DROP TABLE flow_join_fixture."right_samples";
DROP DATABASE flow_join_fixture;

View File

@@ -162,6 +162,8 @@ CREATE TABLE approx_rate (
Affected Rows: 0
-- Without merge_mode=last_non_null, this partial output is rejected at CREATE FLOW time.
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
CREATE FLOW find_approx_rate SINK TO approx_rate AS
SELECT
(max(byte) - min(byte)) / 30.0 as rate,
@@ -172,24 +174,7 @@ from
GROUP BY
time_window;
Affected Rows: 0
INSERT INTO
bytes_log
VALUES
(NULL, '2023-01-01 00:00:01'),
(300, '2023-01-01 00:00:31');
Affected Rows: 2
-- should return error
ADMIN FLUSH_FLOW('find_approx_rate');
Error: 1002(Unexpected), Failed to execute admin function flush_flow: Execution error: Internal error: 1003
DROP FLOW find_approx_rate;
Affected Rows: 0
Error: 3001(EngineExecuteQuery), Datafusion error: Plan("Flow output schema does not match sink table schema: found 3 flow output columns and 4 sink table columns. flow output columns: [\"rate\", \"time_window\", \"update_at\"], sink table columns: [\"rate\", \"time_window\", \"update_at\", \"bb\"], extra flow columns not in sink: [], missing sink columns from flow output: [\"bb\"]") in context: Failed to rewrite plan
DROP TABLE bytes_log;

View File

@@ -84,6 +84,8 @@ CREATE TABLE approx_rate (
TIME INDEX(time_window)
);
-- Without merge_mode=last_non_null, this partial output is rejected at CREATE FLOW time.
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
CREATE FLOW find_approx_rate SINK TO approx_rate AS
SELECT
(max(byte) - min(byte)) / 30.0 as rate,
@@ -93,16 +95,5 @@ from
bytes_log
GROUP BY
time_window;
INSERT INTO
bytes_log
VALUES
(NULL, '2023-01-01 00:00:01'),
(300, '2023-01-01 00:00:31');
-- should return error
ADMIN FLUSH_FLOW('find_approx_rate');
DROP FLOW find_approx_rate;
DROP TABLE bytes_log;
DROP TABLE approx_rate;

View File

@@ -0,0 +1,123 @@
-- Verify that batching flow rejects CREATE FLOW when the pre-existing sink
-- table schema does not match the flow output (create-time validation, not runtime).
CREATE TABLE source_mm (
"number" INT,
extra STRING,
ts TIMESTAMP TIME INDEX
);
Affected Rows: 0
-- Pre-create a sink table that is intentionally missing the "extra" column.
-- This case validates batching mode at CREATE FLOW time, before any INSERT/FLUSH.
CREATE TABLE sink_mm (
"number" INT,
time_window TIMESTAMP TIME INDEX,
cnt BIGINT
);
Affected Rows: 0
-- This CREATE FLOW should fail immediately: the flow outputs (number, extra, time_window, cnt)
-- but sink_mm has only (number, time_window, cnt).
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
CREATE FLOW mismatch_flow SINK TO sink_mm AS
SELECT
"number",
extra,
date_bin(INTERVAL '1 second', ts) as time_window,
count(*) as cnt
FROM
source_mm
GROUP BY
"number", extra, time_window;
Error: 3001(EngineExecuteQuery), Datafusion error: Plan("Flow output schema does not match sink table schema: found 4 flow output columns and 3 sink table columns. flow output columns: [\"number\", \"extra\", \"time_window\", \"cnt\"], sink table columns: [\"number\", \"time_window\", \"cnt\"], extra flow columns not in sink: [\"extra\"], missing sink columns from flow output: []") in context: Failed to rewrite plan
DROP TABLE source_mm;
Affected Rows: 0
DROP TABLE sink_mm;
Affected Rows: 0
-- TQL/PromQL flows use the same create-time sink schema validation path.
CREATE TABLE tql_source_mm (
`value` DOUBLE,
ts TIMESTAMP TIME INDEX,
sensor STRING,
loc STRING,
PRIMARY KEY (sensor, loc)
);
Affected Rows: 0
-- Pre-create a TQL sink table that is intentionally missing the "sensor" tag column.
CREATE TABLE tql_sink_mm (
`value` DOUBLE,
ts TIMESTAMP TIME INDEX
);
Affected Rows: 0
-- This CREATE FLOW should fail immediately: the TQL output has (value, sensor, ts),
-- but tql_sink_mm has only (value, ts).
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
CREATE FLOW tql_mismatch_flow
SINK TO tql_sink_mm
EVAL INTERVAL '1m' AS
TQL EVAL (now() - '1m'::interval, now(), '1m')
avg by(sensor) (tql_source_mm) AS value;
Error: 3001(EngineExecuteQuery), Datafusion error: Plan("Flow output schema does not match sink table schema: found 3 flow output columns and 2 sink table columns. flow output columns: [\"value\", \"sensor\", \"ts\"], sink table columns: [\"value\", \"ts\"], extra flow columns not in sink: [\"sensor\"], missing sink columns from flow output: []") in context: Failed to rewrite plan
DROP TABLE tql_source_mm;
Affected Rows: 0
DROP TABLE tql_sink_mm;
Affected Rows: 0
-- Real merge_mode=last_non_null sink options should enable partial schema validation.
CREATE TABLE lnn_source_mm (
device STRING,
val DOUBLE,
ts TIMESTAMP TIME INDEX
);
Affected Rows: 0
CREATE TABLE lnn_sink_mm (
device STRING,
time_window TIMESTAMP TIME INDEX,
cnt BIGINT,
PRIMARY KEY (device)
) WITH('merge_mode'='last_non_null');
Affected Rows: 0
-- This CREATE FLOW should fail through the last_non_null partial validator: the
-- sink primary key "device" is required but absent from the flow output.
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
CREATE FLOW lnn_missing_pk_flow
SINK TO lnn_sink_mm AS
SELECT
date_bin(INTERVAL '1 second', ts) as time_window,
count(*) as cnt
FROM
lnn_source_mm
GROUP BY
time_window;
Error: 3001(EngineExecuteQuery), Datafusion error: Plan("Column(s) [\"device\"] required by sink table are missing from flow output when merge_mode=last_non_null. Flow output schema does not match sink table schema: found 2 flow output columns and 3 sink table columns. flow output columns: [\"time_window\", \"cnt\"], sink table columns: [\"device\", \"time_window\", \"cnt\"], extra flow columns not in sink: [], missing sink columns from flow output: [\"device\"]") in context: Failed to rewrite plan
DROP TABLE lnn_source_mm;
Affected Rows: 0
DROP TABLE lnn_sink_mm;
Affected Rows: 0

View File

@@ -0,0 +1,89 @@
-- Verify that batching flow rejects CREATE FLOW when the pre-existing sink
-- table schema does not match the flow output (create-time validation, not runtime).
CREATE TABLE source_mm (
"number" INT,
extra STRING,
ts TIMESTAMP TIME INDEX
);
-- Pre-create a sink table that is intentionally missing the "extra" column.
-- This case validates batching mode at CREATE FLOW time, before any INSERT/FLUSH.
CREATE TABLE sink_mm (
"number" INT,
time_window TIMESTAMP TIME INDEX,
cnt BIGINT
);
-- This CREATE FLOW should fail immediately: the flow outputs (number, extra, time_window, cnt)
-- but sink_mm has only (number, time_window, cnt).
-- The REPLACE directive below shortens any output text starting at
-- "in context: Failed to rewrite plan: Error during planning:" down to
-- "in context: Failed to rewrite plan".
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
CREATE FLOW mismatch_flow SINK TO sink_mm AS
SELECT
"number",
extra,
date_bin(INTERVAL '1 second', ts) as time_window,
count(*) as cnt
FROM
source_mm
GROUP BY
"number", extra, time_window;
-- Only the tables are dropped: the CREATE FLOW above is expected to fail, so
-- no flow should exist at this point.
DROP TABLE source_mm;
DROP TABLE sink_mm;
-- TQL/PromQL flows use the same create-time sink schema validation path.
-- The source table has two tag columns (sensor, loc); the flow below
-- aggregates by sensor only.
CREATE TABLE tql_source_mm (
`value` DOUBLE,
ts TIMESTAMP TIME INDEX,
sensor STRING,
loc STRING,
PRIMARY KEY (sensor, loc)
);
-- Pre-create a TQL sink table that is intentionally missing the "sensor" tag column.
CREATE TABLE tql_sink_mm (
`value` DOUBLE,
ts TIMESTAMP TIME INDEX
);
-- This CREATE FLOW should fail immediately: the TQL output has (value, sensor, ts),
-- but tql_sink_mm has only (value, ts).
-- The flow is declared with EVAL INTERVAL '1m'; its TQL EVAL range runs from
-- now() - 1 minute to now() with a '1m' step, and the result column is
-- aliased to value.
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
CREATE FLOW tql_mismatch_flow
SINK TO tql_sink_mm
EVAL INTERVAL '1m' AS
TQL EVAL (now() - '1m'::interval, now(), '1m')
avg by(sensor) (tql_source_mm) AS value;
-- Only the tables are dropped: the CREATE FLOW above is expected to fail.
DROP TABLE tql_source_mm;
DROP TABLE tql_sink_mm;
-- Real merge_mode=last_non_null sink options should enable partial schema validation.
-- The flow below reads only ts from this table; device and val are not
-- referenced by its query.
CREATE TABLE lnn_source_mm (
device STRING,
val DOUBLE,
ts TIMESTAMP TIME INDEX
);
-- Sink with device as its primary key and merge_mode set to last_non_null.
CREATE TABLE lnn_sink_mm (
device STRING,
time_window TIMESTAMP TIME INDEX,
cnt BIGINT,
PRIMARY KEY (device)
) WITH('merge_mode'='last_non_null');
-- This CREATE FLOW should fail through the last_non_null partial validator: the
-- sink primary key "device" is required but absent from the flow output.
-- The query selects only time_window and cnt and groups by time_window alone.
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
CREATE FLOW lnn_missing_pk_flow
SINK TO lnn_sink_mm AS
SELECT
date_bin(INTERVAL '1 second', ts) as time_window,
count(*) as cnt
FROM
lnn_source_mm
GROUP BY
time_window;
-- Only the tables are dropped: the CREATE FLOW above is expected to fail.
DROP TABLE lnn_source_mm;
DROP TABLE lnn_sink_mm;

View File

@@ -0,0 +1,90 @@
-- Regression for a TQL flow whose pre-created sink table is missing the value
-- output column. The labels are intentionally minimal and anonymous.
CREATE DATABASE source_schema;
Affected Rows: 1
CREATE DATABASE sink_schema;
Affected Rows: 1
USE source_schema;
Affected Rows: 0
CREATE TABLE metric_input (
namespace STRING NULL,
app STRING NULL,
greptime_timestamp TIMESTAMP(3) NOT NULL,
greptime_value DOUBLE NULL,
TIME INDEX (greptime_timestamp),
PRIMARY KEY (namespace, app)
);
Affected Rows: 0
INSERT INTO metric_input VALUES
('ns', 'app-a', '2026-01-23T03:40:00Z', 10.0),
('ns', 'app-a', '2026-01-23T03:50:00Z', 20.0);
Affected Rows: 2
USE sink_schema;
Affected Rows: 0
-- Intentionally omit greptime_value DOUBLE from the pre-created sink table.
CREATE TABLE missing_value_sink (
namespace STRING NULL,
app STRING NULL,
greptime_timestamp TIMESTAMP(3) NOT NULL,
TIME INDEX (greptime_timestamp),
PRIMARY KEY (namespace, app)
)
ENGINE=mito;
Affected Rows: 0
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
CREATE FLOW missing_value_flow
SINK TO sink_schema.missing_value_sink
EVAL INTERVAL '3600 s'
AS TQL EVAL (
date_bin('2m'::interval, now() - '2m'::interval),
date_bin('2m'::interval, now() - '2m'::interval),
'1h'
)
avg by (namespace, app) (
avg_over_time(metric_input{__schema__="source_schema"}[1h])
);
Error: 3001(EngineExecuteQuery), Datafusion error: Plan("Flow output schema does not match sink table schema: found 4 flow output columns and 3 sink table columns. flow output columns: [\"namespace\", \"app\", \"greptime_timestamp\", \"avg(prom_avg_over_time(greptime_timestamp_range,greptime_value))\"], sink table columns: [\"namespace\", \"app\", \"greptime_timestamp\"], extra flow columns not in sink: [\"avg(prom_avg_over_time(greptime_timestamp_range,greptime_value))\"], missing sink columns from flow output: []") in context: Failed to rewrite plan
DROP FLOW IF EXISTS missing_value_flow;
Affected Rows: 0
DROP TABLE missing_value_sink;
Affected Rows: 0
USE source_schema;
Affected Rows: 0
DROP TABLE metric_input;
Affected Rows: 0
USE public;
Affected Rows: 0
DROP DATABASE sink_schema;
Affected Rows: 0
DROP DATABASE source_schema;
Affected Rows: 0

View File

@@ -0,0 +1,55 @@
-- Regression for a TQL flow whose pre-created sink table is missing the value
-- output column. The labels are intentionally minimal and anonymous.
-- Source and sink live in separate databases: the flow later in this script
-- selects metric_input with a __schema__="source_schema" matcher and sinks to
-- sink_schema.missing_value_sink.
CREATE DATABASE source_schema;
CREATE DATABASE sink_schema;
USE source_schema;
CREATE TABLE metric_input (
namespace STRING NULL,
app STRING NULL,
greptime_timestamp TIMESTAMP(3) NOT NULL,
greptime_value DOUBLE NULL,
TIME INDEX (greptime_timestamp),
PRIMARY KEY (namespace, app)
);
-- Two samples for a single (namespace, app) pair, ten minutes apart.
INSERT INTO metric_input VALUES
('ns', 'app-a', '2026-01-23T03:40:00Z', 10.0),
('ns', 'app-a', '2026-01-23T03:50:00Z', 20.0);
-- Switch schema so the sink table is created in sink_schema.
USE sink_schema;
-- Intentionally omit greptime_value DOUBLE from the pre-created sink table.
-- The remaining columns, time index and primary key mirror metric_input.
CREATE TABLE missing_value_sink (
namespace STRING NULL,
app STRING NULL,
greptime_timestamp TIMESTAMP(3) NOT NULL,
TIME INDEX (greptime_timestamp),
PRIMARY KEY (namespace, app)
)
ENGINE=mito;
-- This CREATE FLOW is the statement under test. Per the regression note at the
-- top of this script, the sink table lacks the value output column, so the
-- statement is expected to be rejected.
-- TQL EVAL is given the same date_bin expression for both start and end, with
-- a '1h' step. The selector's __schema__ matcher names source_schema
-- explicitly; the sink is addressed as sink_schema.missing_value_sink.
-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
CREATE FLOW missing_value_flow
SINK TO sink_schema.missing_value_sink
EVAL INTERVAL '3600 s'
AS TQL EVAL (
date_bin('2m'::interval, now() - '2m'::interval),
date_bin('2m'::interval, now() - '2m'::interval),
'1h'
)
avg by (namespace, app) (
avg_over_time(metric_input{__schema__="source_schema"}[1h])
);
-- Teardown. IF EXISTS is used because the CREATE FLOW in this script is
-- expected to fail, so normally there is no flow to drop.
DROP FLOW IF EXISTS missing_value_flow;
DROP TABLE missing_value_sink;
-- Switch to source_schema to drop the source table by its unqualified name.
USE source_schema;
DROP TABLE metric_input;
-- Return to public before removing both test databases.
USE public;
DROP DATABASE sink_schema;
DROP DATABASE source_schema;

View File

@@ -0,0 +1,4 @@
host_id,host_name,reading_value,ts
1,Alice,10.5,2024-01-01T00:00:00
bad,Bad,20.0,2024-01-01T00:00:01
2,Bob,30.5,2024-01-01T00:00:02
1 host_id host_name reading_value ts
2 1 Alice 10.5 2024-01-01T00:00:00
3 bad Bad 20.0 2024-01-01T00:00:01
4 2 Bob 30.5 2024-01-01T00:00:02