From 8513d8d6b6cccf5a5202173e15ff01ac21d7d205 Mon Sep 17 00:00:00 2001 From: Yingwen Date: Wed, 24 Jun 2026 17:35:25 +0800 Subject: [PATCH] feat: add flow batching metrics to grafana dashboard (#8353) Signed-off-by: evenyag --- .../dashboards/metrics/cluster/dashboard.json | 777 ++++++++++++++++++ .../dashboards/metrics/cluster/dashboard.md | 9 +- .../dashboards/metrics/cluster/dashboard.yaml | 105 +++ .../metrics/standalone/dashboard.json | 777 ++++++++++++++++++ .../metrics/standalone/dashboard.md | 9 +- .../metrics/standalone/dashboard.yaml | 105 +++ 6 files changed, 1780 insertions(+), 2 deletions(-) diff --git a/grafana/dashboards/metrics/cluster/dashboard.json b/grafana/dashboards/metrics/cluster/dashboard.json index 507d9312e6..bd22a0da59 100644 --- a/grafana/dashboards/metrics/cluster/dashboard.json +++ b/grafana/dashboards/metrics/cluster/dashboard.json @@ -3636,6 +3636,19 @@ "legendFormat": "trigger-save-alert", "range": true, "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "sum(rate(greptime_flow_batching_error_count[$__rate_interval]))", + "hide": false, + "instant": false, + "legendFormat": "flow-batching", + "range": true, + "refId": "E" } ], "title": "Flow and Trigger Failures", @@ -17027,6 +17040,770 @@ ], "title": "Flow Processing Error per Instance", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Flow batching mode task attempt, error, and slow-query rates by flow.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 178 + }, + "id": 558, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_start_query_count[$__rate_interval]))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-attempt", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_error_count[$__rate_interval]))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-error", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_slow_query_secs_count[$__rate_interval]))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-slow", + "range": true, + "refId": "C" + } + ], + "title": "Flow Batching Attempt / Error Rate", + "type": "timeseries", + "pluginVersion": "11.6.0" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Flow batching mode query latency by flow.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 186 + }, + "id": 559, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p99", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_count[$__rate_interval]))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg", + "range": true, + "refId": "C" + } + ], + "title": "Flow Batching Query Latency", + "type": "timeseries", + "pluginVersion": "11.6.0" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Flow batching mode query window count by flow.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 186 + }, + "id": 560, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_bucket[$__rate_interval])))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_count[$__rate_interval]))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg", + "range": true, + "refId": "B" + } + ], + "title": "Flow Batching Query Window Count", + "type": "timeseries", + "pluginVersion": "11.6.0" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Flow batching mode query and stalled window sizes by flow.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 186 + }, + "id": 561, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_size_secs_bucket[$__rate_interval])))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-query-p95", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_stalled_window_size_secs_bucket[$__rate_interval])))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-stalled-p95", + "range": true, + "refId": "B" + } + ], + "title": "Flow Batching Window Size", + "type": "timeseries", + "pluginVersion": "11.6.0" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Flow batching mode dirty time-window count marked by bulk inserts.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 194 + }, + "id": 562, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "sum by(instance,pod,flow_id) (greptime_flow_batching_engine_bulk_mark_time_window)", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]", + "range": true, + "refId": "A" + } + ], + "title": "Flow Batching Bulk Dirty Windows", + "type": "timeseries", + "pluginVersion": "11.6.0" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Flow batching mode checkpoint state-machine decision rate.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 194 + }, + "id": 563, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "sum by(instance,pod,flow_id,mode,decision,reason) (rate(greptime_flow_batching_checkpoint_decision_count[$__rate_interval]))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]-[{{decision}}]-[{{reason}}]", + "range": true, + "refId": "A" + } + ], + "title": "Flow Batching Checkpoint Decisions", + "type": "timeseries", + "pluginVersion": "11.6.0" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Flow batching mode query attempts by checkpoint mode.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 194 + }, + "id": 564, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "sum by(instance,pod,flow_id,mode) (rate(greptime_flow_batching_query_mode_count[$__rate_interval]))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]", + "range": true, + "refId": "A" + } + ], + "title": "Flow Batching Query Mode", + "type": "timeseries", + "pluginVersion": "11.6.0" } ], "title": "Flownode", diff --git a/grafana/dashboards/metrics/cluster/dashboard.md b/grafana/dashboards/metrics/cluster/dashboard.md index fd7ae68fd4..b8e050114c 100644 --- a/grafana/dashboards/metrics/cluster/dashboard.md +++ b/grafana/dashboards/metrics/cluster/dashboard.md @@ -36,7 +36,7 @@ | Scan and Compaction Memory Rejects | `sum(rate(greptime_mito_scan_requests_rejected_total{instance=~"$datanode"}[$__rate_interval]))`
`sum(rate(greptime_mito_scan_memory_exhausted_total{instance=~"$datanode"}[$__rate_interval]))`
`sum(rate(greptime_mito_compaction_memory_rejected_total{instance=~"$datanode"}[$__rate_interval]))` | `timeseries` | Datanode scan and compaction memory rejection/exhaustion counters. | `prometheus` | `rps` | `scan-rejected` | | OpenDAL Errors | `sum by (scheme, operation, error) (rate(opendal_operation_errors_total{instance=~"$datanode",error!="NotFound"}[$__rate_interval]))` | `timeseries` | Object-store errors by scheme, operation, and error, excluding NotFound noise. | `prometheus` | `eps` | `{{scheme}}-{{operation}}-{{error}}` | | Metasrv Failures | `sum(rate(greptime_meta_region_migration_fail[$__rate_interval]))`
`sum(rate(greptime_meta_reconciliation_procedure_error[$__rate_interval]))` | `timeseries` | Region migration and reconciliation failures in metasrv. | `prometheus` | `eps` | `migration-fail` | -| Flow and Trigger Failures | `sum by (code) (rate(greptime_flow_errors[$__rate_interval]))`
`sum(rate(greptime_trigger_evaluate_failure_count[$__rate_interval]))`
`sum(rate(greptime_trigger_send_alert_failure_count[$__rate_interval]))`
`sum(rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval]))` | `timeseries` | Derived-data and alerting pipeline failures. | `prometheus` | `eps` | `flow-{{code}}` | +| Flow and Trigger Failures | `sum by (code) (rate(greptime_flow_errors[$__rate_interval]))`
`sum(rate(greptime_trigger_evaluate_failure_count[$__rate_interval]))`
`sum(rate(greptime_trigger_send_alert_failure_count[$__rate_interval]))`
`sum(rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval]))`
`sum(rate(greptime_flow_batching_error_count[$__rate_interval]))` | `timeseries` | Derived-data and alerting pipeline failures. | `prometheus` | `eps` | `flow-{{code}}` | | Mito GC Failures | `sum(rate(greptime_mito_gc_errors_total{instance=~"$datanode"}[$__rate_interval]))`
`sum(rate(greptime_mito_gc_orphaned_index_files{instance=~"$datanode"}[$__rate_interval]))`
`sum(rate(greptime_mito_gc_skipped_unparsable_files{instance=~"$datanode"}[$__rate_interval]))` | `timeseries` | Mito garbage-collection errors and skipped/orphaned files on datanodes. | `prometheus` | `short` | `gc-errors` | # Capacity | Title | Query | Type | Description | Datasource | Unit | Legend Format | @@ -282,6 +282,13 @@ ORDER BY data_size DESC;` | `piechart` | Distribution of leader regions and data | Flow Operation Latency | `histogram_quantile(0.95, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))`
`histogram_quantile(0.99, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))` | `timeseries` | Flow Operation Latency. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{type}}]-p95` | | Flow Buffer Size per Instance | `greptime_flow_input_buf_size` | `timeseries` | Flow Buffer Size per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]` | | Flow Processing Error per Instance | `sum by(instance,pod,code) (rate(greptime_flow_errors[$__rate_interval]))` | `timeseries` | Flow Processing Error per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{code}}]` | +| Flow Batching Attempt / Error Rate | `sum by(instance,pod,flow_id) (rate(greptime_flow_batching_start_query_count[$__rate_interval]))`
`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_error_count[$__rate_interval]))`
`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_slow_query_secs_count[$__rate_interval]))` | `timeseries` | Flow batching mode task attempt, error, and slow-query rates by flow. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-attempt` | +| Flow Batching Query Latency | `histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))`
`histogram_quantile(0.99, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))`
`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_count[$__rate_interval]))` | `timeseries` | Flow batching mode query latency by flow. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95` | +| Flow Batching Query Window Count | `histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_bucket[$__rate_interval])))`
`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_count[$__rate_interval]))` | `timeseries` | Flow batching mode query window count by flow. | `prometheus` | `short` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95` | +| Flow Batching Window Size | `histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_size_secs_bucket[$__rate_interval])))`
`histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_stalled_window_size_secs_bucket[$__rate_interval])))` | `timeseries` | Flow batching mode query and stalled window sizes by flow. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-query-p95` | +| Flow Batching Bulk Dirty Windows | `sum by(instance,pod,flow_id) (greptime_flow_batching_engine_bulk_mark_time_window)` | `timeseries` | Flow batching mode dirty time-window count marked by bulk inserts. | `prometheus` | `short` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]` | +| Flow Batching Checkpoint Decisions | `sum by(instance,pod,flow_id,mode,decision,reason) (rate(greptime_flow_batching_checkpoint_decision_count[$__rate_interval]))` | `timeseries` | Flow batching mode checkpoint state-machine decision rate. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]-[{{decision}}]-[{{reason}}]` | +| Flow Batching Query Mode | `sum by(instance,pod,flow_id,mode) (rate(greptime_flow_batching_query_mode_count[$__rate_interval]))` | `timeseries` | Flow batching mode query attempts by checkpoint mode. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]` | # Trigger | Title | Query | Type | Description | Datasource | Unit | Legend Format | | --- | --- | --- | --- | --- | --- | --- | diff --git a/grafana/dashboards/metrics/cluster/dashboard.yaml b/grafana/dashboards/metrics/cluster/dashboard.yaml index 127a8d606b..cd105edfcd 100644 --- a/grafana/dashboards/metrics/cluster/dashboard.yaml +++ b/grafana/dashboards/metrics/cluster/dashboard.yaml @@ -535,6 +535,11 @@ groups: type: prometheus uid: ${metrics} legendFormat: trigger-save-alert + - expr: sum(rate(greptime_flow_batching_error_count[$__rate_interval])) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: flow-batching - title: Mito GC Failures type: timeseries description: Mito garbage-collection errors and skipped/orphaned files on datanodes. @@ -2245,6 +2250,106 @@ groups: type: prometheus uid: ${metrics} legendFormat: '[{{instance}}]-[{{pod}}]-[{{code}}]' + - title: Flow Batching Attempt / Error Rate + type: timeseries + description: Flow batching mode task attempt, error, and slow-query rates by flow. + unit: ops + queries: + - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_start_query_count[$__rate_interval])) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-attempt' + - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_error_count[$__rate_interval])) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-error' + - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_slow_query_secs_count[$__rate_interval])) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-slow' + - title: Flow Batching Query Latency + type: timeseries + description: Flow batching mode query latency by flow. + unit: s + queries: + - expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval]))) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95' + - expr: histogram_quantile(0.99, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval]))) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p99' + - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_count[$__rate_interval])) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg' + - title: Flow Batching Query Window Count + type: timeseries + description: Flow batching mode query window count by flow. + unit: short + queries: + - expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_bucket[$__rate_interval]))) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95' + - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_count[$__rate_interval])) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg' + - title: Flow Batching Window Size + type: timeseries + description: Flow batching mode query and stalled window sizes by flow. + unit: s + queries: + - expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_size_secs_bucket[$__rate_interval]))) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-query-p95' + - expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_stalled_window_size_secs_bucket[$__rate_interval]))) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-stalled-p95' + - title: Flow Batching Bulk Dirty Windows + type: timeseries + description: Flow batching mode dirty time-window count marked by bulk inserts. + unit: short + queries: + - expr: sum by(instance,pod,flow_id) (greptime_flow_batching_engine_bulk_mark_time_window) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]' + - title: Flow Batching Checkpoint Decisions + type: timeseries + description: Flow batching mode checkpoint state-machine decision rate. + unit: ops + queries: + - expr: sum by(instance,pod,flow_id,mode,decision,reason) (rate(greptime_flow_batching_checkpoint_decision_count[$__rate_interval])) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]-[{{decision}}]-[{{reason}}]' + - title: Flow Batching Query Mode + type: timeseries + description: Flow batching mode query attempts by checkpoint mode. + unit: ops + queries: + - expr: sum by(instance,pod,flow_id,mode) (rate(greptime_flow_batching_query_mode_count[$__rate_interval])) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]' - title: Trigger panels: - title: Trigger Count diff --git a/grafana/dashboards/metrics/standalone/dashboard.json b/grafana/dashboards/metrics/standalone/dashboard.json index 376e1665cd..a90d05760e 100644 --- a/grafana/dashboards/metrics/standalone/dashboard.json +++ b/grafana/dashboards/metrics/standalone/dashboard.json @@ -3636,6 +3636,19 @@ "legendFormat": "trigger-save-alert", "range": true, "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "sum(rate(greptime_flow_batching_error_count[$__rate_interval]))", + "hide": false, + "instant": false, + "legendFormat": "flow-batching", + "range": true, + "refId": "E" } ], "title": "Flow and Trigger Failures", @@ -17027,6 +17040,770 @@ ], "title": "Flow Processing Error per Instance", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Flow batching mode task attempt, error, and slow-query rates by flow.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 178 + }, + "id": 558, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_start_query_count[$__rate_interval]))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-attempt", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_error_count[$__rate_interval]))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-error", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_slow_query_secs_count[$__rate_interval]))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-slow", + "range": true, + "refId": "C" + } + ], + "title": "Flow Batching Attempt / Error Rate", + "type": "timeseries", + "pluginVersion": "11.6.0" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Flow batching mode query latency by flow.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 186 + }, + "id": 559, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p99", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_count[$__rate_interval]))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg", + "range": true, + "refId": "C" + } + ], + "title": "Flow Batching Query Latency", + "type": "timeseries", + "pluginVersion": "11.6.0" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Flow batching mode query window count by flow.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 186 + }, + "id": 560, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_bucket[$__rate_interval])))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_count[$__rate_interval]))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg", + "range": true, + "refId": "B" + } + ], + "title": "Flow Batching Query Window Count", + "type": "timeseries", + "pluginVersion": "11.6.0" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Flow batching mode query and stalled window sizes by flow.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 186 + }, + "id": 561, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_size_secs_bucket[$__rate_interval])))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-query-p95", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_stalled_window_size_secs_bucket[$__rate_interval])))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-stalled-p95", + "range": true, + "refId": "B" + } + ], + "title": "Flow Batching Window Size", + "type": "timeseries", + "pluginVersion": "11.6.0" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Flow batching mode dirty time-window count marked by bulk inserts.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 194 + }, + "id": 562, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "sum by(instance,pod,flow_id) (greptime_flow_batching_engine_bulk_mark_time_window)", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]", + "range": true, + "refId": "A" + } + ], + "title": "Flow Batching Bulk Dirty Windows", + "type": "timeseries", + "pluginVersion": "11.6.0" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Flow batching mode checkpoint state-machine decision rate.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 194 + }, + "id": 563, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "sum by(instance,pod,flow_id,mode,decision,reason) (rate(greptime_flow_batching_checkpoint_decision_count[$__rate_interval]))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]-[{{decision}}]-[{{reason}}]", + "range": true, + "refId": "A" + } + ], + "title": "Flow Batching Checkpoint Decisions", + "type": "timeseries", + "pluginVersion": "11.6.0" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Flow batching mode query attempts by checkpoint mode.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 194 + }, + "id": 564, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "sum by(instance,pod,flow_id,mode) (rate(greptime_flow_batching_query_mode_count[$__rate_interval]))", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]", + "range": true, + "refId": "A" + } + ], + "title": "Flow Batching Query Mode", + "type": "timeseries", + "pluginVersion": "11.6.0" } ], "title": "Flownode", diff --git a/grafana/dashboards/metrics/standalone/dashboard.md b/grafana/dashboards/metrics/standalone/dashboard.md index a3a894cffa..2ed84c5921 100644 --- a/grafana/dashboards/metrics/standalone/dashboard.md +++ b/grafana/dashboards/metrics/standalone/dashboard.md @@ -36,7 +36,7 @@ | Scan and Compaction Memory Rejects | `sum(rate(greptime_mito_scan_requests_rejected_total[$__rate_interval]))`
`sum(rate(greptime_mito_scan_memory_exhausted_total[$__rate_interval]))`
`sum(rate(greptime_mito_compaction_memory_rejected_total[$__rate_interval]))` | `timeseries` | Datanode scan and compaction memory rejection/exhaustion counters. | `prometheus` | `rps` | `scan-rejected` | | OpenDAL Errors | `sum by (scheme, operation, error) (rate(opendal_operation_errors_total{error!="NotFound"}[$__rate_interval]))` | `timeseries` | Object-store errors by scheme, operation, and error, excluding NotFound noise. | `prometheus` | `eps` | `{{scheme}}-{{operation}}-{{error}}` | | Metasrv Failures | `sum(rate(greptime_meta_region_migration_fail[$__rate_interval]))`
`sum(rate(greptime_meta_reconciliation_procedure_error[$__rate_interval]))` | `timeseries` | Region migration and reconciliation failures in metasrv. | `prometheus` | `eps` | `migration-fail` | -| Flow and Trigger Failures | `sum by (code) (rate(greptime_flow_errors[$__rate_interval]))`
`sum(rate(greptime_trigger_evaluate_failure_count[$__rate_interval]))`
`sum(rate(greptime_trigger_send_alert_failure_count[$__rate_interval]))`
`sum(rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval]))` | `timeseries` | Derived-data and alerting pipeline failures. | `prometheus` | `eps` | `flow-{{code}}` | +| Flow and Trigger Failures | `sum by (code) (rate(greptime_flow_errors[$__rate_interval]))`
`sum(rate(greptime_trigger_evaluate_failure_count[$__rate_interval]))`
`sum(rate(greptime_trigger_send_alert_failure_count[$__rate_interval]))`
`sum(rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval]))`
`sum(rate(greptime_flow_batching_error_count[$__rate_interval]))` | `timeseries` | Derived-data and alerting pipeline failures. | `prometheus` | `eps` | `flow-{{code}}` | | Mito GC Failures | `sum(rate(greptime_mito_gc_errors_total[$__rate_interval]))`
`sum(rate(greptime_mito_gc_orphaned_index_files[$__rate_interval]))`
`sum(rate(greptime_mito_gc_skipped_unparsable_files[$__rate_interval]))` | `timeseries` | Mito garbage-collection errors and skipped/orphaned files on datanodes. | `prometheus` | `short` | `gc-errors` | # Capacity | Title | Query | Type | Description | Datasource | Unit | Legend Format | @@ -282,6 +282,13 @@ ORDER BY data_size DESC;` | `piechart` | Distribution of leader regions and data | Flow Operation Latency | `histogram_quantile(0.95, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))`
`histogram_quantile(0.99, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))` | `timeseries` | Flow Operation Latency. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{type}}]-p95` | | Flow Buffer Size per Instance | `greptime_flow_input_buf_size` | `timeseries` | Flow Buffer Size per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]` | | Flow Processing Error per Instance | `sum by(instance,pod,code) (rate(greptime_flow_errors[$__rate_interval]))` | `timeseries` | Flow Processing Error per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{code}}]` | +| Flow Batching Attempt / Error Rate | `sum by(instance,pod,flow_id) (rate(greptime_flow_batching_start_query_count[$__rate_interval]))`
`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_error_count[$__rate_interval]))`
`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_slow_query_secs_count[$__rate_interval]))` | `timeseries` | Flow batching mode task attempt, error, and slow-query rates by flow. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-attempt` | +| Flow Batching Query Latency | `histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))`
`histogram_quantile(0.99, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))`
`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_count[$__rate_interval]))` | `timeseries` | Flow batching mode query latency by flow. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95` | +| Flow Batching Query Window Count | `histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_bucket[$__rate_interval])))`
`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_count[$__rate_interval]))` | `timeseries` | Flow batching mode query window count by flow. | `prometheus` | `short` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95` | +| Flow Batching Window Size | `histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_size_secs_bucket[$__rate_interval])))`
`histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_stalled_window_size_secs_bucket[$__rate_interval])))` | `timeseries` | Flow batching mode query and stalled window sizes by flow. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-query-p95` | +| Flow Batching Bulk Dirty Windows | `sum by(instance,pod,flow_id) (greptime_flow_batching_engine_bulk_mark_time_window)` | `timeseries` | Flow batching mode dirty time-window count marked by bulk inserts. | `prometheus` | `short` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]` | +| Flow Batching Checkpoint Decisions | `sum by(instance,pod,flow_id,mode,decision,reason) (rate(greptime_flow_batching_checkpoint_decision_count[$__rate_interval]))` | `timeseries` | Flow batching mode checkpoint state-machine decision rate. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]-[{{decision}}]-[{{reason}}]` | +| Flow Batching Query Mode | `sum by(instance,pod,flow_id,mode) (rate(greptime_flow_batching_query_mode_count[$__rate_interval]))` | `timeseries` | Flow batching mode query attempts by checkpoint mode. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]` | # Trigger | Title | Query | Type | Description | Datasource | Unit | Legend Format | | --- | --- | --- | --- | --- | --- | --- | diff --git a/grafana/dashboards/metrics/standalone/dashboard.yaml b/grafana/dashboards/metrics/standalone/dashboard.yaml index 0ebad934ba..8da2a8fa6f 100644 --- a/grafana/dashboards/metrics/standalone/dashboard.yaml +++ b/grafana/dashboards/metrics/standalone/dashboard.yaml @@ -535,6 +535,11 @@ groups: type: prometheus uid: ${metrics} legendFormat: trigger-save-alert + - expr: sum(rate(greptime_flow_batching_error_count[$__rate_interval])) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: flow-batching - title: Mito GC Failures type: timeseries description: Mito garbage-collection errors and skipped/orphaned files on datanodes. @@ -2245,6 +2250,106 @@ groups: type: prometheus uid: ${metrics} legendFormat: '[{{instance}}]-[{{pod}}]-[{{code}}]' + - title: Flow Batching Attempt / Error Rate + type: timeseries + description: Flow batching mode task attempt, error, and slow-query rates by flow. + unit: ops + queries: + - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_start_query_count[$__rate_interval])) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-attempt' + - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_error_count[$__rate_interval])) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-error' + - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_slow_query_secs_count[$__rate_interval])) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-slow' + - title: Flow Batching Query Latency + type: timeseries + description: Flow batching mode query latency by flow. + unit: s + queries: + - expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval]))) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95' + - expr: histogram_quantile(0.99, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval]))) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p99' + - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_count[$__rate_interval])) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg' + - title: Flow Batching Query Window Count + type: timeseries + description: Flow batching mode query window count by flow. + unit: short + queries: + - expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_bucket[$__rate_interval]))) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95' + - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_count[$__rate_interval])) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg' + - title: Flow Batching Window Size + type: timeseries + description: Flow batching mode query and stalled window sizes by flow. + unit: s + queries: + - expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_size_secs_bucket[$__rate_interval]))) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-query-p95' + - expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_stalled_window_size_secs_bucket[$__rate_interval]))) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-stalled-p95' + - title: Flow Batching Bulk Dirty Windows + type: timeseries + description: Flow batching mode dirty time-window count marked by bulk inserts. + unit: short + queries: + - expr: sum by(instance,pod,flow_id) (greptime_flow_batching_engine_bulk_mark_time_window) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]' + - title: Flow Batching Checkpoint Decisions + type: timeseries + description: Flow batching mode checkpoint state-machine decision rate. + unit: ops + queries: + - expr: sum by(instance,pod,flow_id,mode,decision,reason) (rate(greptime_flow_batching_checkpoint_decision_count[$__rate_interval])) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]-[{{decision}}]-[{{reason}}]' + - title: Flow Batching Query Mode + type: timeseries + description: Flow batching mode query attempts by checkpoint mode. + unit: ops + queries: + - expr: sum by(instance,pod,flow_id,mode) (rate(greptime_flow_batching_query_mode_count[$__rate_interval])) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]' - title: Trigger panels: - title: Trigger Count