diff --git a/grafana/dashboards/metrics/cluster/dashboard.json b/grafana/dashboards/metrics/cluster/dashboard.json
index 507d9312e6..bd22a0da59 100644
--- a/grafana/dashboards/metrics/cluster/dashboard.json
+++ b/grafana/dashboards/metrics/cluster/dashboard.json
@@ -3636,6 +3636,19 @@
"legendFormat": "trigger-save-alert",
"range": true,
"refId": "D"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "sum(rate(greptime_flow_batching_error_count[$__rate_interval]))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "flow-batching",
+ "range": true,
+ "refId": "E"
}
],
"title": "Flow and Trigger Failures",
@@ -17027,6 +17040,770 @@
],
"title": "Flow Processing Error per Instance",
"type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "description": "Flow batching mode task attempt, error, and slow-query rates by flow.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "ops"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 16,
+ "y": 178
+ },
+ "id": 558,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_start_query_count[$__rate_interval]))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-attempt",
+ "range": true,
+ "refId": "A"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_error_count[$__rate_interval]))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-error",
+ "range": true,
+ "refId": "B"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_slow_query_secs_count[$__rate_interval]))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-slow",
+ "range": true,
+ "refId": "C"
+ }
+ ],
+ "title": "Flow Batching Attempt / Error Rate",
+ "type": "timeseries",
+ "pluginVersion": "11.6.0"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "description": "Flow batching mode query latency by flow.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 0,
+ "y": 186
+ },
+ "id": 559,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95",
+ "range": true,
+ "refId": "A"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "histogram_quantile(0.99, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p99",
+ "range": true,
+ "refId": "B"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_count[$__rate_interval]))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg",
+ "range": true,
+ "refId": "C"
+ }
+ ],
+ "title": "Flow Batching Query Latency",
+ "type": "timeseries",
+ "pluginVersion": "11.6.0"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "description": "Flow batching mode query window count by flow.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 8,
+ "y": 186
+ },
+ "id": 560,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_bucket[$__rate_interval])))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95",
+ "range": true,
+ "refId": "A"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_count[$__rate_interval]))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg",
+ "range": true,
+ "refId": "B"
+ }
+ ],
+ "title": "Flow Batching Query Window Count",
+ "type": "timeseries",
+ "pluginVersion": "11.6.0"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "description": "Flow batching mode query and stalled window sizes by flow.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 16,
+ "y": 186
+ },
+ "id": 561,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_size_secs_bucket[$__rate_interval])))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-query-p95",
+ "range": true,
+ "refId": "A"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_stalled_window_size_secs_bucket[$__rate_interval])))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-stalled-p95",
+ "range": true,
+ "refId": "B"
+ }
+ ],
+ "title": "Flow Batching Window Size",
+ "type": "timeseries",
+ "pluginVersion": "11.6.0"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "description": "Flow batching mode dirty time-window count marked by bulk inserts.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 0,
+ "y": 194
+ },
+ "id": 562,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "sum by(instance,pod,flow_id) (greptime_flow_batching_engine_bulk_mark_time_window)",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Flow Batching Bulk Dirty Windows",
+ "type": "timeseries",
+ "pluginVersion": "11.6.0"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "description": "Flow batching mode checkpoint state-machine decision rate.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "ops"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 8,
+ "y": 194
+ },
+ "id": 563,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "sum by(instance,pod,flow_id,mode,decision,reason) (rate(greptime_flow_batching_checkpoint_decision_count[$__rate_interval]))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]-[{{decision}}]-[{{reason}}]",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Flow Batching Checkpoint Decisions",
+ "type": "timeseries",
+ "pluginVersion": "11.6.0"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "description": "Flow batching mode query attempts by checkpoint mode.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "ops"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 16,
+ "y": 194
+ },
+ "id": 564,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "sum by(instance,pod,flow_id,mode) (rate(greptime_flow_batching_query_mode_count[$__rate_interval]))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Flow Batching Query Mode",
+ "type": "timeseries",
+ "pluginVersion": "11.6.0"
}
],
"title": "Flownode",
diff --git a/grafana/dashboards/metrics/cluster/dashboard.md b/grafana/dashboards/metrics/cluster/dashboard.md
index fd7ae68fd4..b8e050114c 100644
--- a/grafana/dashboards/metrics/cluster/dashboard.md
+++ b/grafana/dashboards/metrics/cluster/dashboard.md
@@ -36,7 +36,7 @@
| Scan and Compaction Memory Rejects | `sum(rate(greptime_mito_scan_requests_rejected_total{instance=~"$datanode"}[$__rate_interval]))`
`sum(rate(greptime_mito_scan_memory_exhausted_total{instance=~"$datanode"}[$__rate_interval]))`
`sum(rate(greptime_mito_compaction_memory_rejected_total{instance=~"$datanode"}[$__rate_interval]))` | `timeseries` | Datanode scan and compaction memory rejection/exhaustion counters. | `prometheus` | `rps` | `scan-rejected` |
| OpenDAL Errors | `sum by (scheme, operation, error) (rate(opendal_operation_errors_total{instance=~"$datanode",error!="NotFound"}[$__rate_interval]))` | `timeseries` | Object-store errors by scheme, operation, and error, excluding NotFound noise. | `prometheus` | `eps` | `{{scheme}}-{{operation}}-{{error}}` |
| Metasrv Failures | `sum(rate(greptime_meta_region_migration_fail[$__rate_interval]))`
`sum(rate(greptime_meta_reconciliation_procedure_error[$__rate_interval]))` | `timeseries` | Region migration and reconciliation failures in metasrv. | `prometheus` | `eps` | `migration-fail` |
-| Flow and Trigger Failures | `sum by (code) (rate(greptime_flow_errors[$__rate_interval]))`
`sum(rate(greptime_trigger_evaluate_failure_count[$__rate_interval]))`
`sum(rate(greptime_trigger_send_alert_failure_count[$__rate_interval]))`
`sum(rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval]))` | `timeseries` | Derived-data and alerting pipeline failures. | `prometheus` | `eps` | `flow-{{code}}` |
+| Flow and Trigger Failures | `sum by (code) (rate(greptime_flow_errors[$__rate_interval]))`
`sum(rate(greptime_trigger_evaluate_failure_count[$__rate_interval]))`
`sum(rate(greptime_trigger_send_alert_failure_count[$__rate_interval]))`
`sum(rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval]))`
`sum(rate(greptime_flow_batching_error_count[$__rate_interval]))` | `timeseries` | Derived-data and alerting pipeline failures. | `prometheus` | `eps` | `flow-{{code}}` |
| Mito GC Failures | `sum(rate(greptime_mito_gc_errors_total{instance=~"$datanode"}[$__rate_interval]))`
`sum(rate(greptime_mito_gc_orphaned_index_files{instance=~"$datanode"}[$__rate_interval]))`
`sum(rate(greptime_mito_gc_skipped_unparsable_files{instance=~"$datanode"}[$__rate_interval]))` | `timeseries` | Mito garbage-collection errors and skipped/orphaned files on datanodes. | `prometheus` | `short` | `gc-errors` |
# Capacity
| Title | Query | Type | Description | Datasource | Unit | Legend Format |
@@ -282,6 +282,13 @@ ORDER BY data_size DESC;` | `piechart` | Distribution of leader regions and data
| Flow Operation Latency | `histogram_quantile(0.95, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))`
`histogram_quantile(0.99, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))` | `timeseries` | Flow Operation Latency. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{type}}]-p95` |
| Flow Buffer Size per Instance | `greptime_flow_input_buf_size` | `timeseries` | Flow Buffer Size per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]` |
| Flow Processing Error per Instance | `sum by(instance,pod,code) (rate(greptime_flow_errors[$__rate_interval]))` | `timeseries` | Flow Processing Error per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{code}}]` |
+| Flow Batching Attempt / Error Rate | `sum by(instance,pod,flow_id) (rate(greptime_flow_batching_start_query_count[$__rate_interval]))`
`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_error_count[$__rate_interval]))`
`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_slow_query_secs_count[$__rate_interval]))` | `timeseries` | Flow batching mode task attempt, error, and slow-query rates by flow. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-attempt` |
+| Flow Batching Query Latency | `histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))`
`histogram_quantile(0.99, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))`
`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_count[$__rate_interval]))` | `timeseries` | Flow batching mode query latency by flow. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95` |
+| Flow Batching Query Window Count | `histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_bucket[$__rate_interval])))`
`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_count[$__rate_interval]))` | `timeseries` | Flow batching mode query window count by flow. | `prometheus` | `short` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95` |
+| Flow Batching Window Size | `histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_size_secs_bucket[$__rate_interval])))`
`histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_stalled_window_size_secs_bucket[$__rate_interval])))` | `timeseries` | Flow batching mode query and stalled window sizes by flow. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-query-p95` |
+| Flow Batching Bulk Dirty Windows | `sum by(instance,pod,flow_id) (greptime_flow_batching_engine_bulk_mark_time_window)` | `timeseries` | Flow batching mode dirty time-window count marked by bulk inserts. | `prometheus` | `short` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]` |
+| Flow Batching Checkpoint Decisions | `sum by(instance,pod,flow_id,mode,decision,reason) (rate(greptime_flow_batching_checkpoint_decision_count[$__rate_interval]))` | `timeseries` | Flow batching mode checkpoint state-machine decision rate. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]-[{{decision}}]-[{{reason}}]` |
+| Flow Batching Query Mode | `sum by(instance,pod,flow_id,mode) (rate(greptime_flow_batching_query_mode_count[$__rate_interval]))` | `timeseries` | Flow batching mode query attempts by checkpoint mode. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]` |
# Trigger
| Title | Query | Type | Description | Datasource | Unit | Legend Format |
| --- | --- | --- | --- | --- | --- | --- |
diff --git a/grafana/dashboards/metrics/cluster/dashboard.yaml b/grafana/dashboards/metrics/cluster/dashboard.yaml
index 127a8d606b..cd105edfcd 100644
--- a/grafana/dashboards/metrics/cluster/dashboard.yaml
+++ b/grafana/dashboards/metrics/cluster/dashboard.yaml
@@ -535,6 +535,11 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: trigger-save-alert
+ - expr: sum(rate(greptime_flow_batching_error_count[$__rate_interval]))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: flow-batching
- title: Mito GC Failures
type: timeseries
description: Mito garbage-collection errors and skipped/orphaned files on datanodes.
@@ -2245,6 +2250,106 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[{{code}}]'
+ - title: Flow Batching Attempt / Error Rate
+ type: timeseries
+ description: Flow batching mode task attempt, error, and slow-query rates by flow.
+ unit: ops
+ queries:
+ - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_start_query_count[$__rate_interval]))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-attempt'
+ - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_error_count[$__rate_interval]))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-error'
+ - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_slow_query_secs_count[$__rate_interval]))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-slow'
+ - title: Flow Batching Query Latency
+ type: timeseries
+ description: Flow batching mode query latency by flow.
+ unit: s
+ queries:
+ - expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95'
+ - expr: histogram_quantile(0.99, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p99'
+ - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_count[$__rate_interval]))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg'
+ - title: Flow Batching Query Window Count
+ type: timeseries
+ description: Flow batching mode query window count by flow.
+ unit: short
+ queries:
+ - expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_bucket[$__rate_interval])))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95'
+ - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_count[$__rate_interval]))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg'
+ - title: Flow Batching Window Size
+ type: timeseries
+ description: Flow batching mode query and stalled window sizes by flow.
+ unit: s
+ queries:
+ - expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_size_secs_bucket[$__rate_interval])))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-query-p95'
+ - expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_stalled_window_size_secs_bucket[$__rate_interval])))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-stalled-p95'
+ - title: Flow Batching Bulk Dirty Windows
+ type: timeseries
+ description: Flow batching mode dirty time-window count marked by bulk inserts.
+ unit: short
+ queries:
+ - expr: sum by(instance,pod,flow_id) (greptime_flow_batching_engine_bulk_mark_time_window)
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]'
+ - title: Flow Batching Checkpoint Decisions
+ type: timeseries
+ description: Flow batching mode checkpoint state-machine decision rate.
+ unit: ops
+ queries:
+ - expr: sum by(instance,pod,flow_id,mode,decision,reason) (rate(greptime_flow_batching_checkpoint_decision_count[$__rate_interval]))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]-[{{decision}}]-[{{reason}}]'
+ - title: Flow Batching Query Mode
+ type: timeseries
+ description: Flow batching mode query attempts by checkpoint mode.
+ unit: ops
+ queries:
+ - expr: sum by(instance,pod,flow_id,mode) (rate(greptime_flow_batching_query_mode_count[$__rate_interval]))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]'
- title: Trigger
panels:
- title: Trigger Count
diff --git a/grafana/dashboards/metrics/standalone/dashboard.json b/grafana/dashboards/metrics/standalone/dashboard.json
index 376e1665cd..a90d05760e 100644
--- a/grafana/dashboards/metrics/standalone/dashboard.json
+++ b/grafana/dashboards/metrics/standalone/dashboard.json
@@ -3636,6 +3636,19 @@
"legendFormat": "trigger-save-alert",
"range": true,
"refId": "D"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "sum(rate(greptime_flow_batching_error_count[$__rate_interval]))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "flow-batching",
+ "range": true,
+ "refId": "E"
}
],
"title": "Flow and Trigger Failures",
@@ -17027,6 +17040,770 @@
],
"title": "Flow Processing Error per Instance",
"type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "description": "Flow batching mode task attempt, error, and slow-query rates by flow.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "ops"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 16,
+ "y": 178
+ },
+ "id": 558,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_start_query_count[$__rate_interval]))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-attempt",
+ "range": true,
+ "refId": "A"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_error_count[$__rate_interval]))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-error",
+ "range": true,
+ "refId": "B"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_slow_query_secs_count[$__rate_interval]))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-slow",
+ "range": true,
+ "refId": "C"
+ }
+ ],
+ "title": "Flow Batching Attempt / Error Rate",
+ "type": "timeseries",
+ "pluginVersion": "11.6.0"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "description": "Flow batching mode query latency by flow.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 0,
+ "y": 186
+ },
+ "id": 559,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95",
+ "range": true,
+ "refId": "A"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "histogram_quantile(0.99, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p99",
+ "range": true,
+ "refId": "B"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_count[$__rate_interval]))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg",
+ "range": true,
+ "refId": "C"
+ }
+ ],
+ "title": "Flow Batching Query Latency",
+ "type": "timeseries",
+ "pluginVersion": "11.6.0"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "description": "Flow batching mode query window count by flow.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 8,
+ "y": 186
+ },
+ "id": 560,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_bucket[$__rate_interval])))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95",
+ "range": true,
+ "refId": "A"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_count[$__rate_interval]))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg",
+ "range": true,
+ "refId": "B"
+ }
+ ],
+ "title": "Flow Batching Query Window Count",
+ "type": "timeseries",
+ "pluginVersion": "11.6.0"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "description": "Flow batching mode query and stalled window sizes by flow.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "s"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 16,
+ "y": 186
+ },
+ "id": 561,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_size_secs_bucket[$__rate_interval])))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-query-p95",
+ "range": true,
+ "refId": "A"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_stalled_window_size_secs_bucket[$__rate_interval])))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-stalled-p95",
+ "range": true,
+ "refId": "B"
+ }
+ ],
+ "title": "Flow Batching Window Size",
+ "type": "timeseries",
+ "pluginVersion": "11.6.0"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "description": "Flow batching mode dirty time-window count marked by bulk inserts.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 0,
+ "y": 194
+ },
+ "id": 562,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "sum by(instance,pod,flow_id) (greptime_flow_batching_engine_bulk_mark_time_window)",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Flow Batching Bulk Dirty Windows",
+ "type": "timeseries",
+ "pluginVersion": "11.6.0"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "description": "Flow batching mode checkpoint state-machine decision rate.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "ops"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 8,
+ "y": 194
+ },
+ "id": 563,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "sum by(instance,pod,flow_id,mode,decision,reason) (rate(greptime_flow_batching_checkpoint_decision_count[$__rate_interval]))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]-[{{decision}}]-[{{reason}}]",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Flow Batching Checkpoint Decisions",
+ "type": "timeseries",
+ "pluginVersion": "11.6.0"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "description": "Flow batching mode query attempts by checkpoint mode.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green"
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "ops"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 16,
+ "y": 194
+ },
+ "id": 564,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "desc"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${metrics}"
+ },
+ "editorMode": "code",
+ "expr": "sum by(instance,pod,flow_id,mode) (rate(greptime_flow_batching_query_mode_count[$__rate_interval]))",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Flow Batching Query Mode",
+ "type": "timeseries",
+ "pluginVersion": "11.6.0"
}
],
"title": "Flownode",
diff --git a/grafana/dashboards/metrics/standalone/dashboard.md b/grafana/dashboards/metrics/standalone/dashboard.md
index a3a894cffa..2ed84c5921 100644
--- a/grafana/dashboards/metrics/standalone/dashboard.md
+++ b/grafana/dashboards/metrics/standalone/dashboard.md
@@ -36,7 +36,7 @@
| Scan and Compaction Memory Rejects | `sum(rate(greptime_mito_scan_requests_rejected_total[$__rate_interval]))`
`sum(rate(greptime_mito_scan_memory_exhausted_total[$__rate_interval]))`
`sum(rate(greptime_mito_compaction_memory_rejected_total[$__rate_interval]))` | `timeseries` | Datanode scan and compaction memory rejection/exhaustion counters. | `prometheus` | `rps` | `scan-rejected` |
| OpenDAL Errors | `sum by (scheme, operation, error) (rate(opendal_operation_errors_total{error!="NotFound"}[$__rate_interval]))` | `timeseries` | Object-store errors by scheme, operation, and error, excluding NotFound noise. | `prometheus` | `eps` | `{{scheme}}-{{operation}}-{{error}}` |
| Metasrv Failures | `sum(rate(greptime_meta_region_migration_fail[$__rate_interval]))`
`sum(rate(greptime_meta_reconciliation_procedure_error[$__rate_interval]))` | `timeseries` | Region migration and reconciliation failures in metasrv. | `prometheus` | `eps` | `migration-fail` |
-| Flow and Trigger Failures | `sum by (code) (rate(greptime_flow_errors[$__rate_interval]))`
`sum(rate(greptime_trigger_evaluate_failure_count[$__rate_interval]))`
`sum(rate(greptime_trigger_send_alert_failure_count[$__rate_interval]))`
`sum(rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval]))` | `timeseries` | Derived-data and alerting pipeline failures. | `prometheus` | `eps` | `flow-{{code}}` |
+| Flow and Trigger Failures | `sum by (code) (rate(greptime_flow_errors[$__rate_interval]))`
`sum(rate(greptime_trigger_evaluate_failure_count[$__rate_interval]))`
`sum(rate(greptime_trigger_send_alert_failure_count[$__rate_interval]))`
`sum(rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval]))`
`sum(rate(greptime_flow_batching_error_count[$__rate_interval]))` | `timeseries` | Derived-data and alerting pipeline failures. | `prometheus` | `eps` | `flow-{{code}}` |
| Mito GC Failures | `sum(rate(greptime_mito_gc_errors_total[$__rate_interval]))`
`sum(rate(greptime_mito_gc_orphaned_index_files[$__rate_interval]))`
`sum(rate(greptime_mito_gc_skipped_unparsable_files[$__rate_interval]))` | `timeseries` | Mito garbage-collection errors and skipped/orphaned files on datanodes. | `prometheus` | `short` | `gc-errors` |
# Capacity
| Title | Query | Type | Description | Datasource | Unit | Legend Format |
@@ -282,6 +282,13 @@ ORDER BY data_size DESC;` | `piechart` | Distribution of leader regions and data
| Flow Operation Latency | `histogram_quantile(0.95, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))`
`histogram_quantile(0.99, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))` | `timeseries` | Flow Operation Latency. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{type}}]-p95` |
| Flow Buffer Size per Instance | `greptime_flow_input_buf_size` | `timeseries` | Flow Buffer Size per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]` |
| Flow Processing Error per Instance | `sum by(instance,pod,code) (rate(greptime_flow_errors[$__rate_interval]))` | `timeseries` | Flow Processing Error per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{code}}]` |
+| Flow Batching Attempt / Error Rate | `sum by(instance,pod,flow_id) (rate(greptime_flow_batching_start_query_count[$__rate_interval]))`
`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_error_count[$__rate_interval]))`
`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_slow_query_secs_count[$__rate_interval]))` | `timeseries` | Flow batching mode task attempt, error, and slow-query rates by flow. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-attempt` |
+| Flow Batching Query Latency | `histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))`
`histogram_quantile(0.99, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))`
`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_count[$__rate_interval]))` | `timeseries` | Flow batching mode query latency by flow. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95` |
+| Flow Batching Query Window Count | `histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_bucket[$__rate_interval])))`
`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_count[$__rate_interval]))` | `timeseries` | Flow batching mode query window count by flow. | `prometheus` | `short` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95` |
+| Flow Batching Window Size | `histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_size_secs_bucket[$__rate_interval])))`
`histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_stalled_window_size_secs_bucket[$__rate_interval])))` | `timeseries` | Flow batching mode query and stalled window sizes by flow. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-query-p95` |
+| Flow Batching Bulk Dirty Windows | `sum by(instance,pod,flow_id) (greptime_flow_batching_engine_bulk_mark_time_window)` | `timeseries` | Flow batching mode dirty time-window count marked by bulk inserts. | `prometheus` | `short` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]` |
+| Flow Batching Checkpoint Decisions | `sum by(instance,pod,flow_id,mode,decision,reason) (rate(greptime_flow_batching_checkpoint_decision_count[$__rate_interval]))` | `timeseries` | Flow batching mode checkpoint state-machine decision rate. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]-[{{decision}}]-[{{reason}}]` |
+| Flow Batching Query Mode | `sum by(instance,pod,flow_id,mode) (rate(greptime_flow_batching_query_mode_count[$__rate_interval]))` | `timeseries` | Flow batching mode query attempts by checkpoint mode. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]` |
# Trigger
| Title | Query | Type | Description | Datasource | Unit | Legend Format |
| --- | --- | --- | --- | --- | --- | --- |
diff --git a/grafana/dashboards/metrics/standalone/dashboard.yaml b/grafana/dashboards/metrics/standalone/dashboard.yaml
index 0ebad934ba..8da2a8fa6f 100644
--- a/grafana/dashboards/metrics/standalone/dashboard.yaml
+++ b/grafana/dashboards/metrics/standalone/dashboard.yaml
@@ -535,6 +535,11 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: trigger-save-alert
+ - expr: sum(rate(greptime_flow_batching_error_count[$__rate_interval]))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: flow-batching
- title: Mito GC Failures
type: timeseries
description: Mito garbage-collection errors and skipped/orphaned files on datanodes.
@@ -2245,6 +2250,106 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[{{code}}]'
+ - title: Flow Batching Attempt / Error Rate
+ type: timeseries
+ description: Flow batching mode task attempt, error, and slow-query rates by flow.
+ unit: ops
+ queries:
+ - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_start_query_count[$__rate_interval]))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-attempt'
+ - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_error_count[$__rate_interval]))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-error'
+ - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_slow_query_secs_count[$__rate_interval]))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-slow'
+ - title: Flow Batching Query Latency
+ type: timeseries
+ description: Flow batching mode query latency by flow.
+ unit: s
+ queries:
+ - expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95'
+ - expr: histogram_quantile(0.99, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p99'
+ - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_count[$__rate_interval]))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg'
+ - title: Flow Batching Query Window Count
+ type: timeseries
+ description: Flow batching mode query window count by flow.
+ unit: short
+ queries:
+ - expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_bucket[$__rate_interval])))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95'
+ - expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_count[$__rate_interval]))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg'
+ - title: Flow Batching Window Size
+ type: timeseries
+ description: Flow batching mode query and stalled window sizes by flow.
+ unit: s
+ queries:
+ - expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_size_secs_bucket[$__rate_interval])))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-query-p95'
+ - expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_stalled_window_size_secs_bucket[$__rate_interval])))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-stalled-p95'
+ - title: Flow Batching Bulk Dirty Windows
+ type: timeseries
+ description: Flow batching mode dirty time-window count marked by bulk inserts.
+ unit: short
+ queries:
+ - expr: sum by(instance,pod,flow_id) (greptime_flow_batching_engine_bulk_mark_time_window)
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]'
+ - title: Flow Batching Checkpoint Decisions
+ type: timeseries
+ description: Flow batching mode checkpoint state-machine decision rate.
+ unit: ops
+ queries:
+ - expr: sum by(instance,pod,flow_id,mode,decision,reason) (rate(greptime_flow_batching_checkpoint_decision_count[$__rate_interval]))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]-[{{decision}}]-[{{reason}}]'
+ - title: Flow Batching Query Mode
+ type: timeseries
+ description: Flow batching mode query attempts by checkpoint mode.
+ unit: ops
+ queries:
+ - expr: sum by(instance,pod,flow_id,mode) (rate(greptime_flow_batching_query_mode_count[$__rate_interval]))
+ datasource:
+ type: prometheus
+ uid: ${metrics}
+ legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]'
- title: Trigger
panels:
- title: Trigger Count