feat: add flow batching metrics to grafana dashboard (#8353)

Signed-off-by: evenyag <realevenyag@gmail.com>
This commit is contained in:
Yingwen
2026-06-24 17:35:25 +08:00
committed by GitHub
parent 619460f742
commit 8513d8d6b6
6 changed files with 1780 additions and 2 deletions

View File

@@ -3636,6 +3636,19 @@
"legendFormat": "trigger-save-alert",
"range": true,
"refId": "D"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "sum(rate(greptime_flow_batching_error_count[$__rate_interval]))",
"hide": false,
"instant": false,
"legendFormat": "flow-batching",
"range": true,
"refId": "E"
}
],
"title": "Flow and Trigger Failures",
@@ -17027,6 +17040,770 @@
],
"title": "Flow Processing Error per Instance",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"description": "Flow batching mode task attempt, error, and slow-query rates by flow.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "ops"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 178
},
"id": 558,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_start_query_count[$__rate_interval]))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-attempt",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_error_count[$__rate_interval]))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-error",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_slow_query_secs_count[$__rate_interval]))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-slow",
"range": true,
"refId": "C"
}
],
"title": "Flow Batching Attempt / Error Rate",
"type": "timeseries",
"pluginVersion": "11.6.0"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"description": "Flow batching mode query latency by flow.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 186
},
"id": 559,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p99",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_count[$__rate_interval]))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg",
"range": true,
"refId": "C"
}
],
"title": "Flow Batching Query Latency",
"type": "timeseries",
"pluginVersion": "11.6.0"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"description": "Flow batching mode query window count by flow.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 186
},
"id": 560,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_bucket[$__rate_interval])))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_count[$__rate_interval]))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg",
"range": true,
"refId": "B"
}
],
"title": "Flow Batching Query Window Count",
"type": "timeseries",
"pluginVersion": "11.6.0"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"description": "Flow batching mode query and stalled window sizes by flow.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 186
},
"id": 561,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_size_secs_bucket[$__rate_interval])))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-query-p95",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_stalled_window_size_secs_bucket[$__rate_interval])))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-stalled-p95",
"range": true,
"refId": "B"
}
],
"title": "Flow Batching Window Size",
"type": "timeseries",
"pluginVersion": "11.6.0"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"description": "Flow batching mode dirty time-window count marked by bulk inserts.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 194
},
"id": 562,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "sum by(instance,pod,flow_id) (greptime_flow_batching_engine_bulk_mark_time_window)",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]",
"range": true,
"refId": "A"
}
],
"title": "Flow Batching Bulk Dirty Windows",
"type": "timeseries",
"pluginVersion": "11.6.0"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"description": "Flow batching mode checkpoint state-machine decision rate.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "ops"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 194
},
"id": 563,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "sum by(instance,pod,flow_id,mode,decision,reason) (rate(greptime_flow_batching_checkpoint_decision_count[$__rate_interval]))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]-[{{decision}}]-[{{reason}}]",
"range": true,
"refId": "A"
}
],
"title": "Flow Batching Checkpoint Decisions",
"type": "timeseries",
"pluginVersion": "11.6.0"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"description": "Flow batching mode query attempts by checkpoint mode.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "ops"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 194
},
"id": 564,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "sum by(instance,pod,flow_id,mode) (rate(greptime_flow_batching_query_mode_count[$__rate_interval]))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]",
"range": true,
"refId": "A"
}
],
"title": "Flow Batching Query Mode",
"type": "timeseries",
"pluginVersion": "11.6.0"
}
],
"title": "Flownode",

View File

@@ -36,7 +36,7 @@
| Scan and Compaction Memory Rejects | `sum(rate(greptime_mito_scan_requests_rejected_total{instance=~"$datanode"}[$__rate_interval]))`<br/>`sum(rate(greptime_mito_scan_memory_exhausted_total{instance=~"$datanode"}[$__rate_interval]))`<br/>`sum(rate(greptime_mito_compaction_memory_rejected_total{instance=~"$datanode"}[$__rate_interval]))` | `timeseries` | Datanode scan and compaction memory rejection/exhaustion counters. | `prometheus` | `rps` | `scan-rejected` |
| OpenDAL Errors | `sum by (scheme, operation, error) (rate(opendal_operation_errors_total{instance=~"$datanode",error!="NotFound"}[$__rate_interval]))` | `timeseries` | Object-store errors by scheme, operation, and error, excluding NotFound noise. | `prometheus` | `eps` | `{{scheme}}-{{operation}}-{{error}}` |
| Metasrv Failures | `sum(rate(greptime_meta_region_migration_fail[$__rate_interval]))`<br/>`sum(rate(greptime_meta_reconciliation_procedure_error[$__rate_interval]))` | `timeseries` | Region migration and reconciliation failures in metasrv. | `prometheus` | `eps` | `migration-fail` |
| Flow and Trigger Failures | `sum by (code) (rate(greptime_flow_errors[$__rate_interval]))`<br/>`sum(rate(greptime_trigger_evaluate_failure_count[$__rate_interval]))`<br/>`sum(rate(greptime_trigger_send_alert_failure_count[$__rate_interval]))`<br/>`sum(rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval]))` | `timeseries` | Derived-data and alerting pipeline failures. | `prometheus` | `eps` | `flow-{{code}}` |
| Flow and Trigger Failures | `sum by (code) (rate(greptime_flow_errors[$__rate_interval]))`<br/>`sum(rate(greptime_trigger_evaluate_failure_count[$__rate_interval]))`<br/>`sum(rate(greptime_trigger_send_alert_failure_count[$__rate_interval]))`<br/>`sum(rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval]))`<br/>`sum(rate(greptime_flow_batching_error_count[$__rate_interval]))` | `timeseries` | Derived-data and alerting pipeline failures. | `prometheus` | `eps` | `flow-{{code}}` |
| Mito GC Failures | `sum(rate(greptime_mito_gc_errors_total{instance=~"$datanode"}[$__rate_interval]))`<br/>`sum(rate(greptime_mito_gc_orphaned_index_files{instance=~"$datanode"}[$__rate_interval]))`<br/>`sum(rate(greptime_mito_gc_skipped_unparsable_files{instance=~"$datanode"}[$__rate_interval]))` | `timeseries` | Mito garbage-collection errors and skipped/orphaned files on datanodes. | `prometheus` | `short` | `gc-errors` |
# Capacity
| Title | Query | Type | Description | Datasource | Unit | Legend Format |
@@ -282,6 +282,13 @@ ORDER BY data_size DESC;` | `piechart` | Distribution of leader regions and data
| Flow Operation Latency | `histogram_quantile(0.95, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))`<br/>`histogram_quantile(0.99, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))` | `timeseries` | Flow Operation Latency. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{type}}]-p95` |
| Flow Buffer Size per Instance | `greptime_flow_input_buf_size` | `timeseries` | Flow Buffer Size per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]` |
| Flow Processing Error per Instance | `sum by(instance,pod,code) (rate(greptime_flow_errors[$__rate_interval]))` | `timeseries` | Flow Processing Error per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{code}}]` |
| Flow Batching Attempt / Error Rate | `sum by(instance,pod,flow_id) (rate(greptime_flow_batching_start_query_count[$__rate_interval]))`<br/>`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_error_count[$__rate_interval]))`<br/>`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_slow_query_secs_count[$__rate_interval]))` | `timeseries` | Flow batching mode task attempt, error, and slow-query rates by flow. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-attempt` |
| Flow Batching Query Latency | `histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))`<br/>`histogram_quantile(0.99, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))`<br/>`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_count[$__rate_interval]))` | `timeseries` | Flow batching mode query latency by flow. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95` |
| Flow Batching Query Window Count | `histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_bucket[$__rate_interval])))`<br/>`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_count[$__rate_interval]))` | `timeseries` | Flow batching mode query window count by flow. | `prometheus` | `short` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95` |
| Flow Batching Window Size | `histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_size_secs_bucket[$__rate_interval])))`<br/>`histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_stalled_window_size_secs_bucket[$__rate_interval])))` | `timeseries` | Flow batching mode query and stalled window sizes by flow. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-query-p95` |
| Flow Batching Bulk Dirty Windows | `sum by(instance,pod,flow_id) (greptime_flow_batching_engine_bulk_mark_time_window)` | `timeseries` | Flow batching mode dirty time-window count marked by bulk inserts. | `prometheus` | `short` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]` |
| Flow Batching Checkpoint Decisions | `sum by(instance,pod,flow_id,mode,decision,reason) (rate(greptime_flow_batching_checkpoint_decision_count[$__rate_interval]))` | `timeseries` | Flow batching mode checkpoint state-machine decision rate. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]-[{{decision}}]-[{{reason}}]` |
| Flow Batching Query Mode | `sum by(instance,pod,flow_id,mode) (rate(greptime_flow_batching_query_mode_count[$__rate_interval]))` | `timeseries` | Flow batching mode query attempts by checkpoint mode. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]` |
# Trigger
| Title | Query | Type | Description | Datasource | Unit | Legend Format |
| --- | --- | --- | --- | --- | --- | --- |

View File

@@ -535,6 +535,11 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: trigger-save-alert
- expr: sum(rate(greptime_flow_batching_error_count[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: flow-batching
- title: Mito GC Failures
type: timeseries
description: Mito garbage-collection errors and skipped/orphaned files on datanodes.
@@ -2245,6 +2250,106 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[{{code}}]'
- title: Flow Batching Attempt / Error Rate
type: timeseries
description: Flow batching mode task attempt, error, and slow-query rates by flow.
unit: ops
queries:
- expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_start_query_count[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-attempt'
- expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_error_count[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-error'
- expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_slow_query_secs_count[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-slow'
- title: Flow Batching Query Latency
type: timeseries
description: Flow batching mode query latency by flow.
unit: s
queries:
- expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95'
- expr: histogram_quantile(0.99, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p99'
- expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_count[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg'
- title: Flow Batching Query Window Count
type: timeseries
description: Flow batching mode query window count by flow.
unit: short
queries:
- expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_bucket[$__rate_interval])))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95'
- expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_count[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg'
- title: Flow Batching Window Size
type: timeseries
description: Flow batching mode query and stalled window sizes by flow.
unit: s
queries:
- expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_size_secs_bucket[$__rate_interval])))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-query-p95'
- expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_stalled_window_size_secs_bucket[$__rate_interval])))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-stalled-p95'
- title: Flow Batching Bulk Dirty Windows
type: timeseries
description: Flow batching mode dirty time-window count marked by bulk inserts.
unit: short
queries:
- expr: sum by(instance,pod,flow_id) (greptime_flow_batching_engine_bulk_mark_time_window)
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]'
- title: Flow Batching Checkpoint Decisions
type: timeseries
description: Flow batching mode checkpoint state-machine decision rate.
unit: ops
queries:
- expr: sum by(instance,pod,flow_id,mode,decision,reason) (rate(greptime_flow_batching_checkpoint_decision_count[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]-[{{decision}}]-[{{reason}}]'
- title: Flow Batching Query Mode
type: timeseries
description: Flow batching mode query attempts by checkpoint mode.
unit: ops
queries:
- expr: sum by(instance,pod,flow_id,mode) (rate(greptime_flow_batching_query_mode_count[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]'
- title: Trigger
panels:
- title: Trigger Count

View File

@@ -3636,6 +3636,19 @@
"legendFormat": "trigger-save-alert",
"range": true,
"refId": "D"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "sum(rate(greptime_flow_batching_error_count[$__rate_interval]))",
"hide": false,
"instant": false,
"legendFormat": "flow-batching",
"range": true,
"refId": "E"
}
],
"title": "Flow and Trigger Failures",
@@ -17027,6 +17040,770 @@
],
"title": "Flow Processing Error per Instance",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"description": "Flow batching mode task attempt, error, and slow-query rates by flow.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "ops"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 178
},
"id": 558,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_start_query_count[$__rate_interval]))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-attempt",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_error_count[$__rate_interval]))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-error",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_slow_query_secs_count[$__rate_interval]))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-slow",
"range": true,
"refId": "C"
}
],
"title": "Flow Batching Attempt / Error Rate",
"type": "timeseries",
"pluginVersion": "11.6.0"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"description": "Flow batching mode query latency by flow.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 186
},
"id": 559,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p99",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_count[$__rate_interval]))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg",
"range": true,
"refId": "C"
}
],
"title": "Flow Batching Query Latency",
"type": "timeseries",
"pluginVersion": "11.6.0"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"description": "Flow batching mode query window count by flow.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 186
},
"id": 560,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_bucket[$__rate_interval])))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_count[$__rate_interval]))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg",
"range": true,
"refId": "B"
}
],
"title": "Flow Batching Query Window Count",
"type": "timeseries",
"pluginVersion": "11.6.0"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"description": "Flow batching mode query and stalled window sizes by flow.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 186
},
"id": 561,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_size_secs_bucket[$__rate_interval])))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-query-p95",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_stalled_window_size_secs_bucket[$__rate_interval])))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-stalled-p95",
"range": true,
"refId": "B"
}
],
"title": "Flow Batching Window Size",
"type": "timeseries",
"pluginVersion": "11.6.0"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"description": "Flow batching mode dirty time-window count marked by bulk inserts.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 194
},
"id": 562,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "sum by(instance,pod,flow_id) (greptime_flow_batching_engine_bulk_mark_time_window)",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]",
"range": true,
"refId": "A"
}
],
"title": "Flow Batching Bulk Dirty Windows",
"type": "timeseries",
"pluginVersion": "11.6.0"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"description": "Flow batching mode checkpoint state-machine decision rate.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "ops"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 194
},
"id": 563,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "sum by(instance,pod,flow_id,mode,decision,reason) (rate(greptime_flow_batching_checkpoint_decision_count[$__rate_interval]))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]-[{{decision}}]-[{{reason}}]",
"range": true,
"refId": "A"
}
],
"title": "Flow Batching Checkpoint Decisions",
"type": "timeseries",
"pluginVersion": "11.6.0"
},
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"description": "Flow batching mode query attempts by checkpoint mode.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "ops"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 194
},
"id": 564,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${metrics}"
},
"editorMode": "code",
"expr": "sum by(instance,pod,flow_id,mode) (rate(greptime_flow_batching_query_mode_count[$__rate_interval]))",
"hide": false,
"instant": false,
"legendFormat": "[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]",
"range": true,
"refId": "A"
}
],
"title": "Flow Batching Query Mode",
"type": "timeseries",
"pluginVersion": "11.6.0"
}
],
"title": "Flownode",

View File

@@ -36,7 +36,7 @@
| Scan and Compaction Memory Rejects | `sum(rate(greptime_mito_scan_requests_rejected_total[$__rate_interval]))`<br/>`sum(rate(greptime_mito_scan_memory_exhausted_total[$__rate_interval]))`<br/>`sum(rate(greptime_mito_compaction_memory_rejected_total[$__rate_interval]))` | `timeseries` | Datanode scan and compaction memory rejection/exhaustion counters. | `prometheus` | `rps` | `scan-rejected` |
| OpenDAL Errors | `sum by (scheme, operation, error) (rate(opendal_operation_errors_total{error!="NotFound"}[$__rate_interval]))` | `timeseries` | Object-store errors by scheme, operation, and error, excluding NotFound noise. | `prometheus` | `eps` | `{{scheme}}-{{operation}}-{{error}}` |
| Metasrv Failures | `sum(rate(greptime_meta_region_migration_fail[$__rate_interval]))`<br/>`sum(rate(greptime_meta_reconciliation_procedure_error[$__rate_interval]))` | `timeseries` | Region migration and reconciliation failures in metasrv. | `prometheus` | `eps` | `migration-fail` |
| Flow and Trigger Failures | `sum by (code) (rate(greptime_flow_errors[$__rate_interval]))`<br/>`sum(rate(greptime_trigger_evaluate_failure_count[$__rate_interval]))`<br/>`sum(rate(greptime_trigger_send_alert_failure_count[$__rate_interval]))`<br/>`sum(rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval]))` | `timeseries` | Derived-data and alerting pipeline failures. | `prometheus` | `eps` | `flow-{{code}}` |
| Flow and Trigger Failures | `sum by (code) (rate(greptime_flow_errors[$__rate_interval]))`<br/>`sum(rate(greptime_trigger_evaluate_failure_count[$__rate_interval]))`<br/>`sum(rate(greptime_trigger_send_alert_failure_count[$__rate_interval]))`<br/>`sum(rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval]))`<br/>`sum(rate(greptime_flow_batching_error_count[$__rate_interval]))` | `timeseries` | Derived-data and alerting pipeline failures. | `prometheus` | `eps` | `flow-{{code}}` |
| Mito GC Failures | `sum(rate(greptime_mito_gc_errors_total[$__rate_interval]))`<br/>`sum(rate(greptime_mito_gc_orphaned_index_files[$__rate_interval]))`<br/>`sum(rate(greptime_mito_gc_skipped_unparsable_files[$__rate_interval]))` | `timeseries` | Mito garbage-collection errors and skipped/orphaned files on datanodes. | `prometheus` | `short` | `gc-errors` |
# Capacity
| Title | Query | Type | Description | Datasource | Unit | Legend Format |
@@ -282,6 +282,13 @@ ORDER BY data_size DESC;` | `piechart` | Distribution of leader regions and data
| Flow Operation Latency | `histogram_quantile(0.95, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))`<br/>`histogram_quantile(0.99, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))` | `timeseries` | Flow Operation Latency. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{type}}]-p95` |
| Flow Buffer Size per Instance | `greptime_flow_input_buf_size` | `timeseries` | Flow Buffer Size per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]` |
| Flow Processing Error per Instance | `sum by(instance,pod,code) (rate(greptime_flow_errors[$__rate_interval]))` | `timeseries` | Flow Processing Error per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{code}}]` |
| Flow Batching Attempt / Error Rate | `sum by(instance,pod,flow_id) (rate(greptime_flow_batching_start_query_count[$__rate_interval]))`<br/>`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_error_count[$__rate_interval]))`<br/>`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_slow_query_secs_count[$__rate_interval]))` | `timeseries` | Flow batching mode task attempt, error, and slow-query rates by flow. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-attempt` |
| Flow Batching Query Latency | `histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))`<br/>`histogram_quantile(0.99, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))`<br/>`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_count[$__rate_interval]))` | `timeseries` | Flow batching mode query latency by flow. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95` |
| Flow Batching Query Window Count | `histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_bucket[$__rate_interval])))`<br/>`sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_count[$__rate_interval]))` | `timeseries` | Flow batching mode query window count by flow. | `prometheus` | `short` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95` |
| Flow Batching Window Size | `histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_size_secs_bucket[$__rate_interval])))`<br/>`histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_stalled_window_size_secs_bucket[$__rate_interval])))` | `timeseries` | Flow batching mode query and stalled window sizes by flow. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-query-p95` |
| Flow Batching Bulk Dirty Windows | `sum by(instance,pod,flow_id) (greptime_flow_batching_engine_bulk_mark_time_window)` | `timeseries` | Flow batching mode dirty time-window count marked by bulk inserts. | `prometheus` | `short` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]` |
| Flow Batching Checkpoint Decisions | `sum by(instance,pod,flow_id,mode,decision,reason) (rate(greptime_flow_batching_checkpoint_decision_count[$__rate_interval]))` | `timeseries` | Flow batching mode checkpoint state-machine decision rate. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]-[{{decision}}]-[{{reason}}]` |
| Flow Batching Query Mode | `sum by(instance,pod,flow_id,mode) (rate(greptime_flow_batching_query_mode_count[$__rate_interval]))` | `timeseries` | Flow batching mode query attempts by checkpoint mode. | `prometheus` | `ops` | `[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]` |
# Trigger
| Title | Query | Type | Description | Datasource | Unit | Legend Format |
| --- | --- | --- | --- | --- | --- | --- |

View File

@@ -535,6 +535,11 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: trigger-save-alert
- expr: sum(rate(greptime_flow_batching_error_count[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: flow-batching
- title: Mito GC Failures
type: timeseries
description: Mito garbage-collection errors and skipped/orphaned files on datanodes.
@@ -2245,6 +2250,106 @@ groups:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[{{code}}]'
- title: Flow Batching Attempt / Error Rate
type: timeseries
description: Flow batching mode task attempt, error, and slow-query rates by flow.
unit: ops
queries:
- expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_start_query_count[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-attempt'
- expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_error_count[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-error'
- expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_slow_query_secs_count[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-slow'
- title: Flow Batching Query Latency
type: timeseries
description: Flow batching mode query latency by flow.
unit: s
queries:
- expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95'
- expr: histogram_quantile(0.99, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_bucket[$__rate_interval])))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p99'
- expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_time_secs_count[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg'
- title: Flow Batching Query Window Count
type: timeseries
description: Flow batching mode query window count by flow.
unit: short
queries:
- expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_bucket[$__rate_interval])))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-p95'
- expr: sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_sum[$__rate_interval])) / sum by(instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_cnt_count[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-avg'
- title: Flow Batching Window Size
type: timeseries
description: Flow batching mode query and stalled window sizes by flow.
unit: s
queries:
- expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_query_window_size_secs_bucket[$__rate_interval])))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-query-p95'
- expr: histogram_quantile(0.95, sum by(le,instance,pod,flow_id) (rate(greptime_flow_batching_engine_stalled_window_size_secs_bucket[$__rate_interval])))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-stalled-p95'
- title: Flow Batching Bulk Dirty Windows
type: timeseries
description: Flow batching mode dirty time-window count marked by bulk inserts.
unit: short
queries:
- expr: sum by(instance,pod,flow_id) (greptime_flow_batching_engine_bulk_mark_time_window)
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]'
- title: Flow Batching Checkpoint Decisions
type: timeseries
description: Flow batching mode checkpoint state-machine decision rate.
unit: ops
queries:
- expr: sum by(instance,pod,flow_id,mode,decision,reason) (rate(greptime_flow_batching_checkpoint_decision_count[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]-[{{decision}}]-[{{reason}}]'
- title: Flow Batching Query Mode
type: timeseries
description: Flow batching mode query attempts by checkpoint mode.
unit: ops
queries:
- expr: sum by(instance,pod,flow_id,mode) (rate(greptime_flow_batching_query_mode_count[$__rate_interval]))
datasource:
type: prometheus
uid: ${metrics}
legendFormat: '[{{instance}}]-[{{pod}}]-[flow={{flow_id}}]-[{{mode}}]'
- title: Trigger
panels:
- title: Trigger Count