diff --git a/grafana/dashboards/metrics/cluster/dashboard.json b/grafana/dashboards/metrics/cluster/dashboard.json index 79c037dc12..918eb3c4e8 100644 --- a/grafana/dashboards/metrics/cluster/dashboard.json +++ b/grafana/dashboards/metrics/cluster/dashboard.json @@ -8863,7 +8863,7 @@ "type": "prometheus", "uid": "${metrics}" }, - "description": "Elapsed of Reconciliation steps ", + "description": "Elapsed of Reconciliation steps", "fieldConfig": { "defaults": { "color": { @@ -9366,7 +9366,7 @@ "editorMode": "code", "expr": "greptime_flow_input_buf_size", "instant": false, - "legendFormat": "[{{instance}}]-[{{pod}]", + "legendFormat": "[{{instance}}]-[{{pod}}]", "range": true, "refId": "A" } @@ -9472,6 +9472,755 @@ ], "title": "Flownode", "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 187 + }, + "id": 357, + "panels": [], + "title": "Trigger", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bf9fzta69bhtsa" + }, + "description": "Total number of triggers currently defined.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 188 + }, + "id": 358, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "greptime_trigger_count{}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Trigger Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Elapsed time for trigger evaluation, including query execution and condition evaluation.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 196 + }, + "id": 359, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, \n rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])\n)", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-p99", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.75, \n rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])\n)", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-p75", + "range": true, + "refId": "B" + } + ], + "title": "Trigger Eval Elapsed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Rate of failed trigger evaluations.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 196 + }, + "id": 360, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "rate(greptime_trigger_evaluate_failure_count[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Trigger Eval Failure Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Elapsed time to send trigger alerts to notification channels.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 204 + }, + "id": 361, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, \n rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])\n)", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p99", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.75, \n rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])\n)", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p75", + "range": true, + "refId": "B" + } + ], + "title": "Send Alert Elapsed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Rate of failures when sending trigger alerts.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 204 + }, + "id": 364, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "rate(greptime_trigger_send_alert_failure_count[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Send Alert Failure Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Elapsed time to persist trigger alert records.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 212 + }, + "id": 363, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, \n rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])\n)", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p99", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.75, \n rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])\n)", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p75", + "range": true, + "refId": "B" + } + ], + "title": "Save Alert Elapsed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Rate of failures when persisting trigger alert records.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 212 + }, + "id": 362, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Save Alert Failure Rate", + "type": "timeseries" } ], "preload": false, @@ -9613,4 +10362,4 @@ "title": "GreptimeDB", "uid": "dejf3k5e7g2kgb", "version": 15 -} \ No newline at end of file +} diff --git a/grafana/dashboards/metrics/cluster/dashboard.md b/grafana/dashboards/metrics/cluster/dashboard.md index e57276779d..c0bc49770b 100644 --- a/grafana/dashboards/metrics/cluster/dashboard.md +++ b/grafana/dashboards/metrics/cluster/dashboard.md @@ -111,12 +111,34 @@ | Rate of meta KV Ops | `rate(greptime_meta_kv_request_elapsed_count[$__rate_interval])` | `timeseries` | Gauge of load information of each datanode, collected via heartbeat between datanode and metasrv. This information is for metasrv to schedule workloads. | `prometheus` | `none` | `{{pod}}-{{op}} p99` | | DDL Latency | `histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_tables_bucket))`
`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_table))`
`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_view))`
`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_flow))`
`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_drop_table))`
`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_alter_table))` | `timeseries` | Gauge of load information of each datanode, collected via heartbeat between datanode and metasrv. This information is for metasrv to schedule workloads. | `prometheus` | `s` | `CreateLogicalTables-{{step}} p90` | | Reconciliation stats | `greptime_meta_reconciliation_stats` | `timeseries` | Reconciliation stats | `prometheus` | `s` | `{{pod}}-{{table_type}}-{{type}}` | -| Reconciliation steps | `histogram_quantile(0.9, greptime_meta_reconciliation_procedure_bucket)` | `timeseries` | Elapsed of Reconciliation steps | `prometheus` | `s` | `{{procedure_name}}-{{step}}-P90` | +| Reconciliation steps | `histogram_quantile(0.9, greptime_meta_reconciliation_procedure_bucket)` | `timeseries` | Elapsed of Reconciliation steps | `prometheus` | `s` | `{{procedure_name}}-{{step}}-P90` | # Flownode | Title | Query | Type | Description | Datasource | Unit | Legend Format | | --- | --- | --- | --- | --- | --- | --- | | Flow Ingest / Output Rate | `sum by(instance, pod, direction) (rate(greptime_flow_processed_rows[$__rate_interval]))` | `timeseries` | Flow Ingest / Output Rate. | `prometheus` | -- | `[{{pod}}]-[{{instance}}]-[{{direction}}]` | | Flow Ingest Latency | `histogram_quantile(0.95, sum(rate(greptime_flow_insert_elapsed_bucket[$__rate_interval])) by (le, instance, pod))`
`histogram_quantile(0.99, sum(rate(greptime_flow_insert_elapsed_bucket[$__rate_interval])) by (le, instance, pod))` | `timeseries` | Flow Ingest Latency. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-p95` | | Flow Operation Latency | `histogram_quantile(0.95, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))`
`histogram_quantile(0.99, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))` | `timeseries` | Flow Operation Latency. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{type}}]-p95` | -| Flow Buffer Size per Instance | `greptime_flow_input_buf_size` | `timeseries` | Flow Buffer Size per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}]` | +| Flow Buffer Size per Instance | `greptime_flow_input_buf_size` | `timeseries` | Flow Buffer Size per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]` | | Flow Processing Error per Instance | `sum by(instance,pod,code) (rate(greptime_flow_errors[$__rate_interval]))` | `timeseries` | Flow Processing Error per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{code}}]` | +# Trigger +| Title | Query | Type | Description | Datasource | Unit | Legend Format | +| --- | --- | --- | --- | --- | --- | --- | +| Trigger Count | `greptime_trigger_count{}` | `timeseries` | Total number of triggers currently defined. | `prometheus` | -- | `__auto` | +| Trigger Eval Elapsed | `histogram_quantile(0.99, + rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval]) +)`
`histogram_quantile(0.75, + rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval]) +)` | `timeseries` | Elapsed time for trigger evaluation, including query execution and condition evaluation. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-p99` | +| Trigger Eval Failure Rate | `rate(greptime_trigger_evaluate_failure_count[$__rate_interval])` | `timeseries` | Rate of failed trigger evaluations. | `prometheus` | `none` | `__auto` | +| Send Alert Elapsed | `histogram_quantile(0.99, + rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval]) +)`
`histogram_quantile(0.75, + rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval]) +)` | `timeseries` | Elapsed time to send trigger alerts to notification channels. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p99` | +| Send Alert Failure Rate | `rate(greptime_trigger_send_alert_failure_count[$__rate_interval])` | `timeseries` | Rate of failures when sending trigger alerts. | `prometheus` | `none` | `__auto` | +| Save Alert Elapsed | `histogram_quantile(0.99, + rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval]) +)`
`histogram_quantile(0.75, + rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval]) +)` | `timeseries` | Elapsed time to persist trigger alert records. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p99` | +| Save Alert Failure Rate | `rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval])` | `timeseries` | Rate of failures when persisting trigger alert records. | `prometheus` | `none` | `__auto` | diff --git a/grafana/dashboards/metrics/cluster/dashboard.yaml b/grafana/dashboards/metrics/cluster/dashboard.yaml index 0173ef456b..934cf58c0c 100644 --- a/grafana/dashboards/metrics/cluster/dashboard.yaml +++ b/grafana/dashboards/metrics/cluster/dashboard.yaml @@ -1002,7 +1002,7 @@ groups: legendFormat: '{{pod}}-{{table_type}}-{{type}}' - title: Reconciliation steps type: timeseries - description: 'Elapsed of Reconciliation steps ' + description: Elapsed of Reconciliation steps unit: s queries: - expr: histogram_quantile(0.9, greptime_meta_reconciliation_procedure_bucket) @@ -1057,7 +1057,7 @@ groups: datasource: type: prometheus uid: ${metrics} - legendFormat: '[{{instance}}]-[{{pod}]' + legendFormat: '[{{instance}}]-[{{pod}}]' - title: Flow Processing Error per Instance type: timeseries description: Flow Processing Error per Instance. @@ -1067,3 +1067,89 @@ groups: type: prometheus uid: ${metrics} legendFormat: '[{{instance}}]-[{{pod}}]-[{{code}}]' + - title: Trigger + panels: + - title: Trigger Count + type: timeseries + description: Total number of triggers currently defined. + queries: + - expr: greptime_trigger_count{} + datasource: + type: prometheus + uid: ${metrics} + legendFormat: __auto + - title: Trigger Eval Elapsed + type: timeseries + description: Elapsed time for trigger evaluation, including query execution and condition evaluation. + unit: s + queries: + - expr: "histogram_quantile(0.99, \n rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])\n)" + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-p99' + - expr: "histogram_quantile(0.75, \n rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])\n)" + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-p75' + - title: Trigger Eval Failure Rate + type: timeseries + description: Rate of failed trigger evaluations. + unit: none + queries: + - expr: rate(greptime_trigger_evaluate_failure_count[$__rate_interval]) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: __auto + - title: Send Alert Elapsed + type: timeseries + description: Elapsed time to send trigger alerts to notification channels. + unit: s + queries: + - expr: "histogram_quantile(0.99, \n rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])\n)" + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p99' + - expr: "histogram_quantile(0.75, \n rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])\n)" + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p75' + - title: Send Alert Failure Rate + type: timeseries + description: Rate of failures when sending trigger alerts. + unit: none + queries: + - expr: rate(greptime_trigger_send_alert_failure_count[$__rate_interval]) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: __auto + - title: Save Alert Elapsed + type: timeseries + description: Elapsed time to persist trigger alert records. + unit: s + queries: + - expr: "histogram_quantile(0.99, \n rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])\n)" + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p99' + - expr: "histogram_quantile(0.75, \n rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])\n)" + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p75' + - title: Save Alert Failure Rate + type: timeseries + description: Rate of failures when persisting trigger alert records. + unit: none + queries: + - expr: rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval]) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: __auto diff --git a/grafana/dashboards/metrics/standalone/dashboard.json b/grafana/dashboards/metrics/standalone/dashboard.json index a88f784101..28f38f9dd7 100644 --- a/grafana/dashboards/metrics/standalone/dashboard.json +++ b/grafana/dashboards/metrics/standalone/dashboard.json @@ -8863,7 +8863,7 @@ "type": "prometheus", "uid": "${metrics}" }, - "description": "Elapsed of Reconciliation steps ", + "description": "Elapsed of Reconciliation steps", "fieldConfig": { "defaults": { "color": { @@ -9366,7 +9366,7 @@ "editorMode": "code", "expr": "greptime_flow_input_buf_size", "instant": false, - "legendFormat": "[{{instance}}]-[{{pod}]", + "legendFormat": "[{{instance}}]-[{{pod}}]", "range": true, "refId": "A" } @@ -9472,6 +9472,755 @@ ], "title": "Flownode", "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 187 + }, + "id": 357, + "panels": [], + "title": "Trigger", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "bf9fzta69bhtsa" + }, + "description": "Total number of triggers currently defined.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 188 + }, + "id": 358, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "greptime_trigger_count{}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Trigger Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Elapsed time for trigger evaluation, including query execution and condition evaluation.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 196 + }, + "id": 359, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, \n rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])\n)", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-p99", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.75, \n rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])\n)", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-p75", + "range": true, + "refId": "B" + } + ], + "title": "Trigger Eval Elapsed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Rate of failed trigger evaluations.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 196 + }, + "id": 360, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "rate(greptime_trigger_evaluate_failure_count[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Trigger Eval Failure Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Elapsed time to send trigger alerts to notification channels.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 204 + }, + "id": 361, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, \n rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])\n)", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p99", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.75, \n rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])\n)", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p75", + "range": true, + "refId": "B" + } + ], + "title": "Send Alert Elapsed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Rate of failures when sending trigger alerts.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 204 + }, + "id": 364, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "rate(greptime_trigger_send_alert_failure_count[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Send Alert Failure Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Elapsed time to persist trigger alert records.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 212 + }, + "id": 363, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, \n rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])\n)", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p99", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.75, \n rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])\n)", + "hide": false, + "instant": false, + "legendFormat": "[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p75", + "range": true, + "refId": "B" + } + ], + "title": "Save Alert Elapsed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "description": "Rate of failures when persisting trigger alert records.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 212 + }, + "id": 362, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${metrics}" + }, + "editorMode": "code", + "expr": "rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Save Alert Failure Rate", + "type": "timeseries" } ], "preload": false, @@ -9613,4 +10362,4 @@ "title": "GreptimeDB", "uid": "dejf3k5e7g2kgb", "version": 15 -} \ No newline at end of file +} diff --git a/grafana/dashboards/metrics/standalone/dashboard.md b/grafana/dashboards/metrics/standalone/dashboard.md index 09a62681db..63b45208c5 100644 --- a/grafana/dashboards/metrics/standalone/dashboard.md +++ b/grafana/dashboards/metrics/standalone/dashboard.md @@ -111,12 +111,34 @@ | Rate of meta KV Ops | `rate(greptime_meta_kv_request_elapsed_count[$__rate_interval])` | `timeseries` | Gauge of load information of each datanode, collected via heartbeat between datanode and metasrv. This information is for metasrv to schedule workloads. | `prometheus` | `none` | `{{pod}}-{{op}} p99` | | DDL Latency | `histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_tables_bucket))`
`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_table))`
`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_view))`
`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_flow))`
`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_drop_table))`
`histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_alter_table))` | `timeseries` | Gauge of load information of each datanode, collected via heartbeat between datanode and metasrv. This information is for metasrv to schedule workloads. | `prometheus` | `s` | `CreateLogicalTables-{{step}} p90` | | Reconciliation stats | `greptime_meta_reconciliation_stats` | `timeseries` | Reconciliation stats | `prometheus` | `s` | `{{pod}}-{{table_type}}-{{type}}` | -| Reconciliation steps | `histogram_quantile(0.9, greptime_meta_reconciliation_procedure_bucket)` | `timeseries` | Elapsed of Reconciliation steps | `prometheus` | `s` | `{{procedure_name}}-{{step}}-P90` | +| Reconciliation steps | `histogram_quantile(0.9, greptime_meta_reconciliation_procedure_bucket)` | `timeseries` | Elapsed of Reconciliation steps | `prometheus` | `s` | `{{procedure_name}}-{{step}}-P90` | # Flownode | Title | Query | Type | Description | Datasource | Unit | Legend Format | | --- | --- | --- | --- | --- | --- | --- | | Flow Ingest / Output Rate | `sum by(instance, pod, direction) (rate(greptime_flow_processed_rows[$__rate_interval]))` | `timeseries` | Flow Ingest / Output Rate. | `prometheus` | -- | `[{{pod}}]-[{{instance}}]-[{{direction}}]` | | Flow Ingest Latency | `histogram_quantile(0.95, sum(rate(greptime_flow_insert_elapsed_bucket[$__rate_interval])) by (le, instance, pod))`
`histogram_quantile(0.99, sum(rate(greptime_flow_insert_elapsed_bucket[$__rate_interval])) by (le, instance, pod))` | `timeseries` | Flow Ingest Latency. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-p95` | | Flow Operation Latency | `histogram_quantile(0.95, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))`
`histogram_quantile(0.99, sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))` | `timeseries` | Flow Operation Latency. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{type}}]-p95` | -| Flow Buffer Size per Instance | `greptime_flow_input_buf_size` | `timeseries` | Flow Buffer Size per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}]` | +| Flow Buffer Size per Instance | `greptime_flow_input_buf_size` | `timeseries` | Flow Buffer Size per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]` | | Flow Processing Error per Instance | `sum by(instance,pod,code) (rate(greptime_flow_errors[$__rate_interval]))` | `timeseries` | Flow Processing Error per Instance. | `prometheus` | -- | `[{{instance}}]-[{{pod}}]-[{{code}}]` | +# Trigger +| Title | Query | Type | Description | Datasource | Unit | Legend Format | +| --- | --- | --- | --- | --- | --- | --- | +| Trigger Count | `greptime_trigger_count{}` | `timeseries` | Total number of triggers currently defined. | `prometheus` | -- | `__auto` | +| Trigger Eval Elapsed | `histogram_quantile(0.99, + rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval]) +)`
`histogram_quantile(0.75, + rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval]) +)` | `timeseries` | Elapsed time for trigger evaluation, including query execution and condition evaluation. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-p99` | +| Trigger Eval Failure Rate | `rate(greptime_trigger_evaluate_failure_count[$__rate_interval])` | `timeseries` | Rate of failed trigger evaluations. | `prometheus` | `none` | `__auto` | +| Send Alert Elapsed | `histogram_quantile(0.99, + rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval]) +)`
`histogram_quantile(0.75, + rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval]) +)` | `timeseries` | Elapsed time to send trigger alerts to notification channels. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p99` | +| Send Alert Failure Rate | `rate(greptime_trigger_send_alert_failure_count[$__rate_interval])` | `timeseries` | Rate of failures when sending trigger alerts. | `prometheus` | `none` | `__auto` | +| Save Alert Elapsed | `histogram_quantile(0.99, + rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval]) +)`
`histogram_quantile(0.75, + rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval]) +)` | `timeseries` | Elapsed time to persist trigger alert records. | `prometheus` | `s` | `[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p99` | +| Save Alert Failure Rate | `rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval])` | `timeseries` | Rate of failures when persisting trigger alert records. | `prometheus` | `none` | `__auto` | diff --git a/grafana/dashboards/metrics/standalone/dashboard.yaml b/grafana/dashboards/metrics/standalone/dashboard.yaml index f3013589cd..32f771f66f 100644 --- a/grafana/dashboards/metrics/standalone/dashboard.yaml +++ b/grafana/dashboards/metrics/standalone/dashboard.yaml @@ -1002,7 +1002,7 @@ groups: legendFormat: '{{pod}}-{{table_type}}-{{type}}' - title: Reconciliation steps type: timeseries - description: 'Elapsed of Reconciliation steps ' + description: Elapsed of Reconciliation steps unit: s queries: - expr: histogram_quantile(0.9, greptime_meta_reconciliation_procedure_bucket) @@ -1057,7 +1057,7 @@ groups: datasource: type: prometheus uid: ${metrics} - legendFormat: '[{{instance}}]-[{{pod}]' + legendFormat: '[{{instance}}]-[{{pod}}]' - title: Flow Processing Error per Instance type: timeseries description: Flow Processing Error per Instance. @@ -1067,3 +1067,89 @@ groups: type: prometheus uid: ${metrics} legendFormat: '[{{instance}}]-[{{pod}}]-[{{code}}]' + - title: Trigger + panels: + - title: Trigger Count + type: timeseries + description: Total number of triggers currently defined. + queries: + - expr: greptime_trigger_count{} + datasource: + type: prometheus + uid: ${metrics} + legendFormat: __auto + - title: Trigger Eval Elapsed + type: timeseries + description: Elapsed time for trigger evaluation, including query execution and condition evaluation. + unit: s + queries: + - expr: "histogram_quantile(0.99, \n rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])\n)" + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-p99' + - expr: "histogram_quantile(0.75, \n rate(greptime_trigger_evaluate_elapsed_bucket[$__rate_interval])\n)" + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-p75' + - title: Trigger Eval Failure Rate + type: timeseries + description: Rate of failed trigger evaluations. + unit: none + queries: + - expr: rate(greptime_trigger_evaluate_failure_count[$__rate_interval]) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: __auto + - title: Send Alert Elapsed + type: timeseries + description: Elapsed time to send trigger alerts to notification channels. + unit: s + queries: + - expr: "histogram_quantile(0.99, \n rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])\n)" + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p99' + - expr: "histogram_quantile(0.75, \n rate(greptime_trigger_send_alert_elapsed_bucket[$__rate_interval])\n)" + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[{{channel_type}}]-p75' + - title: Send Alert Failure Rate + type: timeseries + description: Rate of failures when sending trigger alerts. + unit: none + queries: + - expr: rate(greptime_trigger_send_alert_failure_count[$__rate_interval]) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: __auto + - title: Save Alert Elapsed + type: timeseries + description: Elapsed time to persist trigger alert records. + unit: s + queries: + - expr: "histogram_quantile(0.99, \n rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])\n)" + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p99' + - expr: "histogram_quantile(0.75, \n rate(greptime_trigger_save_alert_record_elapsed_bucket[$__rate_interval])\n)" + datasource: + type: prometheus + uid: ${metrics} + legendFormat: '[{{instance}}]-[{{pod}}]-[{{storage_type}}]-p75' + - title: Save Alert Failure Rate + type: timeseries + description: Rate of failures when persisting trigger alert records. + unit: none + queries: + - expr: rate(greptime_trigger_save_alert_record_failure_count[$__rate_interval]) + datasource: + type: prometheus + uid: ${metrics} + legendFormat: __auto