mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2025-12-22 22:20:02 +00:00
fix: remove unnecessary labels of standalone dashboard.json Signed-off-by: zyy17 <zyylsxm@gmail.com>
1070 lines
48 KiB
YAML
1070 lines
48 KiB
YAML
groups:
|
|
- title: Overview
|
|
panels:
|
|
# Stat panel showing how long the GreptimeDB process has been running.
- title: Uptime
  type: stat
  description: The uptime of GreptimeDB, i.e. the time elapsed since the process started.
  unit: s
  queries:
    # Current time minus the process start timestamp yields the uptime in seconds.
    - expr: time() - process_start_time_seconds
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: __auto
|
|
- title: Version
|
|
type: stat
|
|
description: GreptimeDB version.
|
|
queries:
|
|
- expr: SELECT pkg_version FROM information_schema.build_info
|
|
datasource:
|
|
type: mysql
|
|
uid: ${information_schema}
|
|
- title: Total Ingestion Rate
|
|
type: stat
|
|
description: Total ingestion rate.
|
|
unit: rowsps
|
|
queries:
|
|
- expr: sum(rate(greptime_table_operator_ingest_rows[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: __auto
|
|
# Stat panel with the summed disk_size reported in region_statistics.
- title: Total Storage Size
  type: stat
  description: Total size of data files.
  unit: decbytes
  queries:
    # Sums disk_size over every row of information_schema.region_statistics.
    - expr: select SUM(disk_size) from information_schema.region_statistics;
      datasource:
        type: mysql
        uid: ${information_schema}
|
|
- title: Total Rows
|
|
type: stat
|
|
description: Total number of data rows in the cluster. Calculated by sum of rows from each region.
|
|
unit: sishort
|
|
queries:
|
|
- expr: select SUM(region_rows) from information_schema.region_statistics;
|
|
datasource:
|
|
type: mysql
|
|
uid: ${information_schema}
|
|
- title: Deployment
|
|
type: stat
|
|
description: The deployment topology of GreptimeDB.
|
|
queries:
|
|
- expr: SELECT count(*) as datanode FROM information_schema.cluster_info WHERE peer_type = 'DATANODE';
|
|
datasource:
|
|
type: mysql
|
|
uid: ${information_schema}
|
|
- expr: SELECT count(*) as frontend FROM information_schema.cluster_info WHERE peer_type = 'FRONTEND';
|
|
datasource:
|
|
type: mysql
|
|
uid: ${information_schema}
|
|
- expr: SELECT count(*) as metasrv FROM information_schema.cluster_info WHERE peer_type = 'METASRV';
|
|
datasource:
|
|
type: mysql
|
|
uid: ${information_schema}
|
|
- expr: SELECT count(*) as flownode FROM information_schema.cluster_info WHERE peer_type = 'FLOWNODE';
|
|
datasource:
|
|
type: mysql
|
|
uid: ${information_schema}
|
|
- title: Database Resources
|
|
type: stat
|
|
description: The number of the key resources in GreptimeDB.
|
|
queries:
|
|
- expr: SELECT COUNT(*) as databases FROM information_schema.schemata WHERE schema_name NOT IN ('greptime_private', 'information_schema')
|
|
datasource:
|
|
type: mysql
|
|
uid: ${information_schema}
|
|
- expr: SELECT COUNT(*) as tables FROM information_schema.tables WHERE table_schema != 'information_schema'
|
|
datasource:
|
|
type: mysql
|
|
uid: ${information_schema}
|
|
- expr: SELECT COUNT(region_id) as regions FROM information_schema.region_peers
|
|
datasource:
|
|
type: mysql
|
|
uid: ${information_schema}
|
|
- expr: SELECT COUNT(*) as flows FROM information_schema.flows
|
|
datasource:
|
|
type: mysql
|
|
uid: ${information_schema}
|
|
- title: Data Size
|
|
type: stat
|
|
description: The data size of wal/index/manifest in the GreptimeDB.
|
|
unit: decbytes
|
|
queries:
|
|
- expr: SELECT SUM(memtable_size) * 0.42825 as WAL FROM information_schema.region_statistics;
|
|
datasource:
|
|
type: mysql
|
|
uid: ${information_schema}
|
|
- expr: SELECT SUM(index_size) as index FROM information_schema.region_statistics;
|
|
datasource:
|
|
type: mysql
|
|
uid: ${information_schema}
|
|
- expr: SELECT SUM(manifest_size) as manifest FROM information_schema.region_statistics;
|
|
datasource:
|
|
type: mysql
|
|
uid: ${information_schema}
|
|
- title: Ingestion
|
|
panels:
|
|
- title: Total Ingestion Rate
|
|
type: timeseries
|
|
description: |
|
|
Total ingestion rate.
|
|
|
|
Here we listed 3 primary protocols:
|
|
|
|
- Prometheus remote write
|
|
- Greptime's gRPC API (when using our ingest SDK)
|
|
- Log ingestion http API
|
|
unit: rowsps
|
|
queries:
|
|
- expr: sum(rate(greptime_table_operator_ingest_rows{instance=~"$frontend"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: ingestion
|
|
# Time series panel that breaks the ingestion rate down by ingestion type.
- title: Ingestion Rate by Type
  type: timeseries
  description: |
    Ingestion rate by type.

    Here we listed 3 primary protocols:

    - Prometheus remote write
    - Greptime's gRPC API (when using our ingest SDK)
    - Log ingestion http API
  unit: rowsps
  queries:
    # NOTE(review): only two of the three protocols named in the description
    # have a query below; there is no series for the gRPC API. Confirm whether
    # a gRPC query is intentionally omitted.
    # Rate of the HTTP log ingestion counter.
    - expr: sum(rate(greptime_servers_http_logs_ingestion_counter[$__rate_interval]))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: http-logs
    # Rate of samples received through Prometheus remote write.
    - expr: sum(rate(greptime_servers_prometheus_remote_write_samples[$__rate_interval]))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: prometheus-remote-write
|
|
- title: Queries
|
|
panels:
|
|
# Time series panel with one query-rate series per protocol (MySQL, Postgres, PromQL).
- title: Total Query Rate
  type: timeseries
  description: |-
    Total rate of query API calls by protocol. This metric is collected from frontends.

    Here we listed 3 main protocols:
    - MySQL
    - Postgres
    - Prometheus API

    Note that there are some other minor query APIs like /sql are not included
  unit: reqps
  queries:
    # Each query takes the rate of a histogram's `_count` series, i.e. calls per second.
    - expr: sum (rate(greptime_servers_mysql_query_elapsed_count{instance=~"$frontend"}[$__rate_interval]))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: mysql
    - expr: sum (rate(greptime_servers_postgres_query_elapsed_count{instance=~"$frontend"}[$__rate_interval]))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: pg
    # The metric suffix must be `_count` (it was misspelled `_counte`, which matches no series).
    - expr: sum (rate(greptime_servers_http_promql_elapsed_count{instance=~"$frontend"}[$__rate_interval]))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: promql
|
|
- title: Resources
|
|
panels:
|
|
- title: Datanode Memory per Instance
|
|
type: timeseries
|
|
description: Current memory usage by instance
|
|
unit: bytes
|
|
queries:
|
|
- expr: sum(process_resident_memory_bytes{instance=~"$datanode"}) by (instance, pod)
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{ pod }}]'
|
|
- expr: max(greptime_memory_limit_in_bytes{instance=~"$datanode"})
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: limit
|
|
- title: Datanode CPU Usage per Instance
|
|
type: timeseries
|
|
description: Current cpu usage by instance
|
|
unit: none
|
|
queries:
|
|
- expr: sum(rate(process_cpu_seconds_total{instance=~"$datanode"}[$__rate_interval]) * 1000) by (instance, pod)
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{ instance }}]-[{{ pod }}]'
|
|
- expr: max(greptime_cpu_limit_in_millicores{instance=~"$datanode"})
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: limit
|
|
- title: Frontend Memory per Instance
|
|
type: timeseries
|
|
description: Current memory usage by instance
|
|
unit: bytes
|
|
queries:
|
|
- expr: sum(process_resident_memory_bytes{instance=~"$frontend"}) by (instance, pod)
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{ instance }}]-[{{ pod }}]'
|
|
- expr: max(greptime_memory_limit_in_bytes{instance=~"$frontend"})
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: limit
|
|
- title: Frontend CPU Usage per Instance
|
|
type: timeseries
|
|
description: Current cpu usage by instance
|
|
unit: none
|
|
queries:
|
|
- expr: sum(rate(process_cpu_seconds_total{instance=~"$frontend"}[$__rate_interval]) * 1000) by (instance, pod)
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{ instance }}]-[{{ pod }}]-cpu'
|
|
- expr: max(greptime_cpu_limit_in_millicores{instance=~"$frontend"})
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: limit
|
|
- title: Metasrv Memory per Instance
|
|
type: timeseries
|
|
description: Current memory usage by instance
|
|
unit: bytes
|
|
queries:
|
|
- expr: sum(process_resident_memory_bytes{instance=~"$metasrv"}) by (instance, pod)
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{ instance }}]-[{{ pod }}]-resident'
|
|
- expr: max(greptime_memory_limit_in_bytes{instance=~"$metasrv"})
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: limit
|
|
- title: Metasrv CPU Usage per Instance
|
|
type: timeseries
|
|
description: Current cpu usage by instance
|
|
unit: none
|
|
queries:
|
|
- expr: sum(rate(process_cpu_seconds_total{instance=~"$metasrv"}[$__rate_interval]) * 1000) by (instance, pod)
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{ instance }}]-[{{ pod }}]'
|
|
- expr: max(greptime_cpu_limit_in_millicores{instance=~"$metasrv"})
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: limit
|
|
- title: Flownode Memory per Instance
|
|
type: timeseries
|
|
description: Current memory usage by instance
|
|
unit: bytes
|
|
queries:
|
|
- expr: sum(process_resident_memory_bytes{instance=~"$flownode"}) by (instance, pod)
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{ instance }}]-[{{ pod }}]'
|
|
- expr: max(greptime_memory_limit_in_bytes{instance=~"$flownode"})
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: limit
|
|
- title: Flownode CPU Usage per Instance
|
|
type: timeseries
|
|
description: Current cpu usage by instance
|
|
unit: none
|
|
queries:
|
|
- expr: sum(rate(process_cpu_seconds_total{instance=~"$flownode"}[$__rate_interval]) * 1000) by (instance, pod)
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{ instance }}]-[{{ pod }}]'
|
|
- expr: max(greptime_cpu_limit_in_millicores{instance=~"$flownode"})
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: limit
|
|
- title: Frontend Requests
|
|
panels:
|
|
- title: HTTP QPS per Instance
|
|
type: timeseries
|
|
description: HTTP QPS per Instance.
|
|
unit: reqps
|
|
queries:
|
|
- expr: sum by(instance, pod, path, method, code) (rate(greptime_servers_http_requests_elapsed_count{instance=~"$frontend",path!~"/health|/metrics"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{path}}]-[{{method}}]-[{{code}}]'
|
|
- title: HTTP P99 per Instance
|
|
type: timeseries
|
|
description: HTTP P99 per Instance.
|
|
unit: s
|
|
queries:
|
|
- expr: histogram_quantile(0.99, sum by(instance, pod, le, path, method, code) (rate(greptime_servers_http_requests_elapsed_bucket{instance=~"$frontend",path!~"/health|/metrics"}[$__rate_interval])))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{path}}]-[{{method}}]-[{{code}}]-p99'
|
|
- title: gRPC QPS per Instance
|
|
type: timeseries
|
|
description: gRPC QPS per Instance.
|
|
unit: reqps
|
|
queries:
|
|
- expr: sum by(instance, pod, path, code) (rate(greptime_servers_grpc_requests_elapsed_count{instance=~"$frontend"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{path}}]-[{{code}}]'
|
|
# Time series panel with the 99th percentile gRPC request latency on frontends.
- title: gRPC P99 per Instance
  type: timeseries
  description: gRPC P99 per Instance.
  unit: s
  queries:
    # The legend may only use labels kept by the `sum by(...)` clause
    # (instance, pod, path, code); `method` is aggregated away and would
    # render as an empty placeholder.
    - expr: histogram_quantile(0.99, sum by(instance, pod, le, path, code) (rate(greptime_servers_grpc_requests_elapsed_bucket{instance=~"$frontend"}[$__rate_interval])))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '[{{instance}}]-[{{pod}}]-[{{path}}]-[{{code}}]-p99'
|
|
- title: MySQL QPS per Instance
|
|
type: timeseries
|
|
description: MySQL QPS per Instance.
|
|
unit: reqps
|
|
queries:
|
|
- expr: sum by(pod, instance)(rate(greptime_servers_mysql_query_elapsed_count{instance=~"$frontend"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]'
|
|
- title: MySQL P99 per Instance
|
|
type: timeseries
|
|
description: MySQL P99 per Instance.
|
|
unit: s
|
|
queries:
|
|
- expr: histogram_quantile(0.99, sum by(pod, instance, le) (rate(greptime_servers_mysql_query_elapsed_bucket{instance=~"$frontend"}[$__rate_interval])))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{ instance }}]-[{{ pod }}]-p99'
|
|
- title: PostgreSQL QPS per Instance
|
|
type: timeseries
|
|
description: PostgreSQL QPS per Instance.
|
|
unit: reqps
|
|
queries:
|
|
- expr: sum by(pod, instance)(rate(greptime_servers_postgres_query_elapsed_count{instance=~"$frontend"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]'
|
|
- title: PostgreSQL P99 per Instance
|
|
type: timeseries
|
|
description: PostgreSQL P99 per Instance.
|
|
unit: s
|
|
queries:
|
|
- expr: histogram_quantile(0.99, sum by(pod,instance,le) (rate(greptime_servers_postgres_query_elapsed_bucket{instance=~"$frontend"}[$__rate_interval])))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-p99'
|
|
- title: Frontend to Datanode
|
|
panels:
|
|
- title: Ingest Rows per Instance
|
|
type: timeseries
|
|
description: Ingestion rate by row as in each frontend
|
|
unit: rowsps
|
|
queries:
|
|
- expr: sum by(instance, pod)(rate(greptime_table_operator_ingest_rows{instance=~"$frontend"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]'
|
|
- title: Region Call QPS per Instance
|
|
type: timeseries
|
|
description: Region Call QPS per Instance.
|
|
unit: ops
|
|
queries:
|
|
- expr: sum by(instance, pod, request_type) (rate(greptime_grpc_region_request_count{instance=~"$frontend"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{request_type}}]'
|
|
- title: Region Call P99 per Instance
|
|
type: timeseries
|
|
description: Region Call P99 per Instance.
|
|
unit: s
|
|
queries:
|
|
- expr: histogram_quantile(0.99, sum by(instance, pod, le, request_type) (rate(greptime_grpc_region_request_bucket{instance=~"$frontend"}[$__rate_interval])))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{request_type}}]'
|
|
# Time series panel with the average and 99th percentile per-stage bulk insert handling time.
- title: 'Frontend Handle Bulk Insert Elapsed Time '
  type: timeseries
  description: Per-stage time for frontend to handle bulk insert requests
  unit: s
  queries:
    # Average: rate of the histogram sum divided by the rate of its count.
    - expr: sum by(instance, pod, stage) (rate(greptime_table_operator_handle_bulk_insert_sum[$__rate_interval]))/sum by(instance, pod, stage) (rate(greptime_table_operator_handle_bulk_insert_count[$__rate_interval]))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '[{{instance}}]-[{{pod}}]-[{{stage}}]-AVG'
    # The quantile is 0.99, so the series is labelled P99 (it was mislabelled P95).
    - expr: histogram_quantile(0.99, sum by(instance, pod, stage, le) (rate(greptime_table_operator_handle_bulk_insert_bucket[$__rate_interval])))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '[{{instance}}]-[{{pod}}]-[{{stage}}]-P99'
|
|
- title: Mito Engine
|
|
panels:
|
|
- title: Request OPS per Instance
|
|
type: timeseries
|
|
description: Request QPS per Instance.
|
|
unit: ops
|
|
queries:
|
|
- expr: sum by(instance, pod, type) (rate(greptime_mito_handle_request_elapsed_count{instance=~"$datanode"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{type}}]'
|
|
- title: Request P99 per Instance
|
|
type: timeseries
|
|
description: Request P99 per Instance.
|
|
unit: s
|
|
queries:
|
|
- expr: histogram_quantile(0.99, sum by(instance, pod, le, type) (rate(greptime_mito_handle_request_elapsed_bucket{instance=~"$datanode"}[$__rate_interval])))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{type}}]'
|
|
- title: Write Buffer per Instance
|
|
type: timeseries
|
|
description: Write Buffer per Instance.
|
|
unit: decbytes
|
|
queries:
|
|
- expr: greptime_mito_write_buffer_bytes{instance=~"$datanode"}
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]'
|
|
- title: Write Rows per Instance
|
|
type: timeseries
|
|
description: Ingestion size by row counts.
|
|
unit: rowsps
|
|
queries:
|
|
- expr: sum by (instance, pod) (rate(greptime_mito_write_rows_total{instance=~"$datanode"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]'
|
|
- title: Flush OPS per Instance
|
|
type: timeseries
|
|
description: Flush QPS per Instance.
|
|
unit: ops
|
|
queries:
|
|
- expr: sum by(instance, pod, reason) (rate(greptime_mito_flush_requests_total{instance=~"$datanode"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{reason}}]'
|
|
- title: Write Stall per Instance
|
|
type: timeseries
|
|
description: Write Stall per Instance.
|
|
queries:
|
|
- expr: sum by(instance, pod) (greptime_mito_write_stall_total{instance=~"$datanode"})
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]'
|
|
- title: Read Stage OPS per Instance
|
|
type: timeseries
|
|
description: Read Stage OPS per Instance.
|
|
unit: ops
|
|
queries:
|
|
- expr: sum by(instance, pod) (rate(greptime_mito_read_stage_elapsed_count{instance=~"$datanode", stage="total"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]'
|
|
- title: Read Stage P99 per Instance
|
|
type: timeseries
|
|
description: Read Stage P99 per Instance.
|
|
unit: s
|
|
queries:
|
|
- expr: histogram_quantile(0.99, sum by(instance, pod, le, stage) (rate(greptime_mito_read_stage_elapsed_bucket{instance=~"$datanode"}[$__rate_interval])))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{stage}}]'
|
|
- title: Write Stage P99 per Instance
|
|
type: timeseries
|
|
description: Write Stage P99 per Instance.
|
|
unit: s
|
|
queries:
|
|
- expr: histogram_quantile(0.99, sum by(instance, pod, le, stage) (rate(greptime_mito_write_stage_elapsed_bucket{instance=~"$datanode"}[$__rate_interval])))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{stage}}]'
|
|
- title: Compaction OPS per Instance
|
|
type: timeseries
|
|
description: Compaction OPS per Instance.
|
|
unit: ops
|
|
queries:
|
|
- expr: sum by(instance, pod) (rate(greptime_mito_compaction_total_elapsed_count{instance=~"$datanode"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{ instance }}]-[{{pod}}]'
|
|
- title: Compaction Elapsed Time per Instance by Stage
|
|
type: timeseries
|
|
description: Compaction latency by stage
|
|
unit: s
|
|
queries:
|
|
- expr: histogram_quantile(0.99, sum by(instance, pod, le, stage) (rate(greptime_mito_compaction_stage_elapsed_bucket{instance=~"$datanode"}[$__rate_interval])))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{stage}}]-p99'
|
|
- expr: sum by(instance, pod, stage) (rate(greptime_mito_compaction_stage_elapsed_sum{instance=~"$datanode"}[$__rate_interval]))/sum by(instance, pod, stage) (rate(greptime_mito_compaction_stage_elapsed_count{instance=~"$datanode"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{stage}}]-avg'
|
|
- title: Compaction P99 per Instance
|
|
type: timeseries
|
|
description: Compaction P99 per Instance.
|
|
unit: s
|
|
queries:
|
|
- expr: histogram_quantile(0.99, sum by(instance, pod, le,stage) (rate(greptime_mito_compaction_total_elapsed_bucket{instance=~"$datanode"}[$__rate_interval])))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{stage}}]-compaction'
|
|
- title: WAL write size
|
|
type: timeseries
|
|
description: Write-ahead logs write size as bytes. This chart includes stats of p95 and p99 size by instance, total WAL write rate.
|
|
unit: bytes
|
|
queries:
|
|
- expr: histogram_quantile(0.95, sum by(le,instance, pod) (rate(raft_engine_write_size_bucket[$__rate_interval])))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-req-size-p95'
|
|
- expr: histogram_quantile(0.99, sum by(le,instance,pod) (rate(raft_engine_write_size_bucket[$__rate_interval])))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-req-size-p99'
|
|
- expr: sum by (instance, pod)(rate(raft_engine_write_size_sum[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-throughput'
|
|
- title: Cached Bytes per Instance
|
|
type: timeseries
|
|
description: Cached Bytes per Instance.
|
|
unit: decbytes
|
|
queries:
|
|
- expr: greptime_mito_cache_bytes{instance=~"$datanode"}
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{type}}]'
|
|
- title: Inflight Compaction
|
|
type: timeseries
|
|
description: Ongoing compaction task count
|
|
unit: none
|
|
queries:
|
|
- expr: greptime_mito_inflight_compaction_count
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]'
|
|
- title: WAL sync duration seconds
|
|
type: timeseries
|
|
description: Raft engine (local disk) log store sync latency, p99
|
|
unit: s
|
|
queries:
|
|
- expr: histogram_quantile(0.99, sum by(le, type, node, instance, pod) (rate(raft_engine_sync_log_duration_seconds_bucket[$__rate_interval])))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-p99'
|
|
- title: Log Store op duration seconds
|
|
type: timeseries
|
|
description: Write-ahead log operations latency at p99
|
|
unit: s
|
|
queries:
|
|
- expr: histogram_quantile(0.99, sum by(le,logstore,optype,instance, pod) (rate(greptime_logstore_op_elapsed_bucket[$__rate_interval])))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{logstore}}]-[{{optype}}]-p99'
|
|
- title: Inflight Flush
|
|
type: timeseries
|
|
description: Ongoing flush task count
|
|
unit: none
|
|
queries:
|
|
- expr: greptime_mito_inflight_flush_count
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]'
|
|
# Time series panel with compaction input and output bytes per instance.
- title: Compaction Input/Output Bytes
  type: timeseries
  description: Compaction input/output bytes
  unit: bytes
  queries:
    # Bytes read by compaction, summed per instance and pod.
    - expr: sum by(instance, pod) (greptime_mito_compaction_input_bytes)
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '[{{instance}}]-[{{pod}}]-input'
    # Bytes written by compaction, summed per instance and pod.
    - expr: sum by(instance, pod) (greptime_mito_compaction_output_bytes)
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '[{{instance}}]-[{{pod}}]-output'
|
|
- title: Region Worker Handle Bulk Insert Requests
|
|
type: timeseries
|
|
description: Per-stage elapsed time for region worker to handle bulk insert region requests.
|
|
unit: s
|
|
queries:
|
|
- expr: histogram_quantile(0.95, sum by(le,instance, stage, pod) (rate(greptime_region_worker_handle_write_bucket[$__rate_interval])))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{stage}}]-P95'
|
|
- expr: sum by(instance, stage, pod) (rate(greptime_region_worker_handle_write_sum[$__rate_interval]))/sum by(instance, stage, pod) (rate(greptime_region_worker_handle_write_count[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{stage}}]-AVG'
|
|
# Time series panel with memtable active series and field builder counts per instance.
- title: Active Series and Field Builders Count
  type: timeseries
  description: Active series and field builders count in memtables
  unit: none
  queries:
    # Active series count, summed per instance and pod.
    - expr: sum by(instance, pod) (greptime_mito_memtable_active_series_count)
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '[{{instance}}]-[{{pod}}]-series'
    # Field builder count, summed per instance and pod.
    - expr: sum by(instance, pod) (greptime_mito_memtable_field_builder_count)
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '[{{instance}}]-[{{pod}}]-field_builders'
|
|
# Time series panel with the P95 and average per-stage time to convert region requests.
- title: Region Worker Convert Requests
  type: timeseries
  description: Per-stage elapsed time for region worker to decode requests.
  unit: s
  queries:
    # P95 from the histogram buckets; `le` must be kept for histogram_quantile.
    - expr: histogram_quantile(0.95, sum by(le, instance, stage, pod) (rate(greptime_datanode_convert_region_request_bucket[$__rate_interval])))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '[{{instance}}]-[{{pod}}]-[{{stage}}]-P95'
    # Average: rate of sum over rate of count. `le` exists only on `_bucket`
    # series, so it is not part of the grouping here.
    - expr: sum by(instance, stage, pod) (rate(greptime_datanode_convert_region_request_sum[$__rate_interval]))/sum by(instance, stage, pod) (rate(greptime_datanode_convert_region_request_count[$__rate_interval]))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '[{{instance}}]-[{{pod}}]-[{{stage}}]-AVG'
|
|
- title: Cache Miss
|
|
type: timeseries
|
|
description: The local cache miss of the datanode.
|
|
queries:
|
|
- expr: sum by (instance,pod, type) (rate(greptime_mito_cache_miss{instance=~"$datanode"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{type}}]'
|
|
- title: OpenDAL
|
|
panels:
|
|
- title: QPS per Instance
|
|
type: timeseries
|
|
description: QPS per Instance.
|
|
unit: ops
|
|
queries:
|
|
- expr: sum by(instance, pod, scheme, operation) (rate(opendal_operation_duration_seconds_count{instance=~"$datanode"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]'
|
|
- title: Read QPS per Instance
|
|
type: timeseries
|
|
description: Read QPS per Instance.
|
|
unit: ops
|
|
queries:
|
|
- expr: sum by(instance, pod, scheme, operation) (rate(opendal_operation_duration_seconds_count{instance=~"$datanode", operation=~"read|Reader::read"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]'
|
|
- title: Read P99 per Instance
|
|
type: timeseries
|
|
description: Read P99 per Instance.
|
|
unit: s
|
|
queries:
|
|
- expr: histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{instance=~"$datanode",operation=~"read|Reader::read"}[$__rate_interval])))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]'
|
|
- title: Write QPS per Instance
|
|
type: timeseries
|
|
description: Write QPS per Instance.
|
|
unit: ops
|
|
queries:
|
|
- expr: sum by(instance, pod, scheme, operation) (rate(opendal_operation_duration_seconds_count{instance=~"$datanode", operation=~"write|Writer::write|Writer::close"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]'
|
|
- title: Write P99 per Instance
|
|
type: timeseries
|
|
description: Write P99 per Instance.
|
|
unit: s
|
|
queries:
|
|
- expr: histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{instance=~"$datanode", operation =~ "Writer::write|Writer::close|write"}[$__rate_interval])))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]'
|
|
- title: List QPS per Instance
|
|
type: timeseries
|
|
description: List QPS per Instance.
|
|
unit: ops
|
|
queries:
|
|
- expr: sum by(instance, pod, scheme) (rate(opendal_operation_duration_seconds_count{instance=~"$datanode", operation="list"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]'
|
|
- title: List P99 per Instance
|
|
type: timeseries
|
|
description: List P99 per Instance.
|
|
unit: s
|
|
queries:
|
|
- expr: histogram_quantile(0.99, sum by(instance, pod, le, scheme) (rate(opendal_operation_duration_seconds_bucket{instance=~"$datanode", operation="list"}[$__rate_interval])))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]'
|
|
# Time series panel with the rate of OpenDAL operations not covered by the read/write/list panels.
- title: Other Requests per Instance
  type: timeseries
  description: Other Requests per Instance.
  unit: ops
  queries:
    # Label regex matchers are fully anchored, so `read` does not exclude
    # `Reader::read`. The reader/writer operations charted by the Read and
    # Write panels are excluded explicitly so they are not counted twice.
    - expr: sum by(instance, pod, scheme, operation) (rate(opendal_operation_duration_seconds_count{instance=~"$datanode",operation!~"read|write|list|stat|Reader::read|Writer::write|Writer::close"}[$__rate_interval]))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]'
|
|
- title: Other Request P99 per Instance
|
|
type: timeseries
|
|
description: Other Request P99 per Instance.
|
|
unit: s
|
|
queries:
|
|
- expr: histogram_quantile(0.99, sum by(instance, pod, le, scheme, operation) (rate(opendal_operation_duration_seconds_bucket{instance=~"$datanode", operation!~"read|write|list|Writer::write|Writer::close|Reader::read"}[$__rate_interval])))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]'
|
|
- title: Opendal traffic
|
|
type: timeseries
|
|
description: Total traffic as in bytes by instance and operation
|
|
unit: decbytes
|
|
queries:
|
|
- expr: sum by(instance, pod, scheme, operation) (rate(opendal_operation_bytes_sum{instance=~"$datanode"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]'
|
|
- title: OpenDAL errors per Instance
|
|
type: timeseries
|
|
description: OpenDAL error counts per Instance.
|
|
queries:
|
|
- expr: sum by(instance, pod, scheme, operation, error) (rate(opendal_operation_errors_total{instance=~"$datanode", error!="NotFound"}[$__rate_interval]))
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '[{{instance}}]-[{{pod}}]-[{{scheme}}]-[{{operation}}]-[{{error}}]'
|
|
- title: Remote WAL
|
|
panels:
|
|
- title: Triggered region flush total
|
|
type: timeseries
|
|
description: Triggered region flush total
|
|
unit: none
|
|
queries:
|
|
- expr: meta_triggered_region_flush_total
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '{{pod}}-{{topic_name}}'
|
|
- title: Triggered region checkpoint total
|
|
type: timeseries
|
|
description: Triggered region checkpoint total
|
|
unit: none
|
|
queries:
|
|
- expr: meta_triggered_region_checkpoint_total
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '{{pod}}-{{topic_name}}'
|
|
- title: Topic estimated replay size
|
|
type: timeseries
|
|
description: Topic estimated max replay size
|
|
unit: bytes
|
|
queries:
|
|
- expr: meta_topic_estimated_replay_size
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '{{pod}}-{{topic_name}}'
|
|
- title: Kafka logstore's bytes traffic
|
|
type: timeseries
|
|
description: Kafka logstore's bytes traffic
|
|
unit: bytes
|
|
queries:
|
|
- expr: rate(greptime_logstore_kafka_client_bytes_total[$__rate_interval])
|
|
datasource:
|
|
type: prometheus
|
|
uid: ${metrics}
|
|
legendFormat: '{{pod}}-{{logstore}}'
|
|
- title: Metasrv
|
|
panels:
|
|
# Status-history panel over greptime_meta_region_migration_stat, split into
# two queries by the datanode_type label: one for migration sources and one
# for migration destinations. Legends are keyed by datanode_id.
- title: Region migration datanode
  type: status-history
  description: Counter of region migration by source and destination
  queries:
    - expr: greptime_meta_region_migration_stat{datanode_type="src"}
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: 'from-datanode-{{datanode_id}}'
    # NOTE(review): the destination selector uses the label value "desc";
    # confirm this matches what metasrv actually emits (it reads like a
    # misspelling of "dest", but the exporter may use the same spelling).
    - expr: greptime_meta_region_migration_stat{datanode_type="desc"}
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: 'to-datanode-{{datanode_id}}'
|
|
# Plots the raw greptime_meta_region_migration_error series; the legend is
# built from the pod, state and error_type labels.
- title: Region migration error
  type: timeseries
  description: Counter of region migration error
  unit: none
  queries:
    - expr: greptime_meta_region_migration_error
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '{{pod}}-{{state}}-{{error_type}}'
|
|
# Plots the raw greptime_datanode_load series, rendered as bytes per second
# (IEC), with one legend entry per datanode_id.
- title: Datanode load
  type: timeseries
  description: Gauge of load information of each datanode, collected via heartbeat between datanode and metasrv. This information is for metasrv to schedule workloads.
  unit: binBps
  queries:
    - expr: greptime_datanode_load
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: 'Datanode-{{datanode_id}}-writeload'
|
|
# Per-second rate of SQL executions, taken from the _count series of the
# greptime_meta_rds_pg_sql_execute_elapsed_ms histogram.
- title: Rate of SQL Executions (RDS)
  type: timeseries
  description: Displays the rate of SQL executions processed by the Meta service using the RDS backend.
  unit: none
  queries:
    - expr: rate(greptime_meta_rds_pg_sql_execute_elapsed_ms_count[$__rate_interval])
      datasource:
        type: prometheus
        uid: ${metrics}
      # The trailing space inside the quotes is part of the existing legend.
      legendFormat: '{{pod}} {{op}} {{type}} {{result}} '
|
|
# p90 of the greptime_meta_rds_pg_sql_execute_elapsed_ms histogram, computed
# from per-second bucket rates grouped by pod, op, type and result.
- title: SQL Execution Latency (RDS)
  type: timeseries
  # The trailing space inside the quotes is part of the existing description.
  description: 'Measures the response time of SQL executions via the RDS backend. '
  unit: ms
  queries:
    # Folded scalar: the line break below folds into the single space of the
    # original one-line expression.
    - expr: >-
        histogram_quantile(0.90, sum by(pod, op, type, result, le)
        (rate(greptime_meta_rds_pg_sql_execute_elapsed_ms_bucket[$__rate_interval])))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '{{pod}} {{op}} {{type}} {{result}} p90'
|
|
# p90 of the greptime_meta_handler_execute histogram, computed from
# per-second bucket rates grouped by pod and handler name.
- title: Handler Execution Latency
  type: timeseries
  description: |
    Shows latency of Meta handlers by pod and handler name, useful for monitoring handler performance and detecting latency spikes.
  unit: s
  queries:
    # Literal block with the final newline stripped; the three lines are kept
    # exactly as they appear in the existing query.
    - expr: |-
        histogram_quantile(0.90, sum by(pod, le, name) (
        rate(greptime_meta_handler_execute_bucket[$__rate_interval])
        ))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '{{pod}} {{name}} p90'
|
|
# p90 of the greptime_meta_heartbeat_stat_memory_size histogram, aggregated
# per pod. The buckets are summed directly (no rate()), so the quantile is
# taken over the cumulative bucket counts.
- title: Heartbeat Packet Size
  type: timeseries
  description: |
    Shows p90 heartbeat message sizes, helping track network usage and identify anomalies in heartbeat payload.
  unit: bytes
  queries:
    - expr: histogram_quantile(0.9, sum by(pod, le) (greptime_meta_heartbeat_stat_memory_size_bucket))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '{{pod}}'
|
|
# Per-second rate of the greptime_meta_heartbeat_rate series, one line per pod.
- title: Meta Heartbeat Receive Rate
  type: timeseries
  # The previous description was copied from the "Datanode load" panel and did
  # not describe this query.
  description: Rate of heartbeats received by metasrv, per pod.
  # A rate() result is a per-second count, not a duration; `none` matches the
  # other rate panels in this group.
  unit: none
  queries:
    - expr: rate(greptime_meta_heartbeat_rate[$__rate_interval])
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '{{pod}}'
|
|
# p99 of the greptime_meta_kv_request_elapsed histogram, grouped by pod, op
# and target. The buckets are summed directly (no rate()), so the quantile is
# taken over the cumulative bucket counts.
- title: Meta KV Ops Latency
  type: timeseries
  # The previous description was copied from the "Datanode load" panel and did
  # not describe this query.
  description: p99 latency of metasrv KV requests, by pod, operation and target.
  unit: s
  queries:
    - expr: histogram_quantile(0.99, sum by(pod, le, op, target) (greptime_meta_kv_request_elapsed_bucket))
      datasource:
        type: prometheus
        uid: ${metrics}
      # The query groups by target, so the legend must include it; otherwise
      # series that differ only by target share the same legend entry.
      legendFormat: '{{pod}}-{{op}}-{{target}} p99'
|
|
# Per-second rate of metasrv KV requests, taken from the _count series of the
# greptime_meta_kv_request_elapsed histogram.
- title: Rate of meta KV Ops
  type: timeseries
  # The previous description was copied from the "Datanode load" panel and did
  # not describe this query.
  description: Rate of metasrv KV requests, per pod and operation.
  unit: none
  queries:
    - expr: rate(greptime_meta_kv_request_elapsed_count[$__rate_interval])
      datasource:
        type: prometheus
        uid: ${metrics}
      # This is a request rate, not a quantile, so the legend carries no
      # "p99" suffix.
      legendFormat: '{{pod}}-{{op}}'
|
|
# p90 latency of the metasrv DDL procedures, one query per procedure, each
# grouped by pod and step. histogram_quantile() needs the `le` label, which
# only exists on a histogram's `_bucket` series, so every query must select
# the `_bucket` series of its metric.
- title: DDL Latency
  type: timeseries
  # The previous description was copied from the "Datanode load" panel and did
  # not describe these queries.
  description: p90 latency of DDL procedures executed by metasrv, broken down by procedure step.
  unit: s
  queries:
    - expr: histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_tables_bucket))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: CreateLogicalTables-{{step}} p90
    - expr: histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_table_bucket))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: CreateTable-{{step}} p90
    - expr: histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_view_bucket))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: CreateView-{{step}} p90
    - expr: histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_create_flow_bucket))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: CreateFlow-{{step}} p90
    - expr: histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_drop_table_bucket))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: DropTable-{{step}} p90
    - expr: histogram_quantile(0.9, sum by(le, pod, step) (greptime_meta_procedure_alter_table_bucket))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: AlterTable-{{step}} p90
|
|
# Plots the raw greptime_meta_reconciliation_stats series; the legend is built
# from the pod, table_type and type labels.
- title: Reconciliation stats
  type: timeseries
  description: Reconciliation stats
  # NOTE(review): the unit is seconds, but nothing in the query indicates the
  # series is a duration; confirm against the metric definition.
  unit: s
  queries:
    - expr: greptime_meta_reconciliation_stats
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '{{pod}}-{{table_type}}-{{type}}'
|
|
# p90 of the greptime_meta_reconciliation_procedure histogram, computed
# directly on the bucket series with no aggregation or rate(). The legend is
# built from the procedure_name and step labels.
- title: Reconciliation steps
  type: timeseries
  # The trailing space inside the quotes is part of the existing description.
  description: 'Elapsed of Reconciliation steps '
  unit: s
  queries:
    - expr: histogram_quantile(0.9, greptime_meta_reconciliation_procedure_bucket)
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '{{procedure_name}}-{{step}}-P90'
|
|
- title: Flownode
|
|
panels:
|
|
# Per-second rate of greptime_flow_processed_rows, summed by instance, pod and
# direction.
- title: Flow Ingest / Output Rate
  type: timeseries
  description: Flow Ingest / Output Rate.
  queries:
    # Folded scalar: the line break below folds into the single space of the
    # original one-line expression.
    - expr: >-
        sum by(instance, pod, direction)
        (rate(greptime_flow_processed_rows[$__rate_interval]))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '[{{pod}}]-[{{instance}}]-[{{direction}}]'
|
|
# p95 and p99 of the greptime_flow_insert_elapsed histogram, computed from
# per-second bucket rates summed by instance and pod.
- title: Flow Ingest Latency
  type: timeseries
  description: Flow Ingest Latency.
  queries:
    # Folded scalars: each line break folds into the single space that
    # follows the quantile argument in the original one-line expression.
    - expr: >-
        histogram_quantile(0.95,
        sum(rate(greptime_flow_insert_elapsed_bucket[$__rate_interval])) by (le, instance, pod))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '[{{instance}}]-[{{pod}}]-p95'
    - expr: >-
        histogram_quantile(0.99,
        sum(rate(greptime_flow_insert_elapsed_bucket[$__rate_interval])) by (le, instance, pod))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '[{{instance}}]-[{{pod}}]-p99'
|
|
# p95 and p99 of the greptime_flow_processing_time histogram, computed from
# per-second bucket rates summed by instance, pod and type.
- title: Flow Operation Latency
  type: timeseries
  description: Flow Operation Latency.
  queries:
    # Folded scalars: each line break folds into the single space that
    # follows the quantile argument in the original one-line expression.
    - expr: >-
        histogram_quantile(0.95,
        sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '[{{instance}}]-[{{pod}}]-[{{type}}]-p95'
    - expr: >-
        histogram_quantile(0.99,
        sum(rate(greptime_flow_processing_time_bucket[$__rate_interval])) by (le,instance,pod,type))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '[{{instance}}]-[{{pod}}]-[{{type}}]-p99'
|
|
# Plots the raw greptime_flow_input_buf_size series, one line per
# instance/pod pair.
- title: Flow Buffer Size per Instance
  type: timeseries
  description: Flow Buffer Size per Instance.
  queries:
    - expr: greptime_flow_input_buf_size
      datasource:
        type: prometheus
        uid: ${metrics}
      # The pod placeholder was previously written as `{{pod}]`, missing its
      # second closing brace, so it was not a complete {{label}} template.
      legendFormat: '[{{instance}}]-[{{pod}}]'
|
|
# Per-second rate of greptime_flow_errors, summed by instance, pod and code.
- title: Flow Processing Error per Instance
  type: timeseries
  description: Flow Processing Error per Instance.
  queries:
    - expr: sum by(instance,pod,code) (rate(greptime_flow_errors[$__rate_interval]))
      datasource:
        type: prometheus
        uid: ${metrics}
      legendFormat: '[{{instance}}]-[{{pod}}]-[{{code}}]'
|