diff --git a/.gitignore b/.gitignore index 972e4df6a0..c906e5b6b8 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,7 @@ debug/ # Logs **/__unittest_logs logs/ +!grafana/dashboards/logs/ # cpython's generated python byte code **/__pycache__/ diff --git a/grafana/README.md b/grafana/README.md index db86581e0b..d195a28046 100644 --- a/grafana/README.md +++ b/grafana/README.md @@ -2,30 +2,63 @@ ## Overview -This repository maintains the Grafana dashboards for GreptimeDB. It has two types of dashboards: +This repository contains Grafana dashboards for visualizing metrics and logs of GreptimeDB instances running in either cluster or standalone mode. **The Grafana version should be greater than 9.0**. -- `cluster/dashboard.json`: The Grafana dashboard for the GreptimeDB cluster. Read the [dashboard.md](./dashboards/cluster/dashboard.md) for more details. -- `standalone/dashboard.json`: The Grafana dashboard for the standalone GreptimeDB instance. **It's generated from the `cluster/dashboard.json` by removing the instance filter through the `make dashboards` command**. Read the [dashboard.md](./dashboards/standalone/dashboard.md) for more details. +We highly recommend using the self-monitoring feature provided by [GreptimeDB Operator](https://github.com/GreptimeTeam/greptimedb-operator) to automatically collect metrics and logs from your GreptimeDB instances and store them in a dedicated GreptimeDB instance. -As the rapid development of GreptimeDB, the metrics may be changed, and please feel free to submit your feedback and/or contribution to this dashboard 🤗 +- **Metrics Dashboards** -**NOTE**: + - `dashboards/metrics/cluster/dashboard.json`: The Grafana dashboard for the GreptimeDB cluster. Read the [dashboard.md](./dashboards/metrics/cluster/dashboard.md) for more details. + + - `dashboards/metrics/standalone/dashboard.json`: The Grafana dashboard for the standalone GreptimeDB instance. 
**It's generated from the `dashboards/metrics/cluster/dashboard.json` by removing the instance filter through the `make dashboards` command**. Read the [dashboard.md](./dashboards/metrics/standalone/dashboard.md) for more details. -- The Grafana version should be greater than 9.0. +- **Logs Dashboard** -- If you want to modify the dashboards, you only need to modify the `cluster/dashboard.json` and run the `make dashboards` command to generate the `standalone/dashboard.json` and other related files. + The `dashboards/logs/dashboard.json` provides a comprehensive Grafana dashboard for visualizing GreptimeDB logs. To utilize this dashboard effectively, you need to collect logs in JSON format from your GreptimeDB instances and store them in a dedicated GreptimeDB instance. -To maintain the dashboards easily, we use the [`dac`](https://github.com/zyy17/dac) tool to generate the intermediate dashboards and markdown documents: + For proper integration, the logs table must adhere to the following schema design with the table name `_gt_logs`: -- `cluster/dashboard.yaml`: The intermediate dashboard for the GreptimeDB cluster. -- `standalone/dashboard.yaml`: The intermediate dashboard for the standalone GreptimeDB instance. 
+ ```sql + CREATE TABLE IF NOT EXISTS `_gt_logs` ( + `pod_ip` STRING NULL, + `namespace` STRING NULL, + `cluster` STRING NULL, + `file` STRING NULL, + `module_path` STRING NULL, + `level` STRING NULL, + `target` STRING NULL, + `role` STRING NULL, + `pod` STRING NULL SKIPPING INDEX WITH(granularity = '10240', type = 'BLOOM'), + `message` STRING NULL FULLTEXT INDEX WITH(analyzer = 'English', backend = 'bloom', case_sensitive = 'false'), + `err` STRING NULL FULLTEXT INDEX WITH(analyzer = 'English', backend = 'bloom', case_sensitive = 'false'), + `timestamp` TIMESTAMP(9) NOT NULL, + TIME INDEX (`timestamp`), + PRIMARY KEY (`level`, `target`, `role`) + ) + ENGINE=mito + WITH ( + append_mode = 'true' + ) + ``` + +## Development + +As GreptimeDB evolves rapidly, metrics may change over time. We welcome your feedback and contributions to improve these dashboards 🤗 + +To modify the metrics dashboards, simply edit the `dashboards/metrics/cluster/dashboard.json` file and run the `make dashboards` command. This will automatically generate the updated `dashboards/metrics/standalone/dashboard.json` and other related files. + +For easier dashboard maintenance, we utilize the [`dac`](https://github.com/zyy17/dac) tool to generate human-readable intermediate dashboards and documentation: + +- `dashboards/metrics/cluster/dashboard.yaml`: The intermediate dashboard file for the GreptimeDB cluster. +- `dashboards/metrics/standalone/dashboard.yaml`: The intermediate dashboard file for standalone GreptimeDB instances. ## Data Sources -There are two data sources for the dashboards to fetch the metrics: +The following data sources are used to fetch metrics and logs: -- **Prometheus**: Expose the metrics of GreptimeDB. -- **Information Schema**: It is the MySQL port of the current monitored instance. The `overview` dashboard will use this datasource to show the information schema of the current instance. +- **`${metrics}`**: Prometheus data source for providing the GreptimeDB metrics. 
+- **`${logs}`**: MySQL data source for providing the GreptimeDB logs. +- **`${information_schema}`**: MySQL data source that provides the information schema of the current instance and is used by the `overview` panel. This data source points to the MySQL port of the currently monitored instance. ## Instance Filters @@ -43,9 +76,9 @@ And the legend will be like: `[{{instance}}]-[{{ pod }}]`. ## Deployment -### Helm +### (Recommended) Helm Chart -If you use the Helm [chart](https://github.com/GreptimeTeam/helm-charts) to deploy a GreptimeDB cluster, you can enable self-monitoring by setting the following values in your Helm chart: +If you use the [Helm Chart](https://github.com/GreptimeTeam/helm-charts) to deploy a GreptimeDB cluster, you can enable self-monitoring by setting the following values in your Helm chart: - `monitoring.enabled=true`: Deploys a standalone GreptimeDB instance dedicated to monitoring the cluster; - `grafana.enabled=true`: Deploys Grafana and automatically imports the monitoring dashboard; @@ -85,5 +118,5 @@ The standalone GreptimeDB instance will collect metrics from your cluster, and t 3. **Import the dashboards based on your deployment scenario** - - **Cluster**: Import the `cluster/dashboard.json` dashboard. - - **Standalone**: Import the `standalone/dashboard.json` dashboard. + - **Cluster**: Import the `dashboards/metrics/cluster/dashboard.json` dashboard. + - **Standalone**: Import the `dashboards/metrics/standalone/dashboard.json` dashboard. 
diff --git a/grafana/dashboards/logs/dashboard.json b/grafana/dashboards/logs/dashboard.json new file mode 100644 index 0000000000..9dba6bab2b --- /dev/null +++ b/grafana/dashboards/logs/dashboard.json @@ -0,0 +1,292 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 12, + "links": [], + "panels": [ + { + "datasource": { + "default": false, + "type": "mysql", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 20, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "dedupStrategy": "none", + "enableInfiniteScrolling": true, + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "11.6.0", + "targets": [ + { + "dataset": "greptime_private", + "datasource": { + "type": "mysql", + "uid": "${datasource}" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT `timestamp`, CONCAT('[', `level`, ']', ' ', '<', `target`, '>', ' ', `message`),\n `role`,\n `pod`,\n `pod_ip`,\n `namespace`,\n `cluster`,\n `err`,\n `file`,\n `module_path`\nFROM\n `_gt_logs`\nWHERE\n (\n \"$level\" = \"'all'\"\n OR `level` IN ($level)\n ) \n AND (\n \"$role\" = \"'all'\"\n OR `role` IN ($role)\n )\n AND (\n \"$pod\" = \"\"\n OR `pod` = '$pod'\n )\n AND (\n \"$target\" = \"\"\n OR `target` = '$target'\n )\n AND (\n \"$search\" = \"\"\n OR matches_term(`message`, '$search')\n )\n AND (\n \"$exclude\" = \"\"\n OR NOT matches_term(`message`, '$exclude')\n )\n AND $__timeFilter(`timestamp`)\nORDER BY `timestamp` DESC\nLIMIT $limit;\n", + "refId": "A", + 
"sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Logs", + "type": "logs" + } + ], + "preload": false, + "refresh": "", + "schemaVersion": 41, + "tags": [], + "templating": { + "list": [ + { + "current": { + "text": "logs", + "value": "P98F38F12DB221A8C" + }, + "includeAll": false, + "name": "datasource", + "options": [], + "query": "mysql", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": "'all'", + "current": { + "text": [ + "$__all" + ], + "value": [ + "$__all" + ] + }, + "includeAll": true, + "label": "level", + "multi": true, + "name": "level", + "options": [ + { + "selected": false, + "text": "INFO", + "value": "INFO" + }, + { + "selected": false, + "text": "ERROR", + "value": "ERROR" + }, + { + "selected": false, + "text": "WARN", + "value": "WARN" + }, + { + "selected": false, + "text": "DEBUG", + "value": "DEBUG" + }, + { + "selected": false, + "text": "TRACE", + "value": "TRACE" + } + ], + "query": "INFO,ERROR,WARN,DEBUG,TRACE", + "type": "custom" + }, + { + "allValue": "'all'", + "current": { + "text": [ + "$__all" + ], + "value": [ + "$__all" + ] + }, + "includeAll": true, + "label": "role", + "multi": true, + "name": "role", + "options": [ + { + "selected": false, + "text": "datanode", + "value": "datanode" + }, + { + "selected": false, + "text": "frontend", + "value": "frontend" + }, + { + "selected": false, + "text": "meta", + "value": "meta" + } + ], + "query": "datanode,frontend,meta", + "type": "custom" + }, + { + "current": { + "text": "", + "value": "" + }, + "label": "pod", + "name": "pod", + "options": [ + { + "selected": true, + "text": "", + "value": "" + } + ], + "query": "", + "type": "textbox" + }, + { + "current": { + "text": "", + "value": "" + }, + "label": "target", + "name": "target", + "options": [ + { + "selected": true, + "text": "", + "value": "" + } 
+ ], + "query": "", + "type": "textbox" + }, + { + "current": { + "text": "", + "value": "" + }, + "label": "search", + "name": "search", + "options": [ + { + "selected": true, + "text": "", + "value": "" + } + ], + "query": "", + "type": "textbox" + }, + { + "current": { + "text": "", + "value": "" + }, + "label": "exclude", + "name": "exclude", + "options": [ + { + "selected": true, + "text": "", + "value": "" + } + ], + "query": "", + "type": "textbox" + }, + { + "current": { + "text": "2000", + "value": "2000" + }, + "includeAll": false, + "label": "limit", + "name": "limit", + "options": [ + { + "selected": true, + "text": "2000", + "value": "2000" + }, + { + "selected": false, + "text": "5000", + "value": "5000" + }, + { + "selected": false, + "text": "8000", + "value": "8000" + } + ], + "query": "2000,5000,8000", + "type": "custom" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "GreptimeDB Logs", + "uid": "edx5veo4rd3wge2", + "version": 1 +} diff --git a/grafana/dashboards/cluster/dashboard.json b/grafana/dashboards/metrics/cluster/dashboard.json similarity index 100% rename from grafana/dashboards/cluster/dashboard.json rename to grafana/dashboards/metrics/cluster/dashboard.json diff --git a/grafana/dashboards/cluster/dashboard.md b/grafana/dashboards/metrics/cluster/dashboard.md similarity index 100% rename from grafana/dashboards/cluster/dashboard.md rename to grafana/dashboards/metrics/cluster/dashboard.md diff --git a/grafana/dashboards/cluster/dashboard.yaml b/grafana/dashboards/metrics/cluster/dashboard.yaml similarity index 100% rename from grafana/dashboards/cluster/dashboard.yaml rename to grafana/dashboards/metrics/cluster/dashboard.yaml diff --git a/grafana/dashboards/standalone/dashboard.json b/grafana/dashboards/metrics/standalone/dashboard.json similarity index 100% rename from grafana/dashboards/standalone/dashboard.json rename to 
grafana/dashboards/metrics/standalone/dashboard.json diff --git a/grafana/dashboards/standalone/dashboard.md b/grafana/dashboards/metrics/standalone/dashboard.md similarity index 100% rename from grafana/dashboards/standalone/dashboard.md rename to grafana/dashboards/metrics/standalone/dashboard.md diff --git a/grafana/dashboards/standalone/dashboard.yaml b/grafana/dashboards/metrics/standalone/dashboard.yaml similarity index 100% rename from grafana/dashboards/standalone/dashboard.yaml rename to grafana/dashboards/metrics/standalone/dashboard.yaml diff --git a/grafana/scripts/check.sh b/grafana/scripts/check.sh index 78d133e105..cf2924783a 100755 --- a/grafana/scripts/check.sh +++ b/grafana/scripts/check.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -DASHBOARD_DIR=${1:-grafana/dashboards} +DASHBOARD_DIR=${1:-grafana/dashboards/metrics} check_dashboard_description() { for dashboard in $(find $DASHBOARD_DIR -name "*.json"); do @@ -25,7 +25,7 @@ check_dashboard_description() { check_dashboards_generation() { ./grafana/scripts/gen-dashboards.sh - if [[ -n "$(git diff --name-only grafana/dashboards)" ]]; then + if [[ -n "$(git diff --name-only grafana/dashboards/metrics)" ]]; then echo "Error: The dashboards are not generated correctly. You should execute the `make dashboards` command." exit 1 fi diff --git a/grafana/scripts/gen-dashboards.sh b/grafana/scripts/gen-dashboards.sh index 9488986bf9..891eb7f7f5 100755 --- a/grafana/scripts/gen-dashboards.sh +++ b/grafana/scripts/gen-dashboards.sh @@ -1,7 +1,7 @@ #! /usr/bin/env bash -CLUSTER_DASHBOARD_DIR=${1:-grafana/dashboards/cluster} -STANDALONE_DASHBOARD_DIR=${2:-grafana/dashboards/standalone} +CLUSTER_DASHBOARD_DIR=${1:-grafana/dashboards/metrics/cluster} +STANDALONE_DASHBOARD_DIR=${2:-grafana/dashboards/metrics/standalone} DAC_IMAGE=ghcr.io/zyy17/dac:20250423-522bd35 remove_instance_filters() {