From ec8861b8cc54f61d509925b67babc1af765c37ef Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 12 May 2022 19:53:07 +0300 Subject: [PATCH] Fix pageserver metrics names (#1682) Try to follow Prometheus style-guide https://prometheus.io/docs/practices/naming/ for metrics names. More specifically: - Use `pageserver_` prefix for all pageserver metrics - Specify `_seconds` unit in time metrics - Use unit as a suffix in other cases, such as `_hits`, `_bytes`, `_records` - Use `_total` suffix for accumulating counters (note that Histograms append that suffix internally) --- pageserver/src/layered_repository.rs | 14 +++++++------- pageserver/src/lib.rs | 2 +- pageserver/src/page_service.rs | 2 +- pageserver/src/storage_sync.rs | 4 ++-- pageserver/src/virtual_file.rs | 6 +++--- pageserver/src/walredo.rs | 8 ++++---- test_runner/fixtures/compare_fixtures.py | 4 ++-- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 6a614e184f..b02ab00a21 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -89,7 +89,7 @@ pub use crate::layered_repository::ephemeral_file::writeback as writeback_epheme // Metrics collected on operations on the storage repository. lazy_static! { static ref STORAGE_TIME: HistogramVec = register_histogram_vec!( - "pageserver_storage_time", + "pageserver_storage_operations_seconds", "Time spent on storage operations", &["operation", "tenant_id", "timeline_id"] ) @@ -99,8 +99,8 @@ lazy_static! { // Metrics collected on operations on the storage repository. lazy_static! { static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!( - "pageserver_getpage_reconstruct_time", - "Time spent on storage operations", + "pageserver_getpage_reconstruct_seconds", + "Time spent in reconstruct_value", &["tenant_id", "timeline_id"] ) .expect("failed to define a metric"); @@ -108,13 +108,13 @@ lazy_static! { lazy_static! 
{ static ref MATERIALIZED_PAGE_CACHE_HIT: IntCounterVec = register_int_counter_vec!( - "materialize_page_cache_hits", + "pageserver_materialized_cache_hits_total", "Number of cache hits from materialized page cache", &["tenant_id", "timeline_id"] ) .expect("failed to define a metric"); static ref WAIT_LSN_TIME: HistogramVec = register_histogram_vec!( - "wait_lsn_time", + "pageserver_wait_lsn_seconds", "Time spent waiting for WAL to arrive", &["tenant_id", "timeline_id"] ) @@ -134,12 +134,12 @@ lazy_static! { // or in testing they estimate how much we would upload if we did. lazy_static! { static ref NUM_PERSISTENT_FILES_CREATED: IntCounter = register_int_counter!( - "pageserver_num_persistent_files_created", + "pageserver_created_persistent_files_total", "Number of files created that are meant to be uploaded to cloud storage", ) .expect("failed to define a metric"); static ref PERSISTENT_BYTES_WRITTEN: IntCounter = register_int_counter!( - "pageserver_persistent_bytes_written", + "pageserver_written_persistent_bytes_total", "Total bytes written that are meant to be uploaded to cloud storage", ) .expect("failed to define a metric"); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 83985069ec..fdce0e5c5f 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -45,7 +45,7 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61; lazy_static! { static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!( - "pageserver_live_connections_count", + "pageserver_live_connections", "Number of live network connections", &["pageserver_connection_kind"] ) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index da3dedfc84..88273cfa57 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -326,7 +326,7 @@ const TIME_BUCKETS: &[f64] = &[ lazy_static! 
{ static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!( - "pageserver_smgr_query_time", + "pageserver_smgr_query_seconds", "Time spent on smgr query handling", &["smgr_query_type", "tenant_id", "timeline_id"], TIME_BUCKETS.into() diff --git a/pageserver/src/storage_sync.rs b/pageserver/src/storage_sync.rs index b8c6f7fdab..7755e67c8d 100644 --- a/pageserver/src/storage_sync.rs +++ b/pageserver/src/storage_sync.rs @@ -208,12 +208,12 @@ lazy_static! { ) .expect("failed to register pageserver remote storage remaining sync items int gauge"); static ref FATAL_TASK_FAILURES: IntCounter = register_int_counter!( - "pageserver_remote_storage_fatal_task_failures", + "pageserver_remote_storage_fatal_task_failures_total", "Number of critically failed tasks" ) .expect("failed to register pageserver remote storage remaining sync items int gauge"); static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!( - "pageserver_remote_storage_image_sync_time", + "pageserver_remote_storage_image_sync_seconds", "Time took to synchronize (download or upload) a whole pageserver image. \ Grouped by `operation_kind` (upload|download) and `status` (success|failure)", &["operation_kind", "status"], diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 4ce245a74f..37d70372b5 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -34,7 +34,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ lazy_static! { static ref STORAGE_IO_TIME: HistogramVec = register_histogram_vec!( - "pageserver_io_time", + "pageserver_io_operations_seconds", "Time spent in IO operations", &["operation", "tenant_id", "timeline_id"], STORAGE_IO_TIME_BUCKETS.into() @@ -43,8 +43,8 @@ lazy_static! { } lazy_static! 
{ static ref STORAGE_IO_SIZE: IntGaugeVec = register_int_gauge_vec!( - "pageserver_io_size", - "Amount of bytes", + "pageserver_io_operations_bytes_total", + "Total amount of bytes read/written in IO operations", &["operation", "tenant_id", "timeline_id"] ) .expect("failed to define a metric"); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 777718b311..e556c24548 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -106,16 +106,16 @@ impl crate::walredo::WalRedoManager for DummyRedoManager { // each tenant. lazy_static! { static ref WAL_REDO_TIME: Histogram = - register_histogram!("pageserver_wal_redo_time", "Time spent on WAL redo") + register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo") .expect("failed to define a metric"); static ref WAL_REDO_WAIT_TIME: Histogram = register_histogram!( - "pageserver_wal_redo_wait_time", + "pageserver_wal_redo_wait_seconds", "Time spent waiting for access to the WAL redo process" ) .expect("failed to define a metric"); static ref WAL_REDO_RECORD_COUNTER: IntCounter = register_int_counter!( - "pageserver_wal_records_replayed", - "Number of WAL records replayed" + "pageserver_replayed_wal_records_total", + "Number of WAL records replayed in WAL redo process" ) .unwrap(); } diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index d70f57aa52..d572901ed1 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -106,9 +106,9 @@ class ZenithCompare(PgCompare): report=MetricReport.LOWER_IS_BETTER) total_files = self.zenbenchmark.get_int_counter_value( - self.env.pageserver, "pageserver_num_persistent_files_created") + self.env.pageserver, "pageserver_created_persistent_files_total") total_bytes = self.zenbenchmark.get_int_counter_value( - self.env.pageserver, "pageserver_persistent_bytes_written") + self.env.pageserver, "pageserver_written_persistent_bytes_total") 
self.zenbenchmark.record("data_uploaded", total_bytes / (1024 * 1024), "MB",