Fix pageserver metrics names (#1682)

Try to follow Prometheus style-guide https://prometheus.io/docs/practices/naming/ for metrics names. More specifically:
- Use `pageserver_` prefix for all pagserver metrics
- Specify `_seconds` unit in time metrics
- Use unit as a suffix in other cases, such as `_hits`, `_bytes`, `_records`
- Use `_total` suffix for accumulating counters (note that Histograms append that suffix internally)
This commit is contained in:
Arthur Petukhovsky
2022-05-12 19:53:07 +03:00
committed by GitHub
parent 4538f1e1b8
commit ec8861b8cc
7 changed files with 20 additions and 20 deletions

View File

@@ -89,7 +89,7 @@ pub use crate::layered_repository::ephemeral_file::writeback as writeback_epheme
// Metrics collected on operations on the storage repository.
lazy_static! {
static ref STORAGE_TIME: HistogramVec = register_histogram_vec!(
"pageserver_storage_time",
"pageserver_storage_operations_seconds",
"Time spent on storage operations",
&["operation", "tenant_id", "timeline_id"]
)
@@ -99,8 +99,8 @@ lazy_static! {
// Metrics collected on operations on the storage repository.
lazy_static! {
static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!(
"pageserver_getpage_reconstruct_time",
"Time spent on storage operations",
"pageserver_getpage_reconstruct_seconds",
"Time spent in reconstruct_value",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric");
@@ -108,13 +108,13 @@ lazy_static! {
lazy_static! {
static ref MATERIALIZED_PAGE_CACHE_HIT: IntCounterVec = register_int_counter_vec!(
"materialize_page_cache_hits",
"pageserver_materialized_cache_hits_total",
"Number of cache hits from materialized page cache",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric");
static ref WAIT_LSN_TIME: HistogramVec = register_histogram_vec!(
"wait_lsn_time",
"pageserver_wait_lsn_seconds",
"Time spent waiting for WAL to arrive",
&["tenant_id", "timeline_id"]
)
@@ -134,12 +134,12 @@ lazy_static! {
// or in testing they estimate how much we would upload if we did.
lazy_static! {
static ref NUM_PERSISTENT_FILES_CREATED: IntCounter = register_int_counter!(
"pageserver_num_persistent_files_created",
"pageserver_created_persistent_files_total",
"Number of files created that are meant to be uploaded to cloud storage",
)
.expect("failed to define a metric");
static ref PERSISTENT_BYTES_WRITTEN: IntCounter = register_int_counter!(
"pageserver_persistent_bytes_written",
"pageserver_written_persistent_bytes_total",
"Total bytes written that are meant to be uploaded to cloud storage",
)
.expect("failed to define a metric");

View File

@@ -45,7 +45,7 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
lazy_static! {
static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!(
"pageserver_live_connections_count",
"pageserver_live_connections",
"Number of live network connections",
&["pageserver_connection_kind"]
)

View File

@@ -326,7 +326,7 @@ const TIME_BUCKETS: &[f64] = &[
lazy_static! {
static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!(
"pageserver_smgr_query_time",
"pageserver_smgr_query_seconds",
"Time spent on smgr query handling",
&["smgr_query_type", "tenant_id", "timeline_id"],
TIME_BUCKETS.into()

View File

@@ -208,12 +208,12 @@ lazy_static! {
)
.expect("failed to register pageserver remote storage remaining sync items int gauge");
static ref FATAL_TASK_FAILURES: IntCounter = register_int_counter!(
"pageserver_remote_storage_fatal_task_failures",
"pageserver_remote_storage_fatal_task_failures_total",
"Number of critically failed tasks"
)
.expect("failed to register pageserver remote storage remaining sync items int gauge");
static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!(
"pageserver_remote_storage_image_sync_time",
"pageserver_remote_storage_image_sync_seconds",
"Time took to synchronize (download or upload) a whole pageserver image. \
Grouped by `operation_kind` (upload|download) and `status` (success|failure)",
&["operation_kind", "status"],

View File

@@ -34,7 +34,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
lazy_static! {
static ref STORAGE_IO_TIME: HistogramVec = register_histogram_vec!(
"pageserver_io_time",
"pageserver_io_operations_seconds",
"Time spent in IO operations",
&["operation", "tenant_id", "timeline_id"],
STORAGE_IO_TIME_BUCKETS.into()
@@ -43,8 +43,8 @@ lazy_static! {
}
lazy_static! {
static ref STORAGE_IO_SIZE: IntGaugeVec = register_int_gauge_vec!(
"pageserver_io_size",
"Amount of bytes",
"pageserver_io_operations_bytes_total",
"Total amount of bytes read/written in IO operations",
&["operation", "tenant_id", "timeline_id"]
)
.expect("failed to define a metric");

View File

@@ -106,16 +106,16 @@ impl crate::walredo::WalRedoManager for DummyRedoManager {
// each tenant.
lazy_static! {
static ref WAL_REDO_TIME: Histogram =
register_histogram!("pageserver_wal_redo_time", "Time spent on WAL redo")
register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo")
.expect("failed to define a metric");
static ref WAL_REDO_WAIT_TIME: Histogram = register_histogram!(
"pageserver_wal_redo_wait_time",
"pageserver_wal_redo_wait_seconds",
"Time spent waiting for access to the WAL redo process"
)
.expect("failed to define a metric");
static ref WAL_REDO_RECORD_COUNTER: IntCounter = register_int_counter!(
"pageserver_wal_records_replayed",
"Number of WAL records replayed"
"pageserver_replayed_wal_records_total",
"Number of WAL records replayed in WAL redo process"
)
.unwrap();
}

View File

@@ -106,9 +106,9 @@ class ZenithCompare(PgCompare):
report=MetricReport.LOWER_IS_BETTER)
total_files = self.zenbenchmark.get_int_counter_value(
self.env.pageserver, "pageserver_num_persistent_files_created")
self.env.pageserver, "pageserver_created_persistent_files_total")
total_bytes = self.zenbenchmark.get_int_counter_value(
self.env.pageserver, "pageserver_persistent_bytes_written")
self.env.pageserver, "pageserver_written_persistent_bytes_total")
self.zenbenchmark.record("data_uploaded",
total_bytes / (1024 * 1024),
"MB",