mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-27 18:10:37 +00:00
Fix pageserver metrics names (#1682)
Try to follow Prometheus style-guide https://prometheus.io/docs/practices/naming/ for metrics names. More specifically: - Use `pageserver_` prefix for all pagserver metrics - Specify `_seconds` unit in time metrics - Use unit as a suffix in other cases, such as `_hits`, `_bytes`, `_records` - Use `_total` suffix for accumulating counters (note that Histograms append that suffix internally)
This commit is contained in:
committed by
GitHub
parent
4538f1e1b8
commit
ec8861b8cc
@@ -89,7 +89,7 @@ pub use crate::layered_repository::ephemeral_file::writeback as writeback_epheme
|
||||
// Metrics collected on operations on the storage repository.
|
||||
lazy_static! {
|
||||
static ref STORAGE_TIME: HistogramVec = register_histogram_vec!(
|
||||
"pageserver_storage_time",
|
||||
"pageserver_storage_operations_seconds",
|
||||
"Time spent on storage operations",
|
||||
&["operation", "tenant_id", "timeline_id"]
|
||||
)
|
||||
@@ -99,8 +99,8 @@ lazy_static! {
|
||||
// Metrics collected on operations on the storage repository.
|
||||
lazy_static! {
|
||||
static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!(
|
||||
"pageserver_getpage_reconstruct_time",
|
||||
"Time spent on storage operations",
|
||||
"pageserver_getpage_reconstruct_seconds",
|
||||
"Time spent in reconstruct_value",
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
@@ -108,13 +108,13 @@ lazy_static! {
|
||||
|
||||
lazy_static! {
|
||||
static ref MATERIALIZED_PAGE_CACHE_HIT: IntCounterVec = register_int_counter_vec!(
|
||||
"materialize_page_cache_hits",
|
||||
"pageserver_materialized_cache_hits_total",
|
||||
"Number of cache hits from materialized page cache",
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
static ref WAIT_LSN_TIME: HistogramVec = register_histogram_vec!(
|
||||
"wait_lsn_time",
|
||||
"pageserver_wait_lsn_seconds",
|
||||
"Time spent waiting for WAL to arrive",
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
@@ -134,12 +134,12 @@ lazy_static! {
|
||||
// or in testing they estimate how much we would upload if we did.
|
||||
lazy_static! {
|
||||
static ref NUM_PERSISTENT_FILES_CREATED: IntCounter = register_int_counter!(
|
||||
"pageserver_num_persistent_files_created",
|
||||
"pageserver_created_persistent_files_total",
|
||||
"Number of files created that are meant to be uploaded to cloud storage",
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
static ref PERSISTENT_BYTES_WRITTEN: IntCounter = register_int_counter!(
|
||||
"pageserver_persistent_bytes_written",
|
||||
"pageserver_written_persistent_bytes_total",
|
||||
"Total bytes written that are meant to be uploaded to cloud storage",
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
|
||||
@@ -45,7 +45,7 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
|
||||
|
||||
lazy_static! {
|
||||
static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!(
|
||||
"pageserver_live_connections_count",
|
||||
"pageserver_live_connections",
|
||||
"Number of live network connections",
|
||||
&["pageserver_connection_kind"]
|
||||
)
|
||||
|
||||
@@ -326,7 +326,7 @@ const TIME_BUCKETS: &[f64] = &[
|
||||
|
||||
lazy_static! {
|
||||
static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!(
|
||||
"pageserver_smgr_query_time",
|
||||
"pageserver_smgr_query_seconds",
|
||||
"Time spent on smgr query handling",
|
||||
&["smgr_query_type", "tenant_id", "timeline_id"],
|
||||
TIME_BUCKETS.into()
|
||||
|
||||
@@ -208,12 +208,12 @@ lazy_static! {
|
||||
)
|
||||
.expect("failed to register pageserver remote storage remaining sync items int gauge");
|
||||
static ref FATAL_TASK_FAILURES: IntCounter = register_int_counter!(
|
||||
"pageserver_remote_storage_fatal_task_failures",
|
||||
"pageserver_remote_storage_fatal_task_failures_total",
|
||||
"Number of critically failed tasks"
|
||||
)
|
||||
.expect("failed to register pageserver remote storage remaining sync items int gauge");
|
||||
static ref IMAGE_SYNC_TIME: HistogramVec = register_histogram_vec!(
|
||||
"pageserver_remote_storage_image_sync_time",
|
||||
"pageserver_remote_storage_image_sync_seconds",
|
||||
"Time took to synchronize (download or upload) a whole pageserver image. \
|
||||
Grouped by `operation_kind` (upload|download) and `status` (success|failure)",
|
||||
&["operation_kind", "status"],
|
||||
|
||||
@@ -34,7 +34,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
||||
|
||||
lazy_static! {
|
||||
static ref STORAGE_IO_TIME: HistogramVec = register_histogram_vec!(
|
||||
"pageserver_io_time",
|
||||
"pageserver_io_operations_seconds",
|
||||
"Time spent in IO operations",
|
||||
&["operation", "tenant_id", "timeline_id"],
|
||||
STORAGE_IO_TIME_BUCKETS.into()
|
||||
@@ -43,8 +43,8 @@ lazy_static! {
|
||||
}
|
||||
lazy_static! {
|
||||
static ref STORAGE_IO_SIZE: IntGaugeVec = register_int_gauge_vec!(
|
||||
"pageserver_io_size",
|
||||
"Amount of bytes",
|
||||
"pageserver_io_operations_bytes_total",
|
||||
"Total amount of bytes read/written in IO operations",
|
||||
&["operation", "tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
|
||||
@@ -106,16 +106,16 @@ impl crate::walredo::WalRedoManager for DummyRedoManager {
|
||||
// each tenant.
|
||||
lazy_static! {
|
||||
static ref WAL_REDO_TIME: Histogram =
|
||||
register_histogram!("pageserver_wal_redo_time", "Time spent on WAL redo")
|
||||
register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo")
|
||||
.expect("failed to define a metric");
|
||||
static ref WAL_REDO_WAIT_TIME: Histogram = register_histogram!(
|
||||
"pageserver_wal_redo_wait_time",
|
||||
"pageserver_wal_redo_wait_seconds",
|
||||
"Time spent waiting for access to the WAL redo process"
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
static ref WAL_REDO_RECORD_COUNTER: IntCounter = register_int_counter!(
|
||||
"pageserver_wal_records_replayed",
|
||||
"Number of WAL records replayed"
|
||||
"pageserver_replayed_wal_records_total",
|
||||
"Number of WAL records replayed in WAL redo process"
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
@@ -106,9 +106,9 @@ class ZenithCompare(PgCompare):
|
||||
report=MetricReport.LOWER_IS_BETTER)
|
||||
|
||||
total_files = self.zenbenchmark.get_int_counter_value(
|
||||
self.env.pageserver, "pageserver_num_persistent_files_created")
|
||||
self.env.pageserver, "pageserver_created_persistent_files_total")
|
||||
total_bytes = self.zenbenchmark.get_int_counter_value(
|
||||
self.env.pageserver, "pageserver_persistent_bytes_written")
|
||||
self.env.pageserver, "pageserver_written_persistent_bytes_total")
|
||||
self.zenbenchmark.record("data_uploaded",
|
||||
total_bytes / (1024 * 1024),
|
||||
"MB",
|
||||
|
||||
Reference in New Issue
Block a user