From 212920e47ebc1aef86456738e312f3b046f9d2ba Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 1 Sep 2021 18:24:42 +0300 Subject: [PATCH] Collect and expose I/O disk write metrics --- Cargo.lock | 9 +++-- zenith_metrics/Cargo.toml | 7 +--- zenith_metrics/src/lib.rs | 67 ++++++++++++++++++++++++++++++- zenith_utils/src/http/endpoint.rs | 1 + 4 files changed, 74 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c1b737ca94..e21bbf5092 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -903,9 +903,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.98" +version = "0.2.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "320cfe77175da3a483efed4bc0adc1968ca050b098ce4f2f1c13a56626128790" +checksum = "3cb00336871be5ed2c8ed44b60ae9959dc5b9f08539422ed43f09e34ecaeba21" [[package]] name = "libloading" @@ -1536,9 +1536,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.5.4" +version = "1.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" +checksum = "2a26af418b574bd56588335b3a3659a65725d4e636eb1016c2f9e3b38c7cc759" dependencies = [ "aho-corasick", "memchr", @@ -2619,6 +2619,7 @@ name = "zenith_metrics" version = "0.1.0" dependencies = [ "lazy_static", + "libc", "prometheus", ] diff --git a/zenith_metrics/Cargo.toml b/zenith_metrics/Cargo.toml index bf605dd7c7..c4998e1c92 100644 --- a/zenith_metrics/Cargo.toml +++ b/zenith_metrics/Cargo.toml @@ -3,10 +3,7 @@ name = "zenith_metrics" version = "0.1.0" edition = "2018" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [dependencies] prometheus = "0.12" - -[dev-dependencies] -lazy_static = "1.4.0" +libc = "0.2" +lazy_static = "1.4" diff --git a/zenith_metrics/src/lib.rs b/zenith_metrics/src/lib.rs index 5f858d7029..47e652943a 100644 --- 
a/zenith_metrics/src/lib.rs +++ b/zenith_metrics/src/lib.rs @@ -2,7 +2,7 @@ //! make sure that we use the same dep version everywhere. //! Otherwise, we might not see all metrics registered via //! a default registry. -pub use prometheus::gather; +use lazy_static::lazy_static; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_histogram, Histogram}; pub use prometheus::{register_histogram_vec, HistogramVec}; @@ -13,4 +13,69 @@ pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; pub use prometheus::{Encoder, TextEncoder}; mod wrappers; +use libc::{c_long, getrusage, rusage, suseconds_t, time_t, timeval, RUSAGE_SELF}; pub use wrappers::{CountedReader, CountedWriter}; + +/// Gathers all Prometheus metrics and records the I/O stats just before that. +/// +/// Metrics gathering is a relatively simple and standalone operation, so +/// it might be fine to do it this way to keep things simple. +pub fn gather() -> Vec { + update_io_metrics(); + prometheus::gather() +} + +lazy_static! { + static ref DISK_IO_BYTES: IntGaugeVec = register_int_gauge_vec!( + "pageserver_disk_io_bytes", + "Bytes written and read from disk, grouped by the operation (read|write)", + &["io_operation"] + ) + .expect("Failed to register disk i/o bytes int gauge vec"); +} + +// Records I/O stats in a "cross-platform" way. +// Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats. +// An alternative is to read procfs (`/proc/[pid]/io`) which does not work under macOS at all, hence abandoned. +// +// Uses https://www.freebsd.org/cgi/man.cgi?query=getrusage to retrieve the number of block operations +// performed by the process. +// We know the size of the block, so we can determine the I/O bytes out of it. +// The value might not be 100% exact, but should be fine for Prometheus metrics in this case. 
+fn update_io_metrics() { + let mut usage = rusage { + ru_utime: timeval { + tv_sec: 0 as time_t, + tv_usec: 0 as suseconds_t, + }, + ru_stime: timeval { + tv_sec: 0 as time_t, + tv_usec: 0 as suseconds_t, + }, + ru_maxrss: 0 as c_long, + ru_ixrss: 0 as c_long, + ru_idrss: 0 as c_long, + ru_isrss: 0 as c_long, + ru_minflt: 0 as c_long, + ru_majflt: 0 as c_long, + ru_nswap: 0 as c_long, + ru_inblock: 0 as c_long, + ru_oublock: 0 as c_long, + ru_msgsnd: 0 as c_long, + ru_msgrcv: 0 as c_long, + ru_nsignals: 0 as c_long, + ru_nvcsw: 0 as c_long, + ru_nivcsw: 0 as c_long, + }; + unsafe { + getrusage(RUSAGE_SELF, (&mut usage) as *mut rusage); + } + + const BYTES_IN_BLOCK: i64 = 512; + DISK_IO_BYTES + .with_label_values(&["read"]) + .set(usage.ru_inblock * BYTES_IN_BLOCK); + DISK_IO_BYTES + .with_label_values(&["write"]) + .set(usage.ru_oublock * BYTES_IN_BLOCK); +} diff --git a/zenith_utils/src/http/endpoint.rs b/zenith_utils/src/http/endpoint.rs index e6239aaa5d..2c476a2758 100644 --- a/zenith_utils/src/http/endpoint.rs +++ b/zenith_utils/src/http/endpoint.rs @@ -31,6 +31,7 @@ async fn prometheus_metrics_handler(_req: Request) -> Result