From db4059cd6dea4a11bb04354625baed289630d784 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 7 Oct 2021 18:03:20 +0300 Subject: [PATCH] Measure peak memory usage in perf test. Another useful metric to keep an eye on. --- test_runner/fixtures/benchmark_fixture.py | 11 +++++++++++ test_runner/performance/test_bulk_insert.py | 4 ++++ zenith_metrics/src/lib.rs | 10 ++++++++-- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index cced6ca1dd..e972b7bf9e 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -140,6 +140,17 @@ class ZenithBenchmarker: re.MULTILINE) return int(round(float(matches.group(1)))) + def get_peak_mem(self, pageserver) -> int: + """ + Fetch the "maxrss" metric from the pageserver + """ + # Fetch all the exposed prometheus metrics from page server + all_metrics = pageserver.http_client().get_metrics() + # See comment in get_io_writes() + matches = re.search(r'^pageserver_maxrss_kb (\S+)$', all_metrics, + re.MULTILINE) + return int(round(float(matches.group(1)))) + @contextmanager def record_pageserver_writes(self, pageserver, metric_name): """ diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 24e154a5bb..65a5ca8de9 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -25,6 +25,7 @@ def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str): # 1. Time to INSERT 5 million rows # 2. Disk writes # 3. Disk space used +# 4. 
Peak memory usage # def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str): # Create a branch for us @@ -55,6 +56,9 @@ def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg # time and I/O pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") + # Record peak memory usage + zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(pageserver) / 1024, 'MB') + # Report disk space used by the repository timeline_size = get_timeline_size(repo_dir, pageserver.initial_tenant, timeline) zenbenchmark.record('size', timeline_size / (1024*1024), 'MB') diff --git a/zenith_metrics/src/lib.rs b/zenith_metrics/src/lib.rs index 639ba396dc..c972bae72a 100644 --- a/zenith_metrics/src/lib.rs +++ b/zenith_metrics/src/lib.rs @@ -21,7 +21,7 @@ pub use wrappers::{CountedReader, CountedWriter}; /// Metrics gathering is a relatively simple and standalone operation, so /// it might be fine to do it this way to keep things simple. pub fn gather() -> Vec<prometheus::proto::MetricFamily> { - update_io_metrics(); + update_rusage_metrics(); prometheus::gather() } @@ -52,6 +52,11 @@ lazy_static! { &["io_operation"] ) .expect("Failed to register disk i/o bytes int gauge vec"); + static ref MAXRSS_KB: IntGauge = register_int_gauge!( + new_common_metric_name("maxrss_kb"), + "Memory usage (Maximum Resident Set Size)" + ) + .expect("Failed to register maxrss_kb int gauge"); } // Records I/O stats in a "cross-platform" way. @@ -63,7 +68,7 @@ lazy_static! { // We know the size of the block, so we can determine the I/O bytes out of it. // The value might be not 100% exact, but should be fine for Prometheus metrics in this case. 
#[allow(clippy::unnecessary_cast)] -fn update_io_metrics() { +fn update_rusage_metrics() { let rusage_stats = get_rusage_stats(); const BYTES_IN_BLOCK: i64 = 512; @@ -73,6 +78,7 @@ fn update_io_metrics() { DISK_IO_BYTES .with_label_values(&["write"]) .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK); + MAXRSS_KB.set(rusage_stats.ru_maxrss); } fn get_rusage_stats() -> libc::rusage {