From db4059cd6dea4a11bb04354625baed289630d784 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 7 Oct 2021 18:03:20 +0300 Subject: [PATCH] Measure peak memory usage in perf test. Another useful metric to keep an eye on. --- test_runner/fixtures/benchmark_fixture.py | 11 +++++++++++ test_runner/performance/test_bulk_insert.py | 4 ++++ zenith_metrics/src/lib.rs | 10 ++++++++-- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index cced6ca1dd..e972b7bf9e 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -140,6 +140,17 @@ class ZenithBenchmarker: re.MULTILINE) return int(round(float(matches.group(1)))) + def get_peak_mem(self, pageserver) -> int: + """ + Fetch the "maxrss" metric from the pageserver + """ + # Fetch all the exposed prometheus metrics from page server + all_metrics = pageserver.http_client().get_metrics() + # See comment in get_io_writes() + matches = re.search(r'^pageserver_maxrss_kb (\S+)$', all_metrics, + re.MULTILINE) + return int(round(float(matches.group(1)))) + @contextmanager def record_pageserver_writes(self, pageserver, metric_name): """ diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index 24e154a5bb..65a5ca8de9 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -25,6 +25,7 @@ def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str): # 1. Time to INSERT 5 million rows # 2. Disk writes # 3. Disk space used +# 4. 
Peak memory usage # def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str): # Create a branch for us @@ -55,6 +56,9 @@ def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg # time and I/O pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") + # Record peak memory usage + zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(pageserver) / 1024, 'MB') + # Report disk space used by the repository timeline_size = get_timeline_size(repo_dir, pageserver.initial_tenant, timeline) zenbenchmark.record('size', timeline_size / (1024*1024), 'MB') diff --git a/zenith_metrics/src/lib.rs b/zenith_metrics/src/lib.rs index 639ba396dc..c972bae72a 100644 --- a/zenith_metrics/src/lib.rs +++ b/zenith_metrics/src/lib.rs @@ -21,7 +21,7 @@ pub use wrappers::{CountedReader, CountedWriter}; /// Metrics gathering is a relatively simple and standalone operation, so /// it might be fine to do it this way to keep things simple. pub fn gather() -> Vec<prometheus::proto::MetricFamily> { - update_io_metrics(); + update_rusage_metrics(); prometheus::gather() } @@ -52,6 +52,11 @@ lazy_static! { &["io_operation"] ) .expect("Failed to register disk i/o bytes int gauge vec"); + static ref MAXRSS_KB: IntGauge = register_int_gauge!( + new_common_metric_name("maxrss_kb"), + "Memory usage (Maximum Resident Set Size)" + ) + .expect("Failed to register maxrss_kb int gauge"); } // Records I/O stats in a "cross-platform" way. @@ -63,7 +68,7 @@ lazy_static! { // We know the size of the block, so we can determine the I/O bytes out of it. // The value might be not 100% exact, but should be fine for Prometheus metrics in this case. 
#[allow(clippy::unnecessary_cast)] -fn update_io_metrics() { +fn update_rusage_metrics() { let rusage_stats = get_rusage_stats(); const BYTES_IN_BLOCK: i64 = 512; @@ -73,6 +78,7 @@ fn update_io_metrics() { DISK_IO_BYTES .with_label_values(&["write"]) .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK); + MAXRSS_KB.set(rusage_stats.ru_maxrss); } fn get_rusage_stats() -> libc::rusage {