Measure peak memory usage in perf test.

Peak memory (the pageserver's maximum resident set size) is another useful metric to keep an eye on.
2026-05-14 03:30:36 +00:00 · 2021-10-07 18:03:20 +03:00
parent fdb19fdb92
commit db4059cd6d
3 changed files with 23 additions and 2 deletions
--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -140,6 +140,17 @@ class ZenithBenchmarker:
                            re.MULTILINE)
        return int(round(float(matches.group(1))))

+    def get_peak_mem(self, pageserver) -> int:
+        """
+        Fetch the "maxrss" metric from the pageserver
+        """
+        # Fetch all the exposed prometheus metrics from page server
+        all_metrics = pageserver.http_client().get_metrics()
+        # See comment in get_io_writes()
+        matches = re.search(r'^pageserver_maxrss_kb (\S+)$', all_metrics,
+                            re.MULTILINE)
+        return int(round(float(matches.group(1))))
+
    @contextmanager
    def record_pageserver_writes(self, pageserver, metric_name):
        """
--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -25,6 +25,7 @@ def get_timeline_size(repo_dir: str, tenantid: str, timelineid: str):
 # 1. Time to INSERT 5 million rows
 # 2. Disk writes
 # 3. Disk space used
+# 4. Peak memory usage
 #
 def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str):
    # Create a branch for us
@@ -55,6 +56,9 @@ def test_bulk_insert(postgres: PostgresFactory, pageserver: ZenithPageserver, pg
                    # time and I/O
                    pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")

+            # Record peak memory usage
+            zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(pageserver) / 1024, 'MB')
+
            # Report disk space used by the repository
            timeline_size = get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
            zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')
--- a/zenith_metrics/src/lib.rs
+++ b/zenith_metrics/src/lib.rs
@@ -21,7 +21,7 @@ pub use wrappers::{CountedReader, CountedWriter};
 /// Metrics gathering is a relatively simple and standalone operation, so
 /// it might be fine to do it this way to keep things simple.
 pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
-    update_io_metrics();
+    update_rusage_metrics();
    prometheus::gather()
 }

@@ -52,6 +52,11 @@ lazy_static! {
        &["io_operation"]
    )
    .expect("Failed to register disk i/o bytes int gauge vec");
+    static ref MAXRSS_KB: IntGauge = register_int_gauge!(
+        new_common_metric_name("maxrss_kb"),
+        "Memory usage (Maximum Resident Set Size)"
+    )
+    .expect("Failed to register maxrss_kb int gauge");
 }

 // Records I/O stats in a "cross-platform" way.
@@ -63,7 +68,7 @@ lazy_static! {
 // We know the size of the block, so we can determine the I/O bytes out of it.
 // The value might be not 100% exact, but should be fine for Prometheus metrics in this case.
 #[allow(clippy::unnecessary_cast)]
-fn update_io_metrics() {
+fn update_rusage_metrics() {
    let rusage_stats = get_rusage_stats();

    const BYTES_IN_BLOCK: i64 = 512;
@@ -73,6 +78,7 @@ fn update_io_metrics() {
    DISK_IO_BYTES
        .with_label_values(&["write"])
        .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
+    MAXRSS_KB.set(rusage_stats.ru_maxrss);
 }

 fn get_rusage_stats() -> libc::rusage {