From 21cf4a3e11c042576d5774a5edfa1cbc3f505688 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 3 Sep 2021 09:00:26 +0300 Subject: [PATCH] Include # of bytes written in pgbench benchmark result Now that the page server collects this metric (since commit 212920e47e), let's include it in the performance test results. The new metric looks like this: performance/test_perf_pgbench.py . [100%] --------------- Benchmark results ---------------- test_pgbench.init: 6.784 s test_pgbench.pageserver_writes: 466 MB <---- THIS IS NEW test_pgbench.5000_xacts: 8.196 s test_pgbench.size: 163 MB =============== 1 passed in 21.00s =============== --- test_runner/fixtures/benchmark_fixture.py | 30 ++++++++++++++++++++ test_runner/fixtures/zenith_fixtures.py | 5 ++++ test_runner/performance/test_perf_pgbench.py | 13 +++++---- 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 43e6646d0c..86ca78d000 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -1,6 +1,7 @@ from pprint import pprint import os +import re import timeit import pathlib import uuid @@ -120,6 +121,35 @@ class ZenithBenchmarker: self.results.record(self.request.node.name, metric_name, end - start, 's') + def get_io_writes(self, pageserver) -> int: + """ + Fetch the "cumulative # of bytes written" metric from the pageserver + """ + # Fetch all the exposed prometheus metrics from page server + all_metrics = pageserver.http_client().get_metrics() + # Use a regular expression to extract the one we're interested in + # + # TODO: If we start to collect more of the prometheus metrics in the + # performance test suite like this, we should refactor this to load and + # parse all the metrics into a more convenient structure in one go. + # + # The metric should be an integer, as it's a number of bytes. But in general + # all prometheus metrics are floats. 
So to be pedantic, read it as a float + # and round to integer. + matches = re.search(r'pageserver_disk_io_bytes{io_operation="write"} (\S+)', all_metrics) + return int(round(float(matches.group(1)))) + + @contextmanager + def record_pageserver_writes(self, pageserver, metric_name): + """ + Record bytes written by the pageserver during a test. + """ + before = self.get_io_writes(pageserver) + yield + after = self.get_io_writes(pageserver) + + self.results.record(self.request.node.name, metric_name, round((after - before) / (1024 * 1024)), 'MB') + @pytest.fixture(scope='function') def zenbenchmark(zenbenchmark_global, request) -> Iterator[ZenithBenchmarker]: """ diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index bef4acbd4a..573649b520 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -226,6 +226,11 @@ class ZenithPageserverHttpClient(requests.Session): res.raise_for_status() return res.json() + def get_metrics(self) -> str: + res = self.get(f"http://localhost:{self.port}/metrics") + res.raise_for_status() + return res.text + @dataclass class AuthKeys: diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 91485df1dc..7e0f19bec8 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -46,13 +46,14 @@ def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin connstr = pg.connstr() - # Initialize pgbench database - with zenbenchmark.record_duration('init'): - pg_bin.run_capture(['pgbench', '-s5', '-i', connstr]) + # Initialize pgbench database, recording the time and I/O it takes + with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'): + with zenbenchmark.record_duration('init'): + pg_bin.run_capture(['pgbench', '-s5', '-i', connstr]) - # Flush the layers from memory to disk. 
The time to do that is included in the - # reported init time. - pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") + # Flush the layers from memory to disk. This is included in the reported + # time and I/O + pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0") # Run pgbench for 5000 transactions with zenbenchmark.record_duration('5000_xacts'):