diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py
index 43e6646d0c..86ca78d000 100644
--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -1,6 +1,7 @@
 from pprint import pprint
 import os
+import re
 import timeit
 import pathlib
 import uuid
@@ -120,6 +121,35 @@ class ZenithBenchmarker:

         self.results.record(self.request.node.name, metric_name, end - start, 's')

+    def get_io_writes(self, pageserver) -> int:
+        """
+        Fetch the "cumulative # of bytes written" metric from the pageserver
+        """
+        # Fetch all the exposed prometheus metrics from page server
+        all_metrics = pageserver.http_client().get_metrics()
+        # Use a regular expression to extract the one we're interested in
+        #
+        # TODO: If we start to collect more of the prometheus metrics in the
+        # performance test suite like this, we should refactor this to load and
+        # parse all the metrics into a more convenient structure in one go.
+        #
+        # The metric should be an integer, as it's a number of bytes. But in general
+        # all prometheus metrics are floats. So to be pedantic, read it as a float
+        # and round to integer.
+        matches = re.search(r'pageserver_disk_io_bytes{io_operation="write"} (\S+)', all_metrics)
+        return int(round(float(matches.group(1))))
+
+    @contextmanager
+    def record_pageserver_writes(self, pageserver, metric_name):
+        """
+        Record bytes written by the pageserver during a test.
+        """
+        before = self.get_io_writes(pageserver)
+        yield
+        after = self.get_io_writes(pageserver)
+
+        self.results.record(self.request.node.name, metric_name, round((after - before) / (1024 * 1024)), 'MB')
+
 @pytest.fixture(scope='function')
 def zenbenchmark(zenbenchmark_global, request) -> Iterator[ZenithBenchmarker]:
     """
diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py
index bef4acbd4a..573649b520 100644
--- a/test_runner/fixtures/zenith_fixtures.py
+++ b/test_runner/fixtures/zenith_fixtures.py
@@ -226,6 +226,11 @@ class ZenithPageserverHttpClient(requests.Session):
         res.raise_for_status()
         return res.json()

+    def get_metrics(self) -> str:
+        res = self.get(f"http://localhost:{self.port}/metrics")
+        res.raise_for_status()
+        return res.text
+
 @dataclass
 class AuthKeys:
diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py
index 91485df1dc..7e0f19bec8 100644
--- a/test_runner/performance/test_perf_pgbench.py
+++ b/test_runner/performance/test_perf_pgbench.py
@@ -46,13 +46,14 @@ def test_pgbench(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin

     connstr = pg.connstr()

-    # Initialize pgbench database
-    with zenbenchmark.record_duration('init'):
-        pg_bin.run_capture(['pgbench', '-s5', '-i', connstr])
+    # Initialize pgbench database, recording the time and I/O it takes
+    with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
+        with zenbenchmark.record_duration('init'):
+            pg_bin.run_capture(['pgbench', '-s5', '-i', connstr])

-    # Flush the layers from memory to disk. The time to do that is included in the
-    # reported init time.
-    pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
+        # Flush the layers from memory to disk. This is included in the reported
+        # time and I/O.
+        pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")

     # Run pgbench for 5000 transactions
     with zenbenchmark.record_duration('5000_xacts'):
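
Note on the TODO in get_io_writes(): one possible shape for the suggested refactoring is a minimal sketch along these lines, assuming the prometheus_client package is available in the test environment. parse_metrics is a hypothetical helper, not part of this diff; get_metrics() is the accessor added to ZenithPageserverHttpClient above.

    # Hypothetical sketch: parse the whole /metrics payload once instead of
    # regexp-matching the raw text for each individual metric.
    from prometheus_client.parser import text_string_to_metric_families

    def parse_metrics(text: str) -> dict:
        """Flatten the prometheus text exposition into {(name, labels): value}."""
        parsed = {}
        for family in text_string_to_metric_families(text):
            for sample in family.samples:
                # Freeze the label dict so the (name, labels) pair is hashable
                parsed[(sample.name, frozenset(sample.labels.items()))] = sample.value
        return parsed

    # get_io_writes() would then reduce to a dictionary lookup:
    metrics = parse_metrics(pageserver.http_client().get_metrics())
    writes = int(metrics[("pageserver_disk_io_bytes",
                          frozenset({"io_operation": "write"}.items()))])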