diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py
index a10ef70aa2..1254c4e779 100644
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -14,6 +14,7 @@ from fixtures.neon_fixtures import (
     PgProtocol,
     RemotePostgres,
     VanillaPostgres,
+    wait_for_last_flush_lsn,
 )
 from fixtures.pg_stats import PgStatTable
 
@@ -129,6 +130,7 @@ class NeonCompare(PgCompare):
         return self._pg_bin
 
     def flush(self):
+        wait_for_last_flush_lsn(self.env, self._pg, self.tenant, self.timeline)
         self.pageserver_http_client.timeline_checkpoint(self.tenant, self.timeline)
         self.pageserver_http_client.timeline_gc(self.tenant, self.timeline, 0)
 
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 7d6dec4a14..3197017bd1 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -154,16 +154,17 @@ def wait_for_last_record_lsn(
     lsn: Lsn,
 ) -> Lsn:
     """waits for pageserver to catch up to a certain lsn, returns the last observed lsn."""
-    for i in range(10):
+    for i in range(100):
         current_lsn = last_record_lsn(pageserver_http, tenant, timeline)
         if current_lsn >= lsn:
             return current_lsn
-        log.info(
-            "waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
-                lsn, current_lsn, i + 1
+        if i % 10 == 0:
+            log.info(
+                "waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
+                    lsn, current_lsn, i + 1
+                )
             )
-        )
-        time.sleep(1)
+        time.sleep(0.1)
     raise Exception(
         "timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn)
     )
diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py
index d6e67aa361..92da2dae96 100644
--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -1,6 +1,8 @@
+import shutil
 from contextlib import closing
 
-from fixtures.compare_fixtures import PgCompare
+from fixtures.compare_fixtures import NeonCompare, PgCompare
+from fixtures.pg_version import PgVersion
 
 
 #
@@ -28,3 +30,30 @@ def test_bulk_insert(neon_with_baseline: PgCompare):
 
     env.report_peak_memory_use()
     env.report_size()
+
+    # When testing neon, also check how long it takes the pageserver to reingest the
+    # wal from safekeepers. If this number is close to total runtime, then the pageserver
+    # is the bottleneck.
+    if isinstance(env, NeonCompare):
+        measure_recovery_time(env)
+
+
+def measure_recovery_time(env: NeonCompare):
+    client = env.env.pageserver.http_client()
+    pg_version = PgVersion(client.timeline_detail(env.tenant, env.timeline)["pg_version"])
+
+    # Stop pageserver and remove tenant data
+    env.env.pageserver.stop()
+    timeline_dir = env.env.timeline_dir(env.tenant, env.timeline)
+    shutil.rmtree(timeline_dir)
+
+    # Start pageserver
+    env.env.pageserver.start()
+
+    # Measure recovery time
+    with env.record_duration("wal_recovery"):
+        # Create the timeline, which will start walingest
+        client.timeline_create(pg_version, env.tenant, env.timeline)
+
+        # Flush, which will also wait for lsn to catch up
+        env.flush()