Measure pageserver wal recovery time and fix flush() method (#5240)

This commit is contained in:
bojanserafimov
2023-09-11 09:46:06 -04:00
committed by GitHub
parent d7fa2dba2d
commit c0ed362790
3 changed files with 39 additions and 7 deletions

View File

@@ -1,6 +1,8 @@
import shutil
from contextlib import closing
from fixtures.compare_fixtures import PgCompare
from fixtures.compare_fixtures import NeonCompare, PgCompare
from fixtures.pg_version import PgVersion
#
@@ -28,3 +30,30 @@ def test_bulk_insert(neon_with_baseline: PgCompare):
env.report_peak_memory_use()
env.report_size()
# When testing neon, also check how long it takes the pageserver to reingest the
# wal from safekeepers. If this number is close to total runtime, then the pageserver
# is the bottleneck.
if isinstance(env, NeonCompare):
measure_recovery_time(env)
def measure_recovery_time(env: NeonCompare):
client = env.env.pageserver.http_client()
pg_version = PgVersion(client.timeline_detail(env.tenant, env.timeline)["pg_version"])
# Stop pageserver and remove tenant data
env.env.pageserver.stop()
timeline_dir = env.env.timeline_dir(env.tenant, env.timeline)
shutil.rmtree(timeline_dir)
# Start pageserver
env.env.pageserver.start()
# Measure recovery time
with env.record_duration("wal_recovery"):
# Create the tenant, which will start walingest
client.timeline_create(pg_version, env.tenant, env.timeline)
# Flush, which will also wait for lsn to catch up
env.flush()