Measure pageserver wal recovery time and fix flush() method (#5240)

This commit is contained in:
bojanserafimov
2023-09-11 09:46:06 -04:00
committed by GitHub
parent d7fa2dba2d
commit c0ed362790
3 changed files with 39 additions and 7 deletions

View File

@@ -14,6 +14,7 @@ from fixtures.neon_fixtures import (
PgProtocol,
RemotePostgres,
VanillaPostgres,
wait_for_last_flush_lsn,
)
from fixtures.pg_stats import PgStatTable
@@ -129,6 +130,7 @@ class NeonCompare(PgCompare):
return self._pg_bin
def flush(self):
wait_for_last_flush_lsn(self.env, self._pg, self.tenant, self.timeline)
self.pageserver_http_client.timeline_checkpoint(self.tenant, self.timeline)
self.pageserver_http_client.timeline_gc(self.tenant, self.timeline, 0)

View File

@@ -154,16 +154,17 @@ def wait_for_last_record_lsn(
lsn: Lsn,
) -> Lsn:
"""waits for pageserver to catch up to a certain lsn, returns the last observed lsn."""
for i in range(10):
for i in range(100):
current_lsn = last_record_lsn(pageserver_http, tenant, timeline)
if current_lsn >= lsn:
return current_lsn
log.info(
"waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
lsn, current_lsn, i + 1
if i % 10 == 0:
log.info(
"waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
lsn, current_lsn, i + 1
)
)
)
time.sleep(1)
time.sleep(0.1)
raise Exception(
"timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn)
)

View File

@@ -1,6 +1,8 @@
import shutil
from contextlib import closing
from fixtures.compare_fixtures import PgCompare
from fixtures.compare_fixtures import NeonCompare, PgCompare
from fixtures.pg_version import PgVersion
#
@@ -28,3 +30,30 @@ def test_bulk_insert(neon_with_baseline: PgCompare):
env.report_peak_memory_use()
env.report_size()
# When testing neon, also check how long it takes the pageserver to reingest the
# wal from safekeepers. If this number is close to total runtime, then the pageserver
# is the bottleneck.
if isinstance(env, NeonCompare):
measure_recovery_time(env)
def measure_recovery_time(env: NeonCompare):
client = env.env.pageserver.http_client()
pg_version = PgVersion(client.timeline_detail(env.tenant, env.timeline)["pg_version"])
# Stop pageserver and remove tenant data
env.env.pageserver.stop()
timeline_dir = env.env.timeline_dir(env.tenant, env.timeline)
shutil.rmtree(timeline_dir)
# Start pageserver
env.env.pageserver.start()
# Measure recovery time
with env.record_duration("wal_recovery"):
# Create the tenant, which will start walingest
client.timeline_create(pg_version, env.tenant, env.timeline)
# Flush, which will also wait for lsn to catch up
env.flush()