mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-07 21:42:56 +00:00
Measure pageserver wal recovery time and fix flush() method (#5240)
This commit is contained in:
@@ -14,6 +14,7 @@ from fixtures.neon_fixtures import (
|
||||
PgProtocol,
|
||||
RemotePostgres,
|
||||
VanillaPostgres,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
from fixtures.pg_stats import PgStatTable
|
||||
|
||||
@@ -129,6 +130,7 @@ class NeonCompare(PgCompare):
|
||||
return self._pg_bin
|
||||
|
||||
def flush(self):
|
||||
wait_for_last_flush_lsn(self.env, self._pg, self.tenant, self.timeline)
|
||||
self.pageserver_http_client.timeline_checkpoint(self.tenant, self.timeline)
|
||||
self.pageserver_http_client.timeline_gc(self.tenant, self.timeline, 0)
|
||||
|
||||
|
||||
@@ -154,16 +154,17 @@ def wait_for_last_record_lsn(
|
||||
lsn: Lsn,
|
||||
) -> Lsn:
|
||||
"""waits for pageserver to catch up to a certain lsn, returns the last observed lsn."""
|
||||
for i in range(10):
|
||||
for i in range(100):
|
||||
current_lsn = last_record_lsn(pageserver_http, tenant, timeline)
|
||||
if current_lsn >= lsn:
|
||||
return current_lsn
|
||||
log.info(
|
||||
"waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
|
||||
lsn, current_lsn, i + 1
|
||||
if i % 10 == 0:
|
||||
log.info(
|
||||
"waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
|
||||
lsn, current_lsn, i + 1
|
||||
)
|
||||
)
|
||||
)
|
||||
time.sleep(1)
|
||||
time.sleep(0.1)
|
||||
raise Exception(
|
||||
"timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn)
|
||||
)
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
import shutil
|
||||
from contextlib import closing
|
||||
|
||||
from fixtures.compare_fixtures import PgCompare
|
||||
from fixtures.compare_fixtures import NeonCompare, PgCompare
|
||||
from fixtures.pg_version import PgVersion
|
||||
|
||||
|
||||
#
|
||||
@@ -28,3 +30,30 @@ def test_bulk_insert(neon_with_baseline: PgCompare):
|
||||
|
||||
env.report_peak_memory_use()
|
||||
env.report_size()
|
||||
|
||||
# When testing neon, also check how long it takes the pageserver to reingest the
|
||||
# wal from safekeepers. If this number is close to total runtime, then the pageserver
|
||||
# is the bottleneck.
|
||||
if isinstance(env, NeonCompare):
|
||||
measure_recovery_time(env)
|
||||
|
||||
|
||||
def measure_recovery_time(env: NeonCompare):
|
||||
client = env.env.pageserver.http_client()
|
||||
pg_version = PgVersion(client.timeline_detail(env.tenant, env.timeline)["pg_version"])
|
||||
|
||||
# Stop pageserver and remove tenant data
|
||||
env.env.pageserver.stop()
|
||||
timeline_dir = env.env.timeline_dir(env.tenant, env.timeline)
|
||||
shutil.rmtree(timeline_dir)
|
||||
|
||||
# Start pageserver
|
||||
env.env.pageserver.start()
|
||||
|
||||
# Measure recovery time
|
||||
with env.record_duration("wal_recovery"):
|
||||
# Create the tenant, which will start walingest
|
||||
client.timeline_create(pg_version, env.tenant, env.timeline)
|
||||
|
||||
# Flush, which will also wait for lsn to catch up
|
||||
env.flush()
|
||||
|
||||
Reference in New Issue
Block a user