Compare commits

...

3 Commits

Author SHA1 Message Date
John Spray
c281c80f0c tests: demonstrate standby_horizon issue with multiple safekeepers 2024-08-22 10:27:55 +00:00
John Spray
3cb39db80c DEBUG 2024-08-22 10:27:55 +00:00
John Spray
12b8d921be Multiple safekeepers in test_hot_standby_gc 2024-08-22 10:27:55 +00:00
2 changed files with 40 additions and 0 deletions

View File

@@ -731,6 +731,12 @@ impl ConnectionManagerState {
timeline_update.standby_horizon
);
if timeline_update.standby_horizon != 0 {
tracing::info!(
"register_timeline_update: sk={} standby_horizon={} commit_lsn={}",
timeline_update.safekeeper_id,
Lsn(timeline_update.standby_horizon),
Lsn(timeline_update.commit_lsn)
);
// ignore reports from safekeepers not connected to replicas
self.timeline
.standby_horizon
@@ -739,6 +745,12 @@ impl ConnectionManagerState {
.metrics
.standby_horizon_gauge
.set(timeline_update.standby_horizon as i64);
} else {
tracing::info!(
"register_timeline_update: sk={} standby_horizon=None commit_lsn={}",
timeline_update.safekeeper_id,
Lsn(timeline_update.commit_lsn)
);
}
let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);

View File

@@ -131,6 +131,11 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
# set PITR interval to be small, so we can do GC
"pitr_interval": "0 s",
}
# Make sure that standby_horizon feedback still works when the standby and
# the pageserver might be connected to different safekeepers
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
timeline_id = env.initial_timeline
tenant_id = env.initial_tenant
@@ -189,6 +194,29 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
# generates use old not_modified_since LSNs, older than
# the GC cutoff, but new request LSNs. (In protocol
# version 1 there was only one LSN, and this failed.)
secondary.clear_shared_buffers(cursor=s_cur)
log_replica_lag(primary, secondary)
s_cur.execute("SELECT COUNT(*) FROM test")
log_replica_lag(primary, secondary)
res = s_cur.fetchone()
assert res[0] == 10000
env.safekeepers[0].stop()
# Restart the pageserver and run GC again: the standby_horizon should still be enforced
env.pageserver.stop()
env.pageserver.start()
for tenant_shard_id, pageserver in shards:
client = pageserver.http_client()
client.timeline_checkpoint(tenant_shard_id, timeline_id)
client.timeline_compact(tenant_shard_id, timeline_id)
client.timeline_gc(tenant_shard_id, timeline_id, 0)
# Re-execute the query. The GetPage requests that this
# generates use old not_modified_since LSNs, older than
# the GC cutoff, but new request LSNs. (In protocol
# version 1 there was only one LSN, and this failed.)
secondary.clear_shared_buffers(cursor=s_cur)
log_replica_lag(primary, secondary)
s_cur.execute("SELECT COUNT(*) FROM test")
log_replica_lag(primary, secondary)