From 643448b1a23f48bee3e0c9c679ad9b787e46b73d Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 24 Jul 2025 16:00:22 +0200 Subject: [PATCH] test_hot_standby_gc: work around standby_horizon-related flakiness/raciness uncovered by #12431 (#12704) PR #12431 set initial lease deadline = 0s for tests. This turned test_hot_standby_gc flaky because it now runs GC: it started failing with `tried to request a page version that was garbage collected` because the replica reads below the applied GC cutoff. The leading theory is that we run timeline_gc() before the first standby_horizon push arrives at PS. That can definitely happen with the current standby_horizon mechanism, and it's now tracked as such in https://databricks.atlassian.net/browse/LKB-2499. We don't have logs to confirm this theory though, but regardless, try the fix in this PR and see if it stabilizes things. Refs - flaky test issue: https://databricks.atlassian.net/browse/LKB-2465 ## Problem ## Summary of changes --- test_runner/regress/test_hot_standby.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 1ff61ce8dc..a329a5e842 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -133,6 +133,9 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): tenant_conf = { # set PITR interval to be small, so we can do GC "pitr_interval": "0 s", + # we want to control gc and checkpoint frequency precisely + "gc_period": "0s", + "compaction_period": "0s", } env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) timeline_id = env.initial_timeline @@ -186,6 +189,23 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): client = pageserver.http_client() client.timeline_checkpoint(tenant_shard_id, timeline_id) client.timeline_compact(tenant_shard_id, timeline_id) + # Wait for 
standby horizon to get propagated. + # This shouldn't be necessary, but the current mechanism for + # standby_horizon propagation is imperfect. Detailed + # description in https://databricks.atlassian.net/browse/LKB-2499 + while True: + val = client.get_metric_value( + "pageserver_standby_horizon", + { + "tenant_id": str(tenant_shard_id.tenant_id), + "shard_id": str(tenant_shard_id.shard_index), + "timeline_id": str(timeline_id), + }, + ) + log.info(f"waiting for next standby_horizon push from safekeeper, {val=}") + if val != 0: + break + time.sleep(0.1) client.timeline_gc(tenant_shard_id, timeline_id, 0) # Re-execute the query. The GetPage requests that this