From 643448b1a23f48bee3e0c9c679ad9b787e46b73d Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 24 Jul 2025 16:00:22 +0200
Subject: [PATCH] test_hot_standby_gc: work around standby_horizon-related
 flakiness/raciness uncovered by #12431 (#12704)

PR #12431 set initial lease deadline = 0s for tests.
This turned test_hot_standby_gc flaky because it now runs GC: it started
failing with `tried to request a page version that was garbage
collected` because the replica reads below the applied GC cutoff.

The leading theory is that we run timeline_gc() before the first
standby_horizon push arrives at the pageserver (PS). That is definitely
something that can happen with the current standby_horizon mechanism, and
it's now tracked as such in https://databricks.atlassian.net/browse/LKB-2499.

We don't have logs to confirm this theory, but regardless, let's try the
fix in this PR and see whether it stabilizes things.

Refs
- flaky test issue: https://databricks.atlassian.net/browse/LKB-2465

## Problem

## Summary of changes
---
 test_runner/regress/test_hot_standby.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index 1ff61ce8dc..a329a5e842 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -133,6 +133,9 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
     tenant_conf = {
         # set PITR interval to be small, so we can do GC
         "pitr_interval": "0 s",
+        # we want to control gc and checkpoint frequency precisely
+        "gc_period": "0s",
+        "compaction_period": "0s",
     }
     env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
     timeline_id = env.initial_timeline
@@ -186,6 +189,23 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
                 client = pageserver.http_client()
                 client.timeline_checkpoint(tenant_shard_id, timeline_id)
                 client.timeline_compact(tenant_shard_id, timeline_id)
+                # Wait for standby horizon to get propagated.
+                # This shouldn't be necessary, but the current mechanism for
+                # standby_horizon propagation is imperfect. Detailed
+                # description in https://databricks.atlassian.net/browse/LKB-2499
+                while True:
+                    val = client.get_metric_value(
+                        "pageserver_standby_horizon",
+                        {
+                            "tenant_id": str(tenant_shard_id.tenant_id),
+                            "shard_id": str(tenant_shard_id.shard_index),
+                            "timeline_id": str(timeline_id),
+                        },
+                    )
+                    log.info("waiting for next standby_horizon push from safekeeper, {val=}")
+                    if val != 0:
+                        break
+                    time.sleep(0.1)
                 client.timeline_gc(tenant_shard_id, timeline_id, 0)
 
             # Re-execute the query. The GetPage requests that this