From 8884865bca564033b3e1632d088f21651f1a90be Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 11 Apr 2025 11:38:05 +0200
Subject: [PATCH] tests: make `test_pageserver_getpage_throttle` less flaky 
 (#11482)

# Refs

- fixes https://github.com/neondatabase/neon/issues/11395

# Problem

Since 2025-03-10, we have observed increased flakiness of
`test_pageserver_getpage_throttle`.

The test is timing-dependent by nature, and was hitting the

```
 assert duration_secs >= 10 * actual_smgr_query_seconds, (
        "smgr metrics should not include throttle wait time"
    )
```

quite frequently.

# Analysis

These failures are not reproducible.

In this PR's history is a commit that reran the test 100 times without
requiring a single retry.

In https://github.com/neondatabase/neon/issues/11395 there is a link to
a query to the test results database.
It shows that the flakiness was not constant, but rather episodic:
2025-03-{10,11,12,13} 2025-03-{19,20,21} 2025-03-31 and 2025-04-01.

To me, this suggests variability in available CPU.

# Solution

The point of the offending assertion is to ensure that most of the
request latency is spent on throttling, because testing of the
throttling mechanism is the point of the test.
The `10` magic number means at most 10% of mean latency may be spent on
request processing.

Ideally we would control the passage of time (virtual clock source) to
make this test deterministic.

But I don't see that happening in our regression test setup.

So, this PR de-flakes the test as follows:
- allot up to 66% of mean latency for request processing
- increase duration from 10s to 20s, hoping to get better protection
from momentary CPU spikes in noisy neighbor tests or VMs on the runner
host

As a drive-by, switch to `pytest.approx` and remove one self-test
assertion I can't make sense of anymore.
---
 .../test_pageserver_getpage_throttle.py       | 24 ++++++++-----------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py
index bc186477e1..5ef63e2fe9 100644
--- a/test_runner/regress/test_pageserver_getpage_throttle.py
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -15,7 +15,6 @@ if TYPE_CHECKING:
     from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
 
 
-@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/11395")
 def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     env = neon_env_builder.init_start()
 
@@ -96,17 +95,12 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
     _, marker_offset = wait_until(lambda: env.pageserver.assert_log_contains(marker, offset=None))
 
     log.info("run pagebench")
-    duration_secs = 10
+    duration_secs = 20
     actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs)
 
     log.info("validate the client is capped at the configured rps limit")
     expect_ncompleted = duration_secs * rate_limit_rps
-    delta_abs = abs(expect_ncompleted - actual_ncompleted)
-    threshold = 0.05 * expect_ncompleted
-    assert threshold / rate_limit_rps < 0.1 * duration_secs, (
-        "test self-test: unrealistic expecations regarding precision in this test"
-    )
-    assert delta_abs < 0.05 * expect_ncompleted, (
+    assert pytest.approx(expect_ncompleted, 0.05) == actual_ncompleted, (
         "the throttling deviates more than 5percent from the expectation"
     )
 
@@ -120,6 +114,7 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
         timeout=compaction_period,
     )
 
+    log.info("validate the metrics")
     smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query)
     assert smgr_query_seconds_post is not None
     throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query)
@@ -128,12 +123,13 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
     actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre
     actual_throttled_secs = actual_throttled_usecs / 1_000_000
 
-    log.info("validate that the metric doesn't include throttle wait time")
-    assert duration_secs >= 10 * actual_smgr_query_seconds, (
-        "smgr metrics should not include throttle wait time"
+    assert pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs, (
+        "throttling and processing latency = total request time; this assert validates thi holds on average"
     )
 
-    log.info("validate that the throttling wait time metrics is correct")
-    assert pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs, (
-        "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates"
+    # without this assertion, the test would pass even if the throttling was completely broken
+    # but the request processing is so slow that it makes up for the latency that a correct throttling
+    # implementation would add
+    assert actual_smgr_query_seconds < 0.66 * duration_secs, (
+        "test self-test: request processing is consuming most of the wall clock time; this risks that we're not actually testing throttling"
     )