From ff174a88c0544d1270a6f993810d67a6eb4cc0b2 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 18 Jul 2024 00:03:02 +0300
Subject: [PATCH] test: allow requests to any pageserver to get cancelled (#8413)

Fix flakiness in `test_sharded_timeline_detach_ancestor`, which does not
reproduce on a fast enough runner, by allowing requests to be cancelled
before completing on all pageservers. Previously, this was only allowed
on half of the pageservers.

Failure evidence:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8352/9972357040/index.html#suites/a1c2be32556270764423c495fad75d47/7cca3e3d94fe12f2
---
 .../regress/test_timeline_detach_ancestor.py  | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index d75ab4c060..38f8dfa885 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -702,20 +702,16 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
     # make another of the nodes get stuck, then restart
 
     stuck = pageservers[int(shards[0]["node_id"])]
-    stuck.allowed_errors.append(".*: request was dropped before completing")
-    env.storage_controller.allowed_errors.append(".*: request was dropped before completing")
+    log.info(f"stuck pageserver is id={stuck.id}")
     stuck_http = stuck.http_client()
     stuck_http.configure_failpoints(
         ("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause")
     )
 
     restarted = pageservers[int(shards[1]["node_id"])]
-    restarted.allowed_errors.extend(
-        [
-            ".*: request was dropped before completing",
-            ".*: Cancelled request finished with an error: ShuttingDown",
-        ]
-    )
+    log.info(f"restarted pageserver is id={restarted.id}")
+    # this might be hit; see `restart_restarted`
+    restarted.allowed_errors.append(".*: Cancelled request finished with an error: ShuttingDown")
     assert restarted.id != stuck.id
     restarted_http = restarted.http_client()
     restarted_http.configure_failpoints(
@@ -724,6 +720,14 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
         ]
     )
 
+    for info in shards:
+        pageserver = pageservers[int(info["node_id"])]
+        # the first request can cause these, but does not repeatedly
+        pageserver.allowed_errors.append(".*: request was dropped before completing")
+
+    # first request again
+    env.storage_controller.allowed_errors.append(".*: request was dropped before completing")
+
     target = env.storage_controller.pageserver_api()
 
     with pytest.raises(ReadTimeout):