test: allow requests to any pageserver to get cancelled (#8413)

Fix flakiness in `test_sharded_timeline_detach_ancestor`, which does not reproduce on a fast enough runner, by allowing the "request was dropped before completing" error on all pageservers. Previously it was only allowed on half of the pageservers. Failure evidence: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8352/9972357040/index.html#suites/a1c2be32556270764423c495fad75d47/7cca3e3d94fe12f2
2026-01-09 06:22:57 +00:00 · 2024-07-18 00:03:02 +03:00
parent 0c236fa465
commit e250b9e063
1 changed files with 12 additions and 8 deletions
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -702,20 +702,16 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
    # make another of the nodes get stuck, then restart

    stuck = pageservers[int(shards[0]["node_id"])]
-    stuck.allowed_errors.append(".*: request was dropped before completing")
-    env.storage_controller.allowed_errors.append(".*: request was dropped before completing")
+    log.info(f"stuck pageserver is id={stuck.id}")
    stuck_http = stuck.http_client()
    stuck_http.configure_failpoints(
        ("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause")
    )

    restarted = pageservers[int(shards[1]["node_id"])]
-    restarted.allowed_errors.extend(
-        [
-            ".*: request was dropped before completing",
-            ".*: Cancelled request finished with an error: ShuttingDown",
-        ]
-    )
+    log.info(f"restarted pageserver is id={restarted.id}")
+    # this might be hit; see `restart_restarted`
+    restarted.allowed_errors.append(".*: Cancelled request finished with an error: ShuttingDown")
    assert restarted.id != stuck.id
    restarted_http = restarted.http_client()
    restarted_http.configure_failpoints(
@@ -724,6 +720,14 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
        ]
    )

+    for info in shards:
+        pageserver = pageservers[int(info["node_id"])]
+        # the first request can cause these, but does not repeatedly
+        pageserver.allowed_errors.append(".*: request was dropped before completing")
+
+    # first request again
+    env.storage_controller.allowed_errors.append(".*: request was dropped before completing")
+
    target = env.storage_controller.pageserver_api()

    with pytest.raises(ReadTimeout):