From ff174a88c0544d1270a6f993810d67a6eb4cc0b2 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 18 Jul 2024 00:03:02 +0300 Subject: [PATCH] test: allow requests to any pageserver get cancelled (#8413) Fix flakyness on `test_sharded_timeline_detach_ancestor` which does not reproduce on a fast enough runner by allowing cancelled request before completing on all pageservers. It was only allowed on half of the pageservers. Failure evidence: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8352/9972357040/index.html#suites/a1c2be32556270764423c495fad75d47/7cca3e3d94fe12f2 --- .../regress/test_timeline_detach_ancestor.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index d75ab4c060..38f8dfa885 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -702,20 +702,16 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): # make another of the nodes get stuck, then restart stuck = pageservers[int(shards[0]["node_id"])] - stuck.allowed_errors.append(".*: request was dropped before completing") - env.storage_controller.allowed_errors.append(".*: request was dropped before completing") + log.info(f"stuck pageserver is id={stuck.id}") stuck_http = stuck.http_client() stuck_http.configure_failpoints( ("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause") ) restarted = pageservers[int(shards[1]["node_id"])] - restarted.allowed_errors.extend( - [ - ".*: request was dropped before completing", - ".*: Cancelled request finished with an error: ShuttingDown", - ] - ) + log.info(f"restarted pageserver is id={restarted.id}") + # this might be hit; see `restart_restarted` + restarted.allowed_errors.append(".*: Cancelled request finished with an error: ShuttingDown") assert restarted.id != stuck.id restarted_http = restarted.http_client() restarted_http.configure_failpoints( @@ -724,6 +720,14 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): ] ) + for info in shards: + pageserver = pageservers[int(info["node_id"])] + # the first request can cause these, but does not repeatedly + pageserver.allowed_errors.append(".*: request was dropped before completing") + + # first request again + env.storage_controller.allowed_errors.append(".*: request was dropped before completing") + target = env.storage_controller.pageserver_api() with pytest.raises(ReadTimeout):