test: allow requests to any pageserver to get cancelled (#8413)

Fix flakiness in `test_sharded_timeline_detach_ancestor`, which does not
reproduce on a fast enough runner, by allowing requests to be cancelled
before completing on all pageservers. Previously this was only allowed on
half of the pageservers.

Failure evidence:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8352/9972357040/index.html#suites/a1c2be32556270764423c495fad75d47/7cca3e3d94fe12f2
This commit is contained in:
Joonas Koivunen
2024-07-18 00:03:02 +03:00
committed by GitHub
parent 0c236fa465
commit e250b9e063

View File

@@ -702,20 +702,16 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
# make another of the nodes get stuck, then restart
stuck = pageservers[int(shards[0]["node_id"])]
stuck.allowed_errors.append(".*: request was dropped before completing")
env.storage_controller.allowed_errors.append(".*: request was dropped before completing")
log.info(f"stuck pageserver is id={stuck.id}")
stuck_http = stuck.http_client()
stuck_http.configure_failpoints(
("timeline-detach-ancestor::before_starting_after_locking_pausable", "pause")
)
restarted = pageservers[int(shards[1]["node_id"])]
restarted.allowed_errors.extend(
[
".*: request was dropped before completing",
".*: Cancelled request finished with an error: ShuttingDown",
]
)
log.info(f"restarted pageserver is id={restarted.id}")
# this might be hit; see `restart_restarted`
restarted.allowed_errors.append(".*: Cancelled request finished with an error: ShuttingDown")
assert restarted.id != stuck.id
restarted_http = restarted.http_client()
restarted_http.configure_failpoints(
@@ -724,6 +720,14 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
]
)
for info in shards:
pageserver = pageservers[int(info["node_id"])]
# the first request can cause these, but does not repeatedly
pageserver.allowed_errors.append(".*: request was dropped before completing")
# first request again
env.storage_controller.allowed_errors.append(".*: request was dropped before completing")
target = env.storage_controller.pageserver_api()
with pytest.raises(ReadTimeout):