From 84b7588293a2091e6295f0a9db66dd2ae6ff3237 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Sat, 23 Mar 2024 13:33:54 +0000 Subject: [PATCH] test_lazy_attach_activation: unblock failpoints before test exit Before this patch, we would leave the `timeline-calculate-logical-size-pause` failpoint in `pause` mode at the end of the test. With the switch to a single runtime, somehow we'd end up in a place where the pageserver was half shut down while the failpoint spawn_blocking thread was waiting for the `off` event that never arrived. Failures were reproducible quite well in CI: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-6555/8396322235/index.html#suites/c39429f093f87547b2a3b0943e2522d9/4dacb1efb232b98/ I couldn't reproduce it locally. I managed to reproduce it once on an i3en.3xlarge, where I then attached gdb to capture the backtrace. For posterity: https://www.notion.so/neondatabase/debug-test_lazy_attach_activation-teardown-hang-as-part-of-PR-6555-421cb61dc45d4d4e90220c86567f50da?pvs=4 --- test_runner/regress/test_timeline_size.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 628c484fbd..163a41c5ce 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -1057,9 +1057,10 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met env.pageserver.stop() # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation + paused_failpoints = ["timeline-calculate-logical-size-pause", "walreceiver-after-ingest"] env.pageserver.start( extra_env_vars={ - "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause" + "FAILPOINTS": ";".join([f"{fp}=pause" for fp in paused_failpoints]), } ) @@ -1111,3 +1112,6 @@ def test_lazy_attach_activation(neon_env_builder: 
NeonEnvBuilder, activation_met delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) else: raise RuntimeError(activation_method) + + ps_http = env.pageserver.http_client() + ps_http.configure_failpoints([(fp, "off") for fp in paused_failpoints])