From 9e23410074b0d48a923fc0f2cc7dabee3bfd41ff Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 23 Jul 2024 21:09:05 +0100 Subject: [PATCH] tests: allow-list a controller heartbeat error (#8471) ## Problem `test_change_pageserver` stops pageservers in a way that can overlap with the controller's heartbeats: the controller can get a heartbeat success and then immediately find the node unavailable. This particular situation triggers a log message that isn't in our current allow-list of messages for offline nodes. Example: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8339/10048487700/index.html#testresult/19678f27810231df/retries ## Summary of changes - Add the message to the allow-list --- storage_controller/src/service.rs | 2 ++ test_runner/fixtures/pageserver/allowed_errors.py | 1 + 2 files changed, 3 insertions(+) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index a163453dca..2a6d5d3578 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -828,6 +828,8 @@ impl Service { ); } Err(err) => { + // Transition to active involves reconciling: if a node responds to a heartbeat then + // becomes unavailable again, we may get an error here. tracing::error!( "Failed to update node {} after heartbeat round: {}", node_id, diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index c5b09e3608..dff002bd4b 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -102,6 +102,7 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ # failing to connect to them. ".*Call to node.*management API.*failed.*receive body.*", ".*Call to node.*management API.*failed.*ReceiveBody.*", + ".*Failed to update node .+ after heartbeat round.*error sending request for url.*", # Many tests will start up with a node offline ".*startup_reconcile: Could not scan node.*", # Tests run in dev mode