From b74232eb4d36ad16750a938c069da0dbfffed3ce Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 21 Jun 2024 18:23:31 +0100 Subject: [PATCH] tests: allow-list neon_local endpoint errors from storage controller (#8123) ## Problem For testing, the storage controller has a built-in hack that loads neon_local endpoint config from disk, and uses it to reconfigure endpoints when the attached pageserver changes. Some tests that stop an endpoint while the storage controller is running could occasionally fail on log errors from the controller trying to use its special test-mode calls into neon local Endpoint. Example: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8117/9592392425/index.html#/testresult/9d2bb8623d0d53f8 ## Summary of changes - Give NotifyError an explicit NeonLocal variant, to avoid munging these into generic 500s (I don't want to ignore 500s in general) - Allow-list errors related to the local notification hook. The expectation is that tests using endpoints/workloads should be independently checking that those endpoints work: if neon_local generates an error inside the storage controller, that's ignorable. --- storage_controller/src/compute_hook.rs | 10 +++++++--- test_runner/fixtures/pageserver/allowed_errors.py | 5 +++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index a1d051f150..4d0f8006aa 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -146,6 +146,9 @@ pub(crate) enum NotifyError { // A response indicates we will never succeed, such as 400 or 404 #[error("Non-retryable error {0}")] Fatal(StatusCode), + + #[error("neon_local error: {0}")] + NeonLocal(anyhow::Error), } enum MaybeSendResult { @@ -278,7 +281,7 @@ impl ComputeHook { async fn do_notify_local( &self, reconfigure_request: &ComputeHookNotifyRequest, - ) -> anyhow::Result<()> { + ) -> Result<(), NotifyError> { // neon_local updates are not safe to call concurrently, use a lock to serialize // all calls to this function let _locked = self.neon_local_lock.lock().await; @@ -321,7 +324,8 @@ impl ComputeHook { tracing::info!("Reconfiguring endpoint {}", endpoint_name,); endpoint .reconfigure(compute_pageservers.clone(), *stripe_size) - .await?; + .await + .map_err(NotifyError::NeonLocal)?; } } @@ -510,7 +514,7 @@ impl ComputeHook { } else { self.do_notify_local(&request).await.map_err(|e| { // This path is for testing only, so munge the error into our prod-style error type. - tracing::error!("Local notification hook failed: {e}"); + tracing::error!("neon_local notification hook failed: {e}"); NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR) }) }; diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 147d5705d3..c5b09e3608 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -106,6 +106,11 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ ".*startup_reconcile: Could not scan node.*", # Tests run in dev mode ".*Starting in dev mode.*", + # Tests that stop endpoints & use the storage controller's neon_local notification + # mechanism might fail (neon_local's stopping and endpoint isn't atomic wrt the storage + # controller's attempts to notify the endpoint). + ".*reconciler.*neon_local notification hook failed.*", + ".*reconciler.*neon_local error.*", ]