tests: allow-list neon_local endpoint errors from storage controller (#8123)

## Problem

For testing, the storage controller has a built-in hack that loads
neon_local endpoint config from disk, and uses it to reconfigure
endpoints when the attached pageserver changes.

Some tests that stop an endpoint while the storage controller is running
could occasionally fail on log errors from the controller trying to use
its special test-mode calls into neon local Endpoint.

Example:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8117/9592392425/index.html#/testresult/9d2bb8623d0d53f8

## Summary of changes

- Give NotifyError an explicit NeonLocal variant, to avoid munging these
into generic 500s (I don't want to ignore 500s in general)
- Allow-list errors related to the local notification hook.

The expectation is that tests using endpoints/workloads should be
independently checking that those endpoints work: if neon_local
generates an error inside the storage controller, that's ignorable.
This commit is contained in:
John Spray
2024-06-21 18:23:31 +01:00
committed by GitHub
parent ee3081863e
commit b74232eb4d
2 changed files with 12 additions and 3 deletions

View File

@@ -146,6 +146,9 @@ pub(crate) enum NotifyError {
// A response indicates we will never succeed, such as 400 or 404
#[error("Non-retryable error {0}")]
Fatal(StatusCode),
#[error("neon_local error: {0}")]
NeonLocal(anyhow::Error),
}
enum MaybeSendResult {
@@ -278,7 +281,7 @@ impl ComputeHook {
async fn do_notify_local(
&self,
reconfigure_request: &ComputeHookNotifyRequest,
) -> anyhow::Result<()> {
) -> Result<(), NotifyError> {
// neon_local updates are not safe to call concurrently, use a lock to serialize
// all calls to this function
let _locked = self.neon_local_lock.lock().await;
@@ -321,7 +324,8 @@ impl ComputeHook {
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
endpoint
.reconfigure(compute_pageservers.clone(), *stripe_size)
.await?;
.await
.map_err(NotifyError::NeonLocal)?;
}
}
@@ -510,7 +514,7 @@ impl ComputeHook {
} else {
self.do_notify_local(&request).await.map_err(|e| {
// This path is for testing only, so munge the error into our prod-style error type.
tracing::error!("Local notification hook failed: {e}");
tracing::error!("neon_local notification hook failed: {e}");
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
})
};

View File

@@ -106,6 +106,11 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
".*startup_reconcile: Could not scan node.*",
# Tests run in dev mode
".*Starting in dev mode.*",
# Tests that stop endpoints & use the storage controller's neon_local notification
# mechanism might fail (neon_local's stopping and endpoint isn't atomic wrt the storage
# controller's attempts to notify the endpoint).
".*reconciler.*neon_local notification hook failed.*",
".*reconciler.*neon_local error.*",
]