mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 14:02:55 +00:00
tests: allow-list neon_local endpoint errors from storage controller (#8123)
## Problem For testing, the storage controller has a built-in hack that loads neon_local endpoint config from disk, and uses it to reconfigure endpoints when the attached pageserver changes. Some tests that stop an endpoint while the storage controller is running could occasionally fail on log errors from the controller trying to use its special test-mode calls into neon local Endpoint. Example: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8117/9592392425/index.html#/testresult/9d2bb8623d0d53f8 ## Summary of changes - Give NotifyError an explicit NeonLocal variant, to avoid munging these into generic 500s (I don't want to ignore 500s in general) - Allow-list errors related to the local notification hook. The expectation is that tests using endpoints/workloads should be independently checking that those endpoints work: if neon_local generates an error inside the storage controller, that's ignorable.
This commit is contained in:
@@ -146,6 +146,9 @@ pub(crate) enum NotifyError {
|
||||
// A response indicates we will never succeed, such as 400 or 404
|
||||
#[error("Non-retryable error {0}")]
|
||||
Fatal(StatusCode),
|
||||
|
||||
#[error("neon_local error: {0}")]
|
||||
NeonLocal(anyhow::Error),
|
||||
}
|
||||
|
||||
enum MaybeSendResult {
|
||||
@@ -278,7 +281,7 @@ impl ComputeHook {
|
||||
async fn do_notify_local(
|
||||
&self,
|
||||
reconfigure_request: &ComputeHookNotifyRequest,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), NotifyError> {
|
||||
// neon_local updates are not safe to call concurrently, use a lock to serialize
|
||||
// all calls to this function
|
||||
let _locked = self.neon_local_lock.lock().await;
|
||||
@@ -321,7 +324,8 @@ impl ComputeHook {
|
||||
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
|
||||
endpoint
|
||||
.reconfigure(compute_pageservers.clone(), *stripe_size)
|
||||
.await?;
|
||||
.await
|
||||
.map_err(NotifyError::NeonLocal)?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -510,7 +514,7 @@ impl ComputeHook {
|
||||
} else {
|
||||
self.do_notify_local(&request).await.map_err(|e| {
|
||||
// This path is for testing only, so munge the error into our prod-style error type.
|
||||
tracing::error!("Local notification hook failed: {e}");
|
||||
tracing::error!("neon_local notification hook failed: {e}");
|
||||
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
|
||||
})
|
||||
};
|
||||
|
||||
@@ -106,6 +106,11 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
|
||||
".*startup_reconcile: Could not scan node.*",
|
||||
# Tests run in dev mode
|
||||
".*Starting in dev mode.*",
|
||||
# Tests that stop endpoints & use the storage controller's neon_local notification
|
||||
# mechanism might fail (neon_local's stopping and endpoint isn't atomic wrt the storage
|
||||
# controller's attempts to notify the endpoint).
|
||||
".*reconciler.*neon_local notification hook failed.*",
|
||||
".*reconciler.*neon_local error.*",
|
||||
]
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user