impr(controller_upcall_client): clean up copy-pasta code & add context to retries (#10991)

Before this PR, re-attach and validate would log the same warning
```
calling control plane generation validation API failed
```
on retry errors.

This can be confusing.

This PR makes the message generically valid for any upcall and adds
additional tracing spans to capture context.

Along the way, clean up some copy-pasta variable naming.

refs
-
https://github.com/neondatabase/neon/issues/10381#issuecomment-2684755827

---------

Co-authored-by: Alexander Lakhin <alexander.lakhin@neon.tech>
This commit is contained in:
Christian Schwarz
2025-02-27 11:59:43 +01:00
committed by GitHub
parent 3a3d62dc4f
commit e35f7758d8
2 changed files with 11 additions and 7 deletions

View File

@@ -84,6 +84,7 @@ impl ControllerUpcallClient {
})
}
#[tracing::instrument(skip_all)]
async fn retry_http_forever<R, T>(
&self,
url: &url::Url,
@@ -108,7 +109,7 @@ impl ControllerUpcallClient {
|_| false,
3,
u32::MAX,
"calling control plane generation validation API",
"storage controller upcall",
&self.cancel,
)
.await
@@ -125,11 +126,12 @@ impl ControllerUpcallClient {
impl ControlPlaneGenerationsApi for ControllerUpcallClient {
/// Block until we get a successful response, or error out if we are shut down
#[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context
async fn re_attach(
&self,
conf: &PageServerConf,
) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
let re_attach_path = self
let url = self
.base_url
.join("re-attach")
.expect("Failed to build re-attach path");
@@ -205,7 +207,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
register: register.clone(),
};
let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
let response: ReAttachResponse = self.retry_http_forever(&url, request).await?;
tracing::info!(
"Received re-attach response with {} tenants (node {}, register: {:?})",
response.tenants.len(),
@@ -223,11 +225,12 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
}
/// Block until we get a successful response, or error out if we are shut down
#[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context
async fn validate(
&self,
tenants: Vec<(TenantShardId, Generation)>,
) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
let re_attach_path = self
let url = self
.base_url
.join("validate")
.expect("Failed to build validate path");
@@ -257,8 +260,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
return Err(RetryForeverError::ShuttingDown);
}
let response: ValidateResponse =
self.retry_http_forever(&re_attach_path, request).await?;
let response: ValidateResponse = self.retry_http_forever(&url, request).await?;
for rt in response.tenants {
result.insert(rt.id, rt.valid);
}

View File

@@ -94,7 +94,9 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
".*Flushed oversized open layer with size.*",
# During teardown, we stop the storage controller before the pageservers, so pageservers
# can experience connection errors doing background deletion queue work.
".*WARN deletion backend: calling control plane generation validation API failed.*error sending request.*",
".*WARN deletion backend:.* storage controller upcall failed, will retry.*error sending request.*",
# Can happen when the pageserver starts faster than the storage controller
".*WARN init_tenant_mgr:.* storage controller upcall failed, will retry.*error sending request.*",
# Can happen when the test shuts down the storage controller while it is calling the utilization API
".*WARN.*path=/v1/utilization .*request was dropped before completing",
# Can happen during shutdown