storcon: silence cplane 404s on tenant creation (#10665)

## Problem

We get WARN log noise on tenant creations. Cplane creates tenants via
/location_config. That returns the attached locations in the response
and spawns a reconciliation which will also attempt to notify cplane. If
the notification is attempted before cplane persists the shards to its
database, storcon gets back a 404. The situation is harmless, but
annoying.

## Summary of Changes

* Add a tenant creation hint to the reconciler config
* If the hint is true and we get back a 404 on the notification from
cplane, ignore the error, but still queue the reconcile up for a retry.

Closes https://github.com/neondatabase/cloud/issues/20732
This commit is contained in:
Vlad Lazar
2025-02-05 12:41:09 +00:00
committed by GitHub
parent 472007dd7c
commit 47975d06d9
3 changed files with 50 additions and 9 deletions

View File

@@ -225,7 +225,7 @@ pub(crate) enum NotifyError {
// We shutdown while sending
#[error("Shutting down")]
ShuttingDown,
// A response indicates we will never succeed, such as 400 or 404
// A response indicates we will never succeed, such as 400 or 403
#[error("Non-retryable error {0}")]
Fatal(StatusCode),

View File

@@ -115,6 +115,15 @@ impl ReconcilerConfigBuilder {
}
}
pub(crate) fn tenant_creation_hint(self, hint: bool) -> Self {
Self {
config: ReconcilerConfig {
tenant_creation_hint: hint,
..self.config
},
}
}
pub(crate) fn build(self) -> ReconcilerConfig {
self.config
}
@@ -129,6 +138,10 @@ pub(crate) struct ReconcilerConfig {
// During live migrations this is the amount of time that
// the pagserver will hold our poll.
secondary_download_request_timeout: Option<Duration>,
// A hint indicating whether this reconciliation is done on the
// creation of a new tenant. This only informs logging behaviour.
tenant_creation_hint: bool,
}
impl ReconcilerConfig {
@@ -143,6 +156,10 @@ impl ReconcilerConfig {
self.secondary_download_request_timeout
.unwrap_or(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT)
}
pub(crate) fn tenant_creation_hint(&self) -> bool {
self.tenant_creation_hint
}
}
/// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O
@@ -934,16 +951,35 @@ impl Reconciler {
)
.await;
if let Err(e) = &result {
// It is up to the caller whether they want to drop out on this error, but they don't have to:
// in general we should avoid letting unavailability of the cloud control plane stop us from
// making progress.
if !matches!(e, NotifyError::ShuttingDown) {
tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}");
}
// Set this flag so that in our ReconcileResult we will set the flag on the shard that it
// needs to retry at some point.
self.compute_notify_failure = true;
// It is up to the caller whether they want to drop out on this error, but they don't have to:
// in general we should avoid letting unavailability of the cloud control plane stop us from
// making progress.
match e {
// 404s from cplane during tenant creation are expected.
// Cplane only persists the shards to the database after
// creating the tenant and the timeline. If we notify before
// that, we'll get a 404.
//
// This is fine because tenant creations happen via /location_config
// and that returns the list of locations in the response. Hence, we
// silence the error and return Ok(()) here. Reconciliation will still
// be retried because we set [`Reconciler::compute_notify_failure`] above.
NotifyError::Unexpected(hyper::StatusCode::NOT_FOUND)
if self.reconciler_config.tenant_creation_hint() =>
{
return Ok(());
}
NotifyError::ShuttingDown => {}
_ => {
tracing::warn!(
"Failed to notify compute of attached pageserver {node}: {e}"
);
}
}
}
result
} else {

View File

@@ -2238,9 +2238,14 @@ impl Service {
let waiters = {
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, _scheduler) = locked.parts_mut();
let config = ReconcilerConfigBuilder::new()
.tenant_creation_hint(true)
.build();
tenants
.range_mut(TenantShardId::tenant_range(tenant_id))
.filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes))
.filter_map(|(_shard_id, shard)| {
self.maybe_configured_reconcile_shard(shard, nodes, config)
})
.collect::<Vec<_>>()
};