diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs
index 3884a6df46..5bc3c81f02 100644
--- a/storage_controller/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -225,7 +225,7 @@ pub(crate) enum NotifyError {
     // We shutdown while sending
     #[error("Shutting down")]
     ShuttingDown,
-    // A response indicates we will never succeed, such as 400 or 404
+    // A response indicates we will never succeed, such as 400 or 403
     #[error("Non-retryable error {0}")]
     Fatal(StatusCode),
 
diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs
index 03db947263..58bc0ba1cd 100644
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -115,6 +115,15 @@ impl ReconcilerConfigBuilder {
         }
     }
 
+    pub(crate) fn tenant_creation_hint(self, hint: bool) -> Self {
+        Self {
+            config: ReconcilerConfig {
+                tenant_creation_hint: hint,
+                ..self.config
+            },
+        }
+    }
+
     pub(crate) fn build(self) -> ReconcilerConfig {
         self.config
     }
@@ -129,6 +138,10 @@ pub(crate) struct ReconcilerConfig {
     // During live migrations this is the amount of time that
     // the pagserver will hold our poll.
     secondary_download_request_timeout: Option<Duration>,
+
+    // A hint indicating whether this reconciliation is done on the
+    // creation of a new tenant. This only informs logging behaviour.
+    tenant_creation_hint: bool,
 }
 
 impl ReconcilerConfig {
@@ -143,6 +156,10 @@ impl ReconcilerConfig {
         self.secondary_download_request_timeout
             .unwrap_or(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT_DEFAULT)
     }
+
+    pub(crate) fn tenant_creation_hint(&self) -> bool {
+        self.tenant_creation_hint
+    }
 }
 
 /// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O
@@ -934,16 +951,35 @@ impl Reconciler {
                 )
                 .await;
             if let Err(e) = &result {
-                // It is up to the caller whether they want to drop out on this error, but they don't have to:
-                // in general we should avoid letting unavailability of the cloud control plane stop us from
-                // making progress.
-                if !matches!(e, NotifyError::ShuttingDown) {
-                    tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}");
-                }
-
                 // Set this flag so that in our ReconcileResult we will set the flag on the shard that it
                 // needs to retry at some point.
                 self.compute_notify_failure = true;
+
+                // It is up to the caller whether they want to drop out on this error, but they don't have to:
+                // in general we should avoid letting unavailability of the cloud control plane stop us from
+                // making progress.
+                match e {
+                    // 404s from cplane during tenant creation are expected.
+                    // Cplane only persists the shards to the database after
+                    // creating the tenant and the timeline. If we notify before
+                    // that, we'll get a 404.
+                    //
+                    // This is fine because tenant creations happen via /location_config
+                    // and that returns the list of locations in the response. Hence, we
+                    // silence the error and return Ok(()) here. Reconciliation will still
+                    // be retried because we set [`Reconciler::compute_notify_failure`] above.
+                    NotifyError::Unexpected(hyper::StatusCode::NOT_FOUND)
+                        if self.reconciler_config.tenant_creation_hint() =>
+                    {
+                        return Ok(());
+                    }
+                    NotifyError::ShuttingDown => {}
+                    _ => {
+                        tracing::warn!(
+                            "Failed to notify compute of attached pageserver {node}: {e}"
+                        );
+                    }
+                }
             }
             result
         } else {
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 9ac9ee17ca..4028cd7023 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -2238,9 +2238,14 @@ impl Service {
         let waiters = {
             let mut locked = self.inner.write().unwrap();
             let (nodes, tenants, _scheduler) = locked.parts_mut();
+            let config = ReconcilerConfigBuilder::new()
+                .tenant_creation_hint(true)
+                .build();
             tenants
                 .range_mut(TenantShardId::tenant_range(tenant_id))
-                .filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes))
+                .filter_map(|(_shard_id, shard)| {
+                    self.maybe_configured_reconcile_shard(shard, nodes, config)
+                })
                 .collect::<Vec<_>>()
         };
 
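
For clarity, here is a small self-contained sketch of the behaviour this patch introduces: a tenant_creation_hint set via the builder makes an unexpected 404 from the control plane be treated as expected rather than warned about. The ReconcilerConfig and ReconcilerConfigBuilder shapes mirror the diff above, but NotifyError, should_silence, and the bare status code are simplified stand-ins for illustration only, not part of the change.

// Illustrative sketch only: stubbed types, not part of the patch.
#[derive(Debug, Clone, Copy, Default)]
struct ReconcilerConfig {
    tenant_creation_hint: bool,
}

#[derive(Default)]
struct ReconcilerConfigBuilder {
    config: ReconcilerConfig,
}

impl ReconcilerConfigBuilder {
    fn new() -> Self {
        Self::default()
    }

    // Same builder shape as the patch: copy the config, set the hint.
    fn tenant_creation_hint(self, hint: bool) -> Self {
        Self {
            config: ReconcilerConfig {
                tenant_creation_hint: hint,
                ..self.config
            },
        }
    }

    fn build(self) -> ReconcilerConfig {
        self.config
    }
}

// Stand-in for NotifyError; a bare u16 replaces hyper::StatusCode here.
#[derive(Debug)]
enum NotifyError {
    Unexpected(u16),
    ShuttingDown,
}

// A 404 is silenced only when the reconcile run carries the creation hint,
// mirroring the new match arm in Reconciler::compute_notify.
fn should_silence(e: &NotifyError, config: &ReconcilerConfig) -> bool {
    matches!(e, NotifyError::Unexpected(404) if config.tenant_creation_hint)
}

fn main() {
    let config = ReconcilerConfigBuilder::new()
        .tenant_creation_hint(true)
        .build();

    assert!(should_silence(&NotifyError::Unexpected(404), &config));
    assert!(!should_silence(
        &NotifyError::Unexpected(404),
        &ReconcilerConfig::default()
    ));
    assert!(!should_silence(&NotifyError::ShuttingDown, &config));
}

Wired into the tenant-creation path as in the last hunk, this keeps the expected creation-time 404 out of the warn-level logs, while compute_notify_failure still schedules a retry of the notification.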