Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-06 21:12:55 +00:00
storage controller: be more tolerant of control plane blocking notifications (#7268)
## Problem

The control plane can deadlock if it calls into a function that requires reconciliation to complete while simultaneously refusing compute notification hook API calls.

## Summary of changes

- Fail faster in the notify path on 423 (Locked) errors: these were originally expected to be transient, but in practice a 423 more commonly means an operation is blocking on the current API call rather than something happening in the background.
- In ensure_attached, relax the condition for spawning a reconciler: rather than going straight to the general maybe_reconcile path, do a pre-check that skips reconciliation if the shard already appears to be attached. This avoids doing work when the tenant is attached but still dirty from a reconciliation point of view, e.g. due to a failed compute notification.
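The notify-path half of the change is easiest to read as a retry-classification decision: a 423 from the control plane now counts as fatal to the notification attempt, so the surrounding retry loop gives up immediately instead of sleeping and looping while the control plane may be waiting on this very reconcile. Below is a minimal, self-contained sketch of that pattern; the `NotifyError` enum, `retry_notify` helper, and delays are hypothetical stand-ins, not the storage controller's actual `NotifyError` or `backoff::retry`.

```rust
// Sketch only: hypothetical error classes and retry loop illustrating the
// "treat Busy as fatal" pattern applied by this commit.
#[derive(Debug)]
enum NotifyError {
    Busy,     // control plane returned 423 Locked: likely blocked on this very operation
    SlowDown, // control plane is overloaded: worth retrying after a delay
}

fn retry_notify<F>(mut attempt: F, max_retries: u32) -> Result<(), NotifyError>
where
    F: FnMut() -> Result<(), NotifyError>,
{
    let mut retries = 0;
    loop {
        match attempt() {
            Ok(()) => return Ok(()),
            // Fatal class: give up immediately so the calling reconciler can fail
            // and release whatever the control plane is waiting on.
            Err(e @ NotifyError::Busy) => return Err(e),
            // Transient class: back off and try again, up to a limit.
            Err(NotifyError::SlowDown) if retries < max_retries => {
                retries += 1;
                std::thread::sleep(std::time::Duration::from_millis(100 * u64::from(retries)));
            }
            Err(e) => return Err(e),
        }
    }
}

fn main() {
    // With Busy treated as fatal, a locked control plane surfaces an error after one attempt...
    assert!(matches!(
        retry_notify(|| Err(NotifyError::Busy), 3),
        Err(NotifyError::Busy)
    ));

    // ...while transient slow-downs are still retried until they succeed.
    let mut calls = 0;
    let outcome = retry_notify(
        || {
            calls += 1;
            if calls < 3 {
                Err(NotifyError::SlowDown)
            } else {
                Ok(())
            }
        },
        5,
    );
    assert!(outcome.is_ok());
}
```

In the diff below the same idea shows up twice: the StatusCode::LOCKED arm stops sleeping on BUSY_DELAY, and the backoff::retry predicate adds NotifyError::Busy to its set of fatal errors.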
@@ -14,7 +14,6 @@ use utils::{

 use crate::service::Config;

-const BUSY_DELAY: Duration = Duration::from_secs(1);
 const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);

 pub(crate) const API_CONCURRENCY: usize = 32;
@@ -280,11 +279,10 @@ impl ComputeHook {
                 Err(NotifyError::SlowDown)
             }
             StatusCode::LOCKED => {
-                // Delay our retry if busy: the usual fast exponential backoff in backoff::retry
-                // is not appropriate
-                tokio::time::timeout(BUSY_DELAY, cancel.cancelled())
-                    .await
-                    .ok();
+                // We consider this fatal, because it's possible that the operation blocking the control plane
+                // is also the one that is waiting for this reconcile. We should let the reconciler calling
+                // this hook fail, to give the control plane a chance to un-lock.
+                tracing::info!("Control plane reports tenant is locked, dropping out of notify");
                 Err(NotifyError::Busy)
             }
             StatusCode::SERVICE_UNAVAILABLE
@@ -306,7 +304,12 @@ impl ComputeHook {
         let client = reqwest::Client::new();
         backoff::retry(
             || self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
-            |e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)),
+            |e| {
+                matches!(
+                    e,
+                    NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy
+                )
+            },
             3,
             10,
             "Send compute notification",
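For completeness, here is a hedged sketch of the classification side, using bare u16 status codes instead of reqwest::StatusCode; the exact set of codes the real do_notify_iteration treats as transient may differ. The point is only that 423 (Locked) lands in the Busy class that the retry predicate above now treats as fatal.

```rust
// Sketch only: a status-code classification in the spirit of do_notify_iteration,
// not its actual mapping.
#[derive(Debug, PartialEq)]
enum NotifyOutcome {
    Done,
    SlowDown,   // transient overload: retried after a delay
    Busy,       // control plane holds a lock on the tenant: fail fast
    Fatal(u16), // anything else: give up
}

fn classify_response(status: u16) -> NotifyOutcome {
    match status {
        200..=299 => NotifyOutcome::Done,
        429 | 503 => NotifyOutcome::SlowDown,
        423 => NotifyOutcome::Busy,
        other => NotifyOutcome::Fatal(other),
    }
}

fn main() {
    assert_eq!(classify_response(200), NotifyOutcome::Done);
    assert_eq!(classify_response(423), NotifyOutcome::Busy);
    assert_eq!(classify_response(503), NotifyOutcome::SlowDown);
    assert_eq!(classify_response(404), NotifyOutcome::Fatal(404));
}
```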
@@ -3936,9 +3936,6 @@ impl Service {
     /// Helper for methods that will try and call pageserver APIs for
     /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
     /// is attached somewhere.
-    ///
-    /// TODO: this doesn't actually ensure attached unless the PlacementPolicy is
-    /// an attached policy. We should error out if it isn't.
     fn ensure_attached_schedule(
         &self,
         mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
@@ -3947,10 +3944,26 @@ impl Service {
         let mut waiters = Vec::new();
         let (nodes, tenants, scheduler) = locked.parts_mut();

-        for (_tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
+        for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
             shard.schedule(scheduler)?;

+            // The shard's policies may not result in an attached location being scheduled: this
+            // is an error because our caller needs it attached somewhere.
+            if shard.intent.get_attached().is_none() {
+                return Err(anyhow::anyhow!(
+                    "Tenant {tenant_id} not scheduled to be attached"
+                ));
+            };
+
+            if shard.stably_attached().is_some() {
+                // We do not require the shard to be totally up to date on reconciliation: we just require
+                // that it has been attached on the intended node. Other dirty state such as unattached secondary
+                // locations, or compute hook notifications can be ignored.
+                continue;
+            }
+
             if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
+                tracing::info!("Waiting for shard {tenant_shard_id} to reconcile, in order to ensure it is attached");
                 waiters.push(waiter);
             }
         }
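The ensure_attached half of the change hinges on the pre-check: if a shard is already attached on its intended node, there is no need to spawn a reconciler just because other state (a failed compute notification, an unattached secondary location) is dirty. A small standalone sketch of that idea, with a hypothetical ShardState standing in for the real TenantShard and its stably_attached() method:

```rust
// Sketch only: ShardState is a hypothetical stand-in for TenantShard; the real
// stably_attached() (roughly) reports the attached node once the scheduling intent
// and the state observed on pageservers agree.
type NodeId = u64;

struct ShardState {
    intent_attached: Option<NodeId>,   // where the scheduler wants the shard attached
    observed_attached: Option<NodeId>, // where pageservers report it attached
}

impl ShardState {
    // Attached "stably" only when intent and observed state agree on the node.
    fn stably_attached(&self) -> Option<NodeId> {
        match (self.intent_attached, self.observed_attached) {
            (Some(want), Some(have)) if want == have => Some(want),
            _ => None,
        }
    }
}

// ensure_attached only needs the shard attached somewhere: if it already is,
// skip spawning a reconciler even if other state is still dirty.
fn needs_reconcile_for_attach(shard: &ShardState) -> bool {
    shard.stably_attached().is_none()
}

fn main() {
    let attached_but_dirty = ShardState {
        intent_attached: Some(1),
        observed_attached: Some(1),
    };
    let not_yet_attached = ShardState {
        intent_attached: Some(2),
        observed_attached: None,
    };
    assert!(!needs_reconcile_for_attach(&attached_but_dirty));
    assert!(needs_reconcile_for_attach(&not_yet_attached));
}
```

Skipping reconciliation here is what keeps timeline creation from blocking on a compute notification that the control plane is currently refusing.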
@@ -433,10 +433,13 @@ def test_sharding_service_compute_hook(
     # Set up fake HTTP notify endpoint
     notifications = []

+    handle_params = {"status": 200}
+
     def handler(request: Request):
-        log.info(f"Notify request: {request}")
+        status = handle_params["status"]
+        log.info(f"Notify request[{status}]: {request}")
         notifications.append(request.json)
-        return Response(status=200)
+        return Response(status=status)

     httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)
@@ -504,6 +507,24 @@ def test_sharding_service_compute_hook(

     wait_until(10, 1, received_split_notification)

+    # If the compute hook is unavailable, that should not block creating a tenant and
+    # creating a timeline. This simulates a control plane refusing to accept notifications
+    handle_params["status"] = 423
+    degraded_tenant_id = TenantId.generate()
+    degraded_timeline_id = TimelineId.generate()
+    env.storage_controller.tenant_create(degraded_tenant_id)
+    env.storage_controller.pageserver_api().timeline_create(
+        PgVersion.NOT_SET, degraded_tenant_id, degraded_timeline_id
+    )
+
+    # Ensure we hit the handler error path
+    env.storage_controller.allowed_errors.append(
+        ".*Failed to notify compute of attached pageserver.*tenant busy.*"
+    )
+    env.storage_controller.allowed_errors.append(".*Reconcile error.*tenant busy.*")
+    assert notifications[-1] is not None
+    assert notifications[-1]["tenant_id"] == str(degraded_tenant_id)
+
     env.storage_controller.consistency_check()