storage controller: fix retries of compute hook notifications while a secondary node is offline (#10352)

## Problem

We would sometimes fail to retry compute notifications:
1. Try to send a notification; set `compute_notify_failure` if we can't.
2. On the next reconcile, `reconcile()` fails for some other reason (e.g. it
   tried to talk to an offline node), so the `result.is_ok() && must_notify`
   condition guarding the re-send never passes (see the sketch below).

Closes: https://github.com/neondatabase/cloud/issues/22612
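
A minimal sketch of the failure mode, with a hypothetical `Shard` type, `reconcile_once`, and stub methods standing in for the real `TenantShard`/`Reconciler` code:

```rust
// Illustrative only: names and types are simplified stand-ins.
struct Shard {
    must_notify: bool,            // a compute notification is pending
    compute_notify_failure: bool, // set when a notification attempt fails
}

impl Shard {
    async fn reconcile_once(&mut self) {
        let result = self.reconcile().await; // may fail, e.g. offline node

        // BUG: if reconcile() failed for an unrelated reason, the pending
        // notification was neither sent nor re-flagged, so it was dropped.
        if result.is_ok() && self.must_notify {
            let _ = self.compute_notify().await;
        }
    }

    async fn reconcile(&mut self) -> Result<(), String> {
        Ok(()) // stub: the real version talks to pageservers and can fail
    }

    async fn compute_notify(&mut self) -> Result<(), String> {
        Ok(()) // stub: the real version calls the control plane hook
    }
}
```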

## Summary of changes

- Clarify the meaning of the reconcile result: it should be `Ok(())` if
  configuring the attached location worked, even if secondary or to-be-detached
  locations cannot be reached.
- Skip trying to talk to secondaries if they're offline.
- Even if reconcile fails and we can't send the compute notification (we
  can't send it because we're not sure the location is really attached), make
  sure we save the `compute_notify_failure` flag so that subsequent reconciler
  runs will try again (see the sketch below).
- Add a regression test for the above.
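
For contrast, a sketch of the fixed flow in the same hypothetical style, reusing the `Shard` stand-in from the Problem section (the real change is in the `TenantShard` hunk below):

```rust
impl Shard {
    async fn reconcile_once_fixed(&mut self) {
        let result = self.reconcile().await;

        if result.is_ok() && self.must_notify {
            // Safe to notify: Ok(()) now guarantees the attached location
            // really is attached, even if secondaries are still pending.
            let _ = self.compute_notify().await;
        } else if self.must_notify {
            // Can't notify now; persist the flag so the next reconciler
            // run retries the compute hook notification.
            self.compute_notify_failure = true;
        }
    }
}
```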

Author: John Spray
Date: 2025-01-13 13:31:57 +00:00 (committed by GitHub)
Commit: d1bc36f536 (parent 0b9032065e)
3 changed files with 139 additions and 5 deletions


@@ -696,6 +696,11 @@ impl Reconciler {
     /// First we apply special case handling (e.g. for live migrations), and then a
     /// general case reconciliation where we walk through the intent by pageserver
     /// and call out to the pageserver to apply the desired state.
+    ///
+    /// An Ok(()) result indicates that we successfully attached the tenant, but _not_ that
+    /// all locations for the tenant are in the expected state. When nodes that are to be detached
+    /// or configured as secondary are unavailable, we may return Ok(()) but leave the shard in a
+    /// state where it still requires later reconciliation.
     pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> {
         // Prepare: if we have uncertain `observed` state for our would-be attachment location, then refresh it
         self.maybe_refresh_observed().await?;
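
A hedged reading of that contract from the caller's side (the helper and generic error type here are illustrative; only the Ok-versus-Err semantics come from the doc comment above):

```rust
// Illustrative: whether a compute notification is safe after reconcile().
fn may_notify_compute<E>(result: &Result<(), E>) -> bool {
    match result {
        // Ok(()) guarantees the attached location is configured; secondary
        // or detach work may still be outstanding on unavailable nodes.
        Ok(()) => true,
        // On error we can't be sure the location is really attached, so a
        // notification could point compute at the wrong pageserver.
        Err(_) => false,
    }
}
```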
@@ -784,10 +789,18 @@ impl Reconciler {
                 tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.")
             }
             _ => {
-                // In all cases other than a matching observed configuration, we will
-                // reconcile this location.
-                tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
-                changes.push((node.clone(), wanted_conf))
+                // Only try and configure secondary locations on nodes that are available. This
+                // allows the reconciler to "succeed" while some secondaries are offline (e.g. after
+                // a node failure, where the failed node will have a secondary intent)
+                if node.is_available() {
+                    tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
+                    changes.push((node.clone(), wanted_conf))
+                } else {
+                    tracing::info!(node_id=%node.get_id(), "Skipping configuration as secondary, node is unavailable");
+                    self.observed
+                        .locations
+                        .insert(node.get_id(), ObservedStateLocation { conf: None });
+                }
             }
         }
     }
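
Inserting `conf: None` records the skipped node's state as unknown rather than up to date. A small sketch of why that matters (types paraphrased from the diff; the helper is hypothetical):

```rust
use std::collections::HashMap;

struct LocationConf; // placeholder for the real location config type

// Paraphrased from the diff: conf == None means "we don't know what this
// node currently has", so a later reconcile must revisit it.
struct ObservedStateLocation {
    conf: Option<LocationConf>,
}

struct Observed {
    locations: HashMap<u64, ObservedStateLocation>,
}

impl Observed {
    // Hypothetical helper: a node needs future reconciliation if we have
    // never observed it, or deliberately marked its state unknown.
    fn needs_revisit(&self, node_id: u64) -> bool {
        match self.locations.get(&node_id) {
            Some(loc) => loc.conf.is_none(),
            None => true,
        }
    }
}
```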


@@ -1122,10 +1122,15 @@ impl TenantShard {
         let result = reconciler.reconcile().await;

         // If we know we had a pending compute notification from some previous action, send a notification irrespective
-        // of whether the above reconcile() did any work
+        // of whether the above reconcile() did any work. It has to be Ok() though, because otherwise we might be
+        // sending a notification of a location that isn't really attached.
         if result.is_ok() && must_notify {
             // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
             reconciler.compute_notify().await.ok();
+        } else if must_notify {
+            // Carry this flag so that the reconciler's result will indicate that it still needs to retry
+            // the compute hook notification eventually.
+            reconciler.compute_notify_failure = true;
         }

         // Update result counter
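
The flag only helps if it survives into the reconciler's result; a minimal sketch of that hand-off (`compute_notify_failure` and `ReconcileResult::pending_compute_notification` appear in the code above; everything else is paraphrased):

```rust
struct Reconciler {
    compute_notify_failure: bool,
}

struct ReconcileResult {
    pending_compute_notification: bool,
}

// Paraphrased hand-off: the failure flag becomes the result's pending flag,
// which re-arms `must_notify` on the next reconciler run. It is true both
// when compute_notify() itself failed and when notification was skipped
// because reconcile() returned an error (the fix in this commit).
fn into_result(reconciler: &Reconciler) -> ReconcileResult {
    ReconcileResult {
        pending_compute_notification: reconciler.compute_notify_failure,
    }
}
```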


@@ -822,6 +822,122 @@ def test_storage_controller_stuck_compute_hook(
     env.storage_controller.consistency_check()


+@run_only_on_default_postgres("postgres behavior is not relevant")
+def test_storage_controller_compute_hook_retry(
+    httpserver: HTTPServer,
+    neon_env_builder: NeonEnvBuilder,
+    httpserver_listen_address: ListenAddress,
+):
+    """
+    Test that when a reconciler can't do its compute hook notification, it will keep
+    trying until it succeeds.
+
+    Reproducer for https://github.com/neondatabase/cloud/issues/22612
+    """
+    neon_env_builder.num_pageservers = 2
+
+    (host, port) = httpserver_listen_address
+    neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify"
+
+    handle_params = {"status": 200}
+
+    notifications = []
+
+    def handler(request: Request):
+        status = handle_params["status"]
+        log.info(f"Notify request[{status}]: {request}")
+        notifications.append(request.json)
+        return Response(status=status)
+
+    httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)
+
+    # Start running
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    env.create_tenant(tenant_id, placement_policy='{"Attached": 1}')
+
+    # Initial notification from tenant creation
+    assert len(notifications) == 1
+    expect: dict[str, list[dict[str, int]] | str | None | int] = {
+        "tenant_id": str(tenant_id),
+        "stripe_size": None,
+        "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}],
+        "preferred_az": DEFAULT_AZ_ID,
+    }
+    assert notifications[0] == expect
+
+    # Block notifications, and fail a node
+    handle_params["status"] = 423
+    env.pageservers[0].stop()
+    env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
+    env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
+
+    # Avoid waiting for heartbeats
+    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"})
+
+    # Make reconciler run and fail: it should leave itself in a state where the shard will retry notification later,
+    # and we will check that that happens
+    notifications = []
+    try:
+        assert env.storage_controller.reconcile_all() == 1
+    except StorageControllerApiException as e:
+        assert "Control plane tenant busy" in str(e)
+    assert len(notifications) == 1
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is True
+    )
+
+    # Try reconciling again, it should try notifying again
+    notifications = []
+    try:
+        assert env.storage_controller.reconcile_all() == 1
+    except StorageControllerApiException as e:
+        assert "Control plane tenant busy" in str(e)
+    assert len(notifications) == 1
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is True
+    )
+
+    # The describe API should indicate that a notification is pending
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is True
+    )
+
+    # Unblock notifications: reconcile should work now
+    handle_params["status"] = 200
+    notifications = []
+    assert env.storage_controller.reconcile_all() == 1
+    assert len(notifications) == 1
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is False
+    )
+
+    # Reconciler should be idle now that it succeeded in its compute notification
+    notifications = []
+    assert env.storage_controller.reconcile_all() == 0
+    assert len(notifications) == 0
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is False
+    )
+
+
 @run_only_on_default_postgres("this test doesn't start an endpoint")
 def test_storage_controller_compute_hook_revert(
     httpserver: HTTPServer,