From d1bc36f53688452dfacb7e38c546d49fce73c9f3 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Mon, 13 Jan 2025 13:31:57 +0000
Subject: [PATCH] storage controller: fix retries of compute hook notifications while a secondary node is offline (#10352)

## Problem

We would sometimes fail to retry compute notifications:

1. Try to send a notification, and set `compute_notify_failure` if we can't.
2. On the next reconcile, reconcile() fails for some other reason (e.g. it tried to
   talk to an offline node), so the `result.is_ok() && must_notify` condition that
   guards the re-send is never satisfied (sketched below).

Closes: https://github.com/neondatabase/cloud/issues/22612
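For context, this is roughly what the pre-patch notification path in `tenant_shard.rs`
looked like: a condensed sketch based on the hunk further down, not the exact code.

```rust
// Condensed sketch of the old behaviour in tenant_shard.rs: when reconcile()
// itself returned an error (e.g. an offline node), the guard below was never
// satisfied, and nothing recorded that a compute notification was still owed.
let result = reconciler.reconcile().await;
if result.is_ok() && must_notify {
    // If this fails, the need to retry is signalled via the reconcile result.
    reconciler.compute_notify().await.ok();
}
// No else branch: a pending notification could be silently forgotten here.
```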
## Summary of changes

- Clarify the meaning of the reconcile result: it should be Ok(()) if configuring
  the attached location worked, even if secondary or to-be-detached locations
  cannot be reached.
- Skip trying to talk to secondary locations whose node is offline.
- Even if reconcile fails and we can't send the compute notification (we can't
  send it because we're not sure the location is really attached), make sure we
  save the `compute_notify_failure` flag so that subsequent reconciler runs will
  try again (see the sketch after this list).
- Add a regression test for the above.
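A condensed sketch of the new behaviour follows; it paraphrases the two Rust hunks
in the diff below rather than reproducing them exactly.

```rust
// reconciler.rs: an unavailable secondary no longer fails the whole reconcile;
// its observed state is simply marked as unknown and fixed up later.
if node.is_available() {
    changes.push((node.clone(), wanted_conf));
} else {
    self.observed
        .locations
        .insert(node.get_id(), ObservedStateLocation { conf: None });
}

// tenant_shard.rs: if the notification wasn't sent (either it failed, or
// reconcile() failed so we can't be sure the location is really attached),
// remember that one is still owed so the next reconciler run retries it.
if result.is_ok() && must_notify {
    reconciler.compute_notify().await.ok();
} else if must_notify {
    reconciler.compute_notify_failure = true;
}
```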
---
 storage_controller/src/reconciler.rs          |  21 +++-
 storage_controller/src/tenant_shard.rs        |   7 +-
 .../regress/test_storage_controller.py        | 116 ++++++++++++++++++
 3 files changed, 139 insertions(+), 5 deletions(-)

diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs
index e0a854fff7..6f584e7267 100644
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -696,6 +696,11 @@ impl Reconciler {
     /// First we apply special case handling (e.g. for live migrations), and then a
     /// general case reconciliation where we walk through the intent by pageserver
     /// and call out to the pageserver to apply the desired state.
+    ///
+    /// An Ok(()) result indicates that we successfully attached the tenant, but _not_ that
+    /// all locations for the tenant are in the expected state. When nodes that are to be detached
+    /// or configured as secondary are unavailable, we may return Ok(()) but leave the shard in a
+    /// state where it still requires later reconciliation.
     pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> {
         // Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it
         self.maybe_refresh_observed().await?;
@@ -784,10 +789,18 @@ impl Reconciler {
                     tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.")
                 }
                 _ => {
-                    // In all cases other than a matching observed configuration, we will
-                    // reconcile this location.
-                    tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
-                    changes.push((node.clone(), wanted_conf))
+                    // Only try and configure secondary locations on nodes that are available. This
+                    // allows the reconciler to "succeed" while some secondaries are offline (e.g. after
+                    // a node failure, where the failed node will have a secondary intent)
+                    if node.is_available() {
+                        tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
+                        changes.push((node.clone(), wanted_conf))
+                    } else {
+                        tracing::info!(node_id=%node.get_id(), "Skipping configuration as secondary, node is unavailable");
+                        self.observed
+                            .locations
+                            .insert(node.get_id(), ObservedStateLocation { conf: None });
+                    }
                 }
             }
         }
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index c17989a316..e0a71b5822 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -1122,10 +1122,15 @@ impl TenantShard {
         let result = reconciler.reconcile().await;
 
         // If we know we had a pending compute notification from some previous action, send a notification irrespective
-        // of whether the above reconcile() did any work
+        // of whether the above reconcile() did any work. It has to be Ok() though, because otherwise we might be
+        // sending a notification of a location that isn't really attached.
         if result.is_ok() && must_notify {
             // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
             reconciler.compute_notify().await.ok();
+        } else if must_notify {
+            // Carry this flag so that the reconciler's result will indicate that it still needs to retry
+            // the compute hook notification eventually.
+            reconciler.compute_notify_failure = true;
         }
 
         // Update result counter
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index da6d5b8622..616aee758d 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -822,6 +822,122 @@ def test_storage_controller_stuck_compute_hook(
     env.storage_controller.consistency_check()
 
 
+@run_only_on_default_postgres("postgres behavior is not relevant")
+def test_storage_controller_compute_hook_retry(
+    httpserver: HTTPServer,
+    neon_env_builder: NeonEnvBuilder,
+    httpserver_listen_address: ListenAddress,
+):
+    """
+    Test that when a reconciler can't do its compute hook notification, it will keep
+    trying until it succeeds.
+
+    Reproducer for https://github.com/neondatabase/cloud/issues/22612
+    """
+
+    neon_env_builder.num_pageservers = 2
+    (host, port) = httpserver_listen_address
+    neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify"
+
+    handle_params = {"status": 200}
+
+    notifications = []
+
+    def handler(request: Request):
+        status = handle_params["status"]
+        log.info(f"Notify request[{status}]: {request}")
+        notifications.append(request.json)
+        return Response(status=status)
+
+    httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)
+
+    # Start running
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    env.create_tenant(tenant_id, placement_policy='{"Attached": 1}')
+
+    # Initial notification from tenant creation
+    assert len(notifications) == 1
+    expect: dict[str, list[dict[str, int]] | str | None | int] = {
+        "tenant_id": str(tenant_id),
+        "stripe_size": None,
+        "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}],
+        "preferred_az": DEFAULT_AZ_ID,
+    }
+    assert notifications[0] == expect
+
+    # Block notifications, and fail a node
+    handle_params["status"] = 423
+    env.pageservers[0].stop()
+    env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
+    env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
+
+    # Avoid waiting for heartbeats
+    env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"})
+
+    # Make reconciler run and fail: it should leave itself in a state where the shard will retry notification later,
+    # and we will check that that happens
+    notifications = []
+    try:
+        assert env.storage_controller.reconcile_all() == 1
+    except StorageControllerApiException as e:
+        assert "Control plane tenant busy" in str(e)
+    assert len(notifications) == 1
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is True
+    )
+
+    # Try reconciling again, it should try notifying again
+    notifications = []
+    try:
+        assert env.storage_controller.reconcile_all() == 1
+    except StorageControllerApiException as e:
+        assert "Control plane tenant busy" in str(e)
+    assert len(notifications) == 1
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is True
+    )
+
+    # The describe API should indicate that a notification is pending
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is True
+    )
+
+    # Unblock notifications: reconcile should work now
+    handle_params["status"] = 200
+    notifications = []
+    assert env.storage_controller.reconcile_all() == 1
+    assert len(notifications) == 1
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is False
+    )
+
+    # Reconciler should be idle now that it succeeded in its compute notification
+    notifications = []
+    assert env.storage_controller.reconcile_all() == 0
+    assert len(notifications) == 0
+    assert (
+        env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "is_pending_compute_notification"
+        ]
+        is False
+    )
+
+
 @run_only_on_default_postgres("this test doesn't start an endpoint")
 def test_storage_controller_compute_hook_revert(
     httpserver: HTTPServer,