diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs
index 2c42da4043..1e7d7adffe 100644
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -572,30 +572,7 @@ impl Reconciler {
 
         // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach
         // the origin without notifying compute, we will render the tenant unavailable.
-        let mut notify_attempts = 0;
-        while let Err(e) = self.compute_notify().await {
-            match e {
-                NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
-                NotifyError::ShuttingDown => return Err(ReconcileError::Cancel),
-                _ => {
-                    tracing::warn!(
-                        "Live migration blocked by compute notification error, retrying: {e}"
-                    );
-                }
-            }
-
-            exponential_backoff(
-                notify_attempts,
-                // Generous waits: control plane operations which might be blocking us usually complete on the order
-                // of hundreds to thousands of milliseconds, so no point busy polling.
-                1.0,
-                10.0,
-                &self.cancel,
-            )
-            .await;
-            notify_attempts += 1;
-        }
-
+        self.compute_notify_blocking(&origin_ps).await?;
         pausable_failpoint!("reconciler-live-migrate-post-notify");
 
         // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Attached(0), then
@@ -869,6 +846,117 @@ impl Reconciler {
             Ok(())
         }
     }
+
+    /// Keep trying to notify the compute indefinitely, only dropping out if:
+    /// - the node `origin` becomes unavailable -> Ok(())
+    /// - the node `origin` no longer has our tenant shard attached -> Ok(())
+    /// - our cancellation token fires -> Err(ReconcileError::Cancelled)
+    ///
+    /// This is used during live migration, where we do not wish to detach
+    /// an origin location until the compute definitely knows about the new
+    /// location.
+    ///
+    /// In cases where the origin node becomes unavailable, we return success, indicating
+    /// to the caller that they should continue irrespective of whether the compute was notified,
+    /// because the origin node is unusable anyway.  Notification will be retried later via the
+    /// [`Self::compute_notify_failure`] flag.
+    async fn compute_notify_blocking(&mut self, origin: &Node) -> Result<(), ReconcileError> {
+        let mut notify_attempts = 0;
+        while let Err(e) = self.compute_notify().await {
+            match e {
+                NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
+                NotifyError::ShuttingDown => return Err(ReconcileError::Cancel),
+                _ => {
+                    tracing::warn!(
+                        "Live migration blocked by compute notification error, retrying: {e}"
+                    );
+                }
+            }
+
+            // Did the origin pageserver become unavailable?
+            if !origin.is_available() {
+                tracing::info!("Giving up on compute notification because {origin} is unavailable");
+                break;
+            }
+
+            // Does the origin pageserver still host the shard we are interested in?  We should only
+            // continue waiting for compute notification to be acked if the old location is still usable.
+            let tenant_shard_id = self.tenant_shard_id;
+            match origin
+                .with_client_retries(
+                    |client| async move { client.get_location_config(tenant_shard_id).await },
+                    &self.service_config.jwt_token,
+                    1,
+                    3,
+                    Duration::from_secs(5),
+                    &self.cancel,
+                )
+                .await
+            {
+                Some(Ok(Some(location_conf))) => {
+                    if matches!(
+                        location_conf.mode,
+                        LocationConfigMode::AttachedMulti
+                            | LocationConfigMode::AttachedSingle
+                            | LocationConfigMode::AttachedStale
+                    ) {
+                        tracing::debug!(
+                            "Still attached to {origin}, will wait & retry compute notification"
+                        );
+                    } else {
+                        tracing::info!(
+                            "Giving up on compute notification because {origin} is in state {:?}",
+                            location_conf.mode
+                        );
+                        return Ok(());
+                    }
+                    // Fall through
+                }
+                Some(Ok(None)) => {
+                    tracing::info!(
+                        "No longer attached to {origin}, giving up on compute notification"
+                    );
+                    return Ok(());
+                }
+                Some(Err(e)) => {
+                    match e {
+                        mgmt_api::Error::Cancelled => {
+                            tracing::info!(
+                                "Giving up on compute notification because {origin} is unavailable"
+                            );
+                            return Ok(());
+                        }
+                        mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _) => {
+                            tracing::info!(
+                                "No longer attached to {origin}, giving up on compute notification"
+                            );
+                            return Ok(());
+                        }
+                        e => {
+                            // Other API errors are unexpected here.
+                            tracing::warn!("Unexpected error checking location on {origin}: {e}");
+
+                            // Fall through, we will retry compute notification.
+                        }
+                    }
+                }
+                None => return Err(ReconcileError::Cancel),
+            };
+
+            exponential_backoff(
+                notify_attempts,
+                // Generous waits: control plane operations which might be blocking us usually complete on the order
+                // of hundreds to thousands of milliseconds, so no point busy polling.
+                1.0,
+                10.0,
+                &self.cancel,
+            )
+            .await;
+            notify_attempts += 1;
+        }
+
+        Ok(())
+    }
 }
 
 /// We tweak the externally-set TenantConfig while configuring
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index a5e0129684..851db97310 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -4974,7 +4974,12 @@ impl Service {
 
             {
                 let mut nodes_mut = (**nodes).clone();
-                nodes_mut.remove(&node_id);
+                if let Some(mut removed_node) = nodes_mut.remove(&node_id) {
+                    // Ensure that any reconciler holding an Arc<> to this node will
+                    // drop out when trying to RPC to it (setting Offline state sets the
+                    // cancellation token on the Node object).
+                    removed_node.set_availability(NodeAvailability::Offline);
+                }
                 *nodes = Arc::new(nodes_mut);
             }
         }
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 3861f0b822..789623cb27 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -567,6 +567,149 @@ def test_storage_controller_compute_hook(
     env.storage_controller.consistency_check()
 
 
+def test_storage_controller_stuck_compute_hook(
+    httpserver: HTTPServer,
+    neon_env_builder: NeonEnvBuilder,
+    httpserver_listen_address,
+):
+    """
+    Test the migration process's behavior when the compute hook does not enable it to proceed
+    """
+
+    neon_env_builder.num_pageservers = 2
+    (host, port) = httpserver_listen_address
+    neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify"
+
+    handle_params = {"status": 200}
+
+    notifications = []
+
+    def handler(request: Request):
+        status = handle_params["status"]
+        log.info(f"Notify request[{status}]: {request}")
+        notifications.append(request.json)
+        return Response(status=status)
+
+    httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler)
+
+    # Start running
+    env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"})
+
+    # Initial notification from tenant creation
+    assert len(notifications) == 1
+    expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = {
+        "tenant_id": str(env.initial_tenant),
+        "stripe_size": None,
+        "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}],
+    }
+    assert notifications[0] == expect
+
+    # Do a migration while the compute hook is returning 423 status
+    tenant_id = env.initial_tenant
+    origin_pageserver = env.get_tenant_pageserver(tenant_id)
+    dest_ps_id = [p.id for p in env.pageservers if p.id != origin_pageserver.id][0]
+    dest_pageserver = env.get_pageserver(dest_ps_id)
+    shard_0_id = TenantShardId(tenant_id, 0, 0)
+
+    NOTIFY_BLOCKED_LOG = ".*Live migration blocked.*"
+    env.storage_controller.allowed_errors.extend(
+        [
+            NOTIFY_BLOCKED_LOG,
+            ".*Failed to notify compute.*",
+            ".*Reconcile error.*Cancelled",
+            ".*Reconcile error.*Control plane tenant busy",
+        ]
+    )
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+        # We expect the controller to hit the 423 (locked) and retry.  Migration shouldn't complete until that
+        # status is cleared.
+        handle_params["status"] = 423
+        migrate_fut = executor.submit(
+            env.storage_controller.tenant_shard_migrate, shard_0_id, dest_ps_id
+        )
+
+        def logged_stuck():
+            env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG)
+
+        wait_until(10, 0.25, logged_stuck)
+        contains_r = env.storage_controller.log_contains(NOTIFY_BLOCKED_LOG)
+        assert contains_r is not None  # Appease mypy
+        (_, log_cursor) = contains_r
+        assert migrate_fut.running()
+
+        # Permit the compute hook to proceed
+        handle_params["status"] = 200
+        migrate_fut.result(timeout=10)
+
+        # Advance log cursor past the last 'stuck' message (we already waited for one, but
+        # there could be more than one)
+        while True:
+            contains_r = env.storage_controller.log_contains(NOTIFY_BLOCKED_LOG, offset=log_cursor)
+            if contains_r is None:
+                break
+            else:
+                (_, log_cursor) = contains_r
+
+        # Now, do a migration in the opposite direction
+        handle_params["status"] = 423
+        migrate_fut = executor.submit(
+            env.storage_controller.tenant_shard_migrate, shard_0_id, origin_pageserver.id
+        )
+
+        def logged_stuck_again():
+            env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG, offset=log_cursor)
+
+        wait_until(10, 0.25, logged_stuck_again)
+        assert migrate_fut.running()
+
+        # This time, the compute hook remains stuck, but we mark the origin node offline: this should
+        # also allow the migration to complete -- we only wait for the compute hook as long as we think
+        # the old location is still usable for computes.
+        # This is a regression test for issue https://github.com/neondatabase/neon/issues/8901
+        dest_pageserver.stop()
+        env.storage_controller.node_configure(dest_ps_id, {"availability": "Offline"})
+
+        try:
+            migrate_fut.result(timeout=10)
+        except StorageControllerApiException as e:
+            # The reconciler will fail because it can't detach from the origin: the important
+            # thing is that it finishes, rather than getting stuck in the compute notify loop.
+            assert "Reconcile error" in str(e)
+
+        # A later background reconciliation will clean up and leave things in a neat state, even
+        # while the compute hook is still blocked
+        try:
+            env.storage_controller.reconcile_all()
+        except StorageControllerApiException as e:
+            # We expect that the reconciler will do its work, but be unable to fully succeed
+            # because it can't send a compute notification.  It will complete, but leave
+            # the internal flag set for "retry compute notification later"
+            assert "Control plane tenant busy" in str(e)
+
+        # Confirm that we are AttachedSingle on the node we last called the migrate API for
+        loc = origin_pageserver.http_client().tenant_get_location(shard_0_id)
+        assert loc["mode"] == "AttachedSingle"
+
+        # When the origin node comes back, it should get cleaned up
+        dest_pageserver.start()
+        try:
+            env.storage_controller.reconcile_all()
+        except StorageControllerApiException as e:
+            # Compute hook is still blocked: reconciler will configure PS but not fully succeed
+            assert "Control plane tenant busy" in str(e)
+
+        with pytest.raises(PageserverApiException, match="Tenant shard not found"):
+            dest_pageserver.http_client().tenant_get_location(shard_0_id)
+
+        # Once the compute hook is unblocked, we should be able to get into a totally
+        # quiescent state again
+        handle_params["status"] = 200
+        env.storage_controller.reconcile_until_idle()
+
+    env.storage_controller.consistency_check()
+
+
 def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
     """
     Verify that occasional-use debug APIs work as expected.  This is a lightweight test