From b0dfe0ffa66b68c99d5cfc99e62d0c81c95c09d2 Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Fri, 25 Jul 2025 15:03:17 +0100
Subject: [PATCH] storcon: attempt all non-essential location config calls during reconciliations (#12745)

## Problem

We saw the following in the field:

Context and observations:
* The storage controller keeps track of the latest generations and of the pageserver that issued the latest generation in the database
* When the storage controller needs to proxy a request (e.g. timeline creation) to the pageservers, it will use the pageserver that issued the latest generation, as recorded in the db (generation_pageserver).
* pageserver-2.cell-2 got into a bad state and wasn't able to apply location_config calls (e.g. detach a shard)

What happened:
1. pageserver-2.cell-2 was a secondary for our shard since we were not able to detach it
2. control plane asked to detach a tenant (presumably because it was idle)
   a. in response, storcon clears the generation_pageserver from the db and attempts to detach all locations
   b. it tries to detach pageserver-2.cell-2 first, but fails, which fails the entire reconciliation and leaves the good attached location in place
   c. storcon returns success to cplane
3. control plane asks to re-attach the tenant
   a. in response, storcon performs a reconciliation
   b. it finds that the observed state matches the intent (remember we did not detach the primary at step (2))
   c. it skips incrementing the generation and setting the generation_pageserver column

Now any request that needs to be proxied to pageservers and relies on the generation_pageserver db column fails because that column is not set.

## Summary of changes

1. We do all non-essential location config calls (setting up secondaries, detaches) at the end of the reconciliation. Previously, we bailed out of the reconciliation on the first failure; with this patch we attempt all of the RPCs. This allows the observed state to update even if another RPC failed for unrelated reasons. (A minimal standalone sketch of this pattern is included after the diff.)
2. If the overall reconciliation failed, we don't want to remove nodes from the observed state, as a safe-guard. With change (1), we'll now get a deletion delta to process, which would previously have been ignored. Ignoring it is not the right thing to do, since the observed state then goes out of sync with the db state. Hence, on reconciliation failure, map deletions from the observed state to the uncertain state. A future reconciliation will query the node to refresh its observed state.

Closes LKB-204
---
 pageserver/src/http/routes.rs          |  4 ++
 storage_controller/src/reconciler.rs   | 12 ++++-
 storage_controller/src/service.rs      | 17 +++++--
 .../regress/test_storage_controller.py | 46 +++++++++++++++++++
 4 files changed, 74 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index a0ea9b90dc..669eeffa32 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2005,6 +2005,10 @@ async fn put_tenant_location_config_handler(
     let state = get_state(&request);
     let conf = state.conf;
 
+    fail::fail_point!("put-location-conf-handler", |_| {
+        Err(ApiError::ResourceUnavailable("failpoint".into()))
+    });
+
     // The `Detached` state is special, it doesn't upsert a tenant, it removes
     // its local disk content and drops it from memory.
     if let LocationConfigMode::Detached = request_data.config.mode {
diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs
index d1590ec75e..ff5a3831cd 100644
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -981,6 +981,7 @@ impl Reconciler {
            ));
        }
 
+       let mut first_err = None;
        for (node, conf) in changes {
            if self.cancel.is_cancelled() {
                return Err(ReconcileError::Cancel);
@@ -990,7 +991,12 @@ impl Reconciler {
            // shard _available_ (the attached location), and configuring secondary locations
            // can be done lazily when the node becomes available (via background reconciliation).
            if node.is_available() {
-               self.location_config(&node, conf, None, false).await?;
+               let res = self.location_config(&node, conf, None, false).await;
+               if let Err(err) = res {
+                   if first_err.is_none() {
+                       first_err = Some(err);
+                   }
+               }
            } else {
                // If the node is unavailable, we skip and consider the reconciliation successful: this
                // is a common case where a pageserver is marked unavailable: we demote a location on
@@ -1002,6 +1008,10 @@ impl Reconciler {
            }
        }
 
+       if let Some(err) = first_err {
+           return Err(err);
+       }
+
        // The condition below identifies a detach. We must have no attached intent and
        // must have been attached to something previously. Pass this information to
        // the [`ComputeHook`] such that it can update its tenant-wide state.
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 8f5efe8ac4..4c011033af 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -1530,10 +1530,19 @@ impl Service {
                // so that waiters will see the correct error after waiting.
                tenant.set_last_error(result.sequence, e);
 
-               // Skip deletions on reconcile failures
-               let upsert_deltas =
-                   deltas.filter(|delta| matches!(delta, ObservedStateDelta::Upsert(_)));
-               tenant.apply_observed_deltas(upsert_deltas);
+               // If the reconciliation failed, don't clear the observed state for places where we
+               // detached. Instead, mark the observed state as uncertain.
+               let failed_reconcile_deltas = deltas.map(|delta| {
+                   if let ObservedStateDelta::Delete(node_id) = delta {
+                       ObservedStateDelta::Upsert(Box::new((
+                           node_id,
+                           ObservedStateLocation { conf: None },
+                       )))
+                   } else {
+                       delta
+                   }
+               });
+               tenant.apply_observed_deltas(failed_reconcile_deltas);
            }
        }
 
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 9986c1f24a..a6f05fe8ad 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -4959,3 +4959,49 @@ def test_storage_controller_forward_404(neon_env_builder: NeonEnvBuilder):
     env.storage_controller.configure_failpoints(
         ("reconciler-live-migrate-post-generation-inc", "off")
     )
+
+
+def test_re_attach_with_stuck_secondary(neon_env_builder: NeonEnvBuilder):
+    """
+    This test assumes that the secondary location cannot be configured for whatever reason.
+    It then attempts to detach and then attach the tenant back again and, finally, checks
+    for observed state consistency by attempting to create a timeline.
+
+    See LKB-204 for more details.
+    """
+
+    neon_env_builder.num_pageservers = 2
+
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    env.storage_controller.allowed_errors.append(".*failpoint.*")
+
+    tenant_id, _ = env.create_tenant(shard_count=1, placement_policy='{"Attached":1}')
+    env.storage_controller.reconcile_until_idle()
+
+    locations = env.storage_controller.locate(tenant_id)
+    assert len(locations) == 1
+    primary: int = locations[0]["node_id"]
+
+    not_primary = [ps.id for ps in env.pageservers if ps.id != primary]
+    assert len(not_primary) == 1
+    secondary = not_primary[0]
+
+    env.get_pageserver(secondary).http_client().configure_failpoints(
+        ("put-location-conf-handler", "return(1)")
+    )
+
+    env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"})
+
+    with pytest.raises(Exception, match="failpoint"):
+        env.storage_controller.reconcile_all()
+
+    env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 1}})
+
+    with pytest.raises(Exception, match="failpoint"):
+        env.storage_controller.reconcile_all()
+
+    env.storage_controller.pageserver_api().timeline_create(
+        pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=TimelineId.generate()
+    )
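For illustration only: a minimal, standalone sketch of the error-handling pattern from change (1). `configure` and `apply_all` are hypothetical stand-ins, not the storage controller's real API; the point is that every call is attempted, the first failure is remembered, and it is surfaced only after all calls have been tried.

```rust
// Standalone sketch (not storage controller code): attempt every call,
// remember the first error, and report it only once all calls were tried.
#[derive(Debug)]
struct ConfigError(String);

fn configure(location: &str, fail: bool) -> Result<(), ConfigError> {
    if fail {
        Err(ConfigError(format!("failed to configure {location}")))
    } else {
        println!("configured {location}");
        Ok(())
    }
}

fn apply_all(locations: &[(&str, bool)]) -> Result<(), ConfigError> {
    let mut first_err = None;

    for &(location, fail) in locations {
        // Unlike `?`, a failure here does not abort the loop: the remaining
        // locations still get their calls, so their state can still be updated.
        if let Err(err) = configure(location, fail) {
            if first_err.is_none() {
                first_err = Some(err);
            }
        }
    }

    // The overall operation is still reported as failed if any call failed.
    match first_err {
        Some(err) => Err(err),
        None => Ok(()),
    }
}

fn main() {
    // The stuck location fails first, but the healthy one is still configured.
    let result = apply_all(&[("stuck-secondary", true), ("healthy-primary", false)]);
    println!("overall result: {result:?}");
}
```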