From b0dfe0ffa66b68c99d5cfc99e62d0c81c95c09d2 Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Fri, 25 Jul 2025 15:03:17 +0100
Subject: [PATCH] storcon: attempt all non-essential location config calls during reconciliations (#12745)

## Problem

We saw the following in the field:

Context and observations:
* The storage controller keeps track of the latest generations and of the pageserver that issued the latest generation in the database
* When the storage controller needs to proxy a request (e.g. timeline creation) to the pageservers, it will use the pageserver that issued the latest generation, as recorded in the db (generation_pageserver).
* pageserver-2.cell-2 got into a bad state and wasn't able to apply location_config calls (e.g. detach a shard)

What happened:
1. pageserver-2.cell-2 was a secondary for our shard since we were not able to detach it
2. control plane asked to detach a tenant (presumably because it was idle)
   a. in response, storcon clears the generation_pageserver from the db and attempts to detach all locations
   b. it tries to detach pageserver-2.cell-2 first, but fails, which fails the entire reconciliation and leaves the good attached location in place
   c. storcon returns success to cplane
3. control plane asks to re-attach the tenant
   a. in response, storcon performs a reconciliation
   b. it finds that the observed state matches the intent (remember we did not detach the primary at step (2))
   c. it skips incrementing the generation and setting the generation_pageserver column

Now any request that needs to be proxied to pageservers and relies on the generation_pageserver db column fails because that column is not set.

## Summary of changes

1. We do all non-essential location config calls (setting up secondaries, detaches) at the end of the reconciliation. Previously, we bailed out of the reconciliation on the first failure; with this patch we attempt all of the RPCs. This allows the observed state to update even if another RPC failed for unrelated reasons. (A minimal standalone sketch of this pattern is included after the diff.)
2. If the overall reconciliation failed, we don't want to remove nodes from the observed state, as a safe-guard. With change (1), we'll now get a deletion delta to process, which would previously have been ignored. Ignoring it is not the right thing to do, since the observed state then goes out of sync with the db state. Hence, on reconciliation failure, map deletions from the observed state to the uncertain state. A future reconciliation will query the node to refresh its observed state.

Closes LKB-204
---
 pageserver/src/http/routes.rs          |  4 ++
 storage_controller/src/reconciler.rs   | 12 ++++-
 storage_controller/src/service.rs      | 17 +++++--
 .../regress/test_storage_controller.py | 46 +++++++++++++++++++
 4 files changed, 74 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index a0ea9b90dc..669eeffa32 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2005,6 +2005,10 @@ async fn put_tenant_location_config_handler(
     let state = get_state(&request);
     let conf = state.conf;
 
+    fail::fail_point!("put-location-conf-handler", |_| {
+        Err(ApiError::ResourceUnavailable("failpoint".into()))
+    });
+
     // The `Detached` state is special, it doesn't upsert a tenant, it removes
     // its local disk content and drops it from memory.
     if let LocationConfigMode::Detached = request_data.config.mode {
diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs
index d1590ec75e..ff5a3831cd 100644
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -981,6 +981,7 @@ impl Reconciler {
            ));
        }
 
+       let mut first_err = None;
        for (node, conf) in changes {
            if self.cancel.is_cancelled() {
                return Err(ReconcileError::Cancel);
@@ -990,7 +991,12 @@ impl Reconciler {
            // shard _available_ (the attached location), and configuring secondary locations
            // can be done lazily when the node becomes available (via background reconciliation).
            if node.is_available() {
-               self.location_config(&node, conf, None, false).await?;
+               let res = self.location_config(&node, conf, None, false).await;
+               if let Err(err) = res {
+                   if first_err.is_none() {
+                       first_err = Some(err);
+                   }
+               }
            } else {
                // If the node is unavailable, we skip and consider the reconciliation successful: this
                // is a common case where a pageserver is marked unavailable: we demote a location on
@@ -1002,6 +1008,10 @@ impl Reconciler {
            }
        }
 
+       if let Some(err) = first_err {
+           return Err(err);
+       }
+
        // The condition below identifies a detach. We must have no attached intent and
        // must have been attached to something previously. Pass this information to
        // the [`ComputeHook`] such that it can update its tenant-wide state.
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 8f5efe8ac4..4c011033af 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -1530,10 +1530,19 @@ impl Service {
                // so that waiters will see the correct error after waiting.
                tenant.set_last_error(result.sequence, e);
 
-               // Skip deletions on reconcile failures
-               let upsert_deltas =
-                   deltas.filter(|delta| matches!(delta, ObservedStateDelta::Upsert(_)));
-               tenant.apply_observed_deltas(upsert_deltas);
+               // If the reconciliation failed, don't clear the observed state for places where we
+               // detached. Instead, mark the observed state as uncertain.
+               let failed_reconcile_deltas = deltas.map(|delta| {
+                   if let ObservedStateDelta::Delete(node_id) = delta {
+                       ObservedStateDelta::Upsert(Box::new((
+                           node_id,
+                           ObservedStateLocation { conf: None },
+                       )))
+                   } else {
+                       delta
+                   }
+               });
+               tenant.apply_observed_deltas(failed_reconcile_deltas);
            }
        }
 
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 9986c1f24a..a6f05fe8ad 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -4959,3 +4959,49 @@ def test_storage_controller_forward_404(neon_env_builder: NeonEnvBuilder):
     env.storage_controller.configure_failpoints(
         ("reconciler-live-migrate-post-generation-inc", "off")
     )
+
+
+def test_re_attach_with_stuck_secondary(neon_env_builder: NeonEnvBuilder):
+    """
+    This test assumes that the secondary location cannot be configured for whatever reason.
+    It then attempts to detach and then attach the tenant back again and, finally, checks
+    for observed state consistency by attempting to create a timeline.
+
+    See LKB-204 for more details.
+    """
+
+    neon_env_builder.num_pageservers = 2
+
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    env.storage_controller.allowed_errors.append(".*failpoint.*")
+
+    tenant_id, _ = env.create_tenant(shard_count=1, placement_policy='{"Attached":1}')
+    env.storage_controller.reconcile_until_idle()
+
+    locations = env.storage_controller.locate(tenant_id)
+    assert len(locations) == 1
+    primary: int = locations[0]["node_id"]
+
+    not_primary = [ps.id for ps in env.pageservers if ps.id != primary]
+    assert len(not_primary) == 1
+    secondary = not_primary[0]
+
+    env.get_pageserver(secondary).http_client().configure_failpoints(
+        ("put-location-conf-handler", "return(1)")
+    )
+
+    env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"})
+
+    with pytest.raises(Exception, match="failpoint"):
+        env.storage_controller.reconcile_all()
+
+    env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 1}})
+
+    with pytest.raises(Exception, match="failpoint"):
+        env.storage_controller.reconcile_all()
+
+    env.storage_controller.pageserver_api().timeline_create(
+        pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=TimelineId.generate()
+    )
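For illustration only: a minimal, standalone sketch of the error-handling pattern from change (1). `configure` and `apply_all` are hypothetical stand-ins, not the storage controller's real API; the point is that every call is attempted, the first failure is remembered, and it is surfaced only after all calls have been tried.

```rust
// Standalone sketch (not storage controller code): attempt every call,
// remember the first error, and report it only once all calls were tried.
#[derive(Debug)]
struct ConfigError(String);

fn configure(location: &str, fail: bool) -> Result<(), ConfigError> {
    if fail {
        Err(ConfigError(format!("failed to configure {location}")))
    } else {
        println!("configured {location}");
        Ok(())
    }
}

fn apply_all(locations: &[(&str, bool)]) -> Result<(), ConfigError> {
    let mut first_err = None;

    for &(location, fail) in locations {
        // Unlike `?`, a failure here does not abort the loop: the remaining
        // locations still get their calls, so their state can still be updated.
        if let Err(err) = configure(location, fail) {
            if first_err.is_none() {
                first_err = Some(err);
            }
        }
    }

    // The overall operation is still reported as failed if any call failed.
    match first_err {
        Some(err) => Err(err),
        None => Ok(()),
    }
}

fn main() {
    // The stuck location fails first, but the healthy one is still configured.
    let result = apply_all(&[("stuck-secondary", true), ("healthy-primary", false)]);
    println!("overall result: {result:?}");
}
```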