storcon: automatically clear Pause/Stop scheduling policies to enable detaches (#10011)

## Problem We saw a tenant get stuck when it had been put into Pause scheduling mode to pin it to a pageserver, then it was left idle for a while and the control plane tried to detach it. Close: https://github.com/neondatabase/neon/issues/9957 ## Summary of changes - When changing policy to Detached or Secondary, set the scheduling policy to Active. - Add a test that exercises this - When persisting tenant shards, set their `generation_pageserver` to null if the placement policy is not Attached (this enables consistency checks to work, and avoids leaving state in the DB that could be confusing/misleading in future)
2026-01-07 05:22:56 +00:00 · 2024-12-07 13:05:09 +00:00
parent 4d7111f240
commit ec790870d5
4 changed files with 109 additions and 2 deletions
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -245,6 +245,17 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
    }
 }

+/// Scheduling policy enables us to selectively disable some automatic actions that the
+/// controller performs on a tenant shard. This is only set to a non-default value by
+/// human intervention, and it is reset to the default value (Active) when the tenant's
+/// placement policy is modified away from Attached.
+///
+/// The typical use of a non-Active scheduling policy is one of:
+/// - Pinnning a shard to a node (i.e. migrating it there & setting a non-Active scheduling policy)
+/// - Working around a bug (e.g. if something is flapping and we need to stop it until the bug is fixed)
+///
+/// If you're not sure which policy to use to pin a shard to its current location, you probably
+/// want Pause.
 #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
 pub enum ShardSchedulingPolicy {
    // Normal mode: the tenant's scheduled locations may be updated at will, including
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -636,6 +636,13 @@ impl Persistence {
                    .into_boxed(),
            };

+            // Clear generation_pageserver if we are moving into a state where we won't have
+            // any attached pageservers.
+            let input_generation_pageserver = match input_placement_policy {
+                None | Some(PlacementPolicy::Attached(_)) => None,
+                Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None),
+            };
+
            #[derive(AsChangeset)]
            #[diesel(table_name = crate::schema::tenant_shards)]
            struct ShardUpdate {
@@ -643,6 +650,7 @@ impl Persistence {
                placement_policy: Option<String>,
                config: Option<String>,
                scheduling_policy: Option<String>,
+                generation_pageserver: Option<Option<i64>>,
            }

            let update = ShardUpdate {
@@ -655,6 +663,7 @@ impl Persistence {
                    .map(|c| serde_json::to_string(&c).unwrap()),
                scheduling_policy: input_scheduling_policy
                    .map(|p| serde_json::to_string(&p).unwrap()),
+                generation_pageserver: input_generation_pageserver,
            };

            query.set(update).execute(conn)?;
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -513,6 +513,9 @@ struct ShardUpdate {

    /// If this is None, generation is not updated.
    generation: Option<Generation>,
+
+    /// If this is None, scheduling policy is not updated.
+    scheduling_policy: Option<ShardSchedulingPolicy>,
 }

 enum StopReconciliationsReason {
@@ -2376,6 +2379,23 @@ impl Service {
            }
        };

+        // Ordinarily we do not update scheduling policy, but when making major changes
+        // like detaching or demoting to secondary-only, we need to force the scheduling
+        // mode to Active, or the caller's expected outcome (detach it) will not happen.
+        let scheduling_policy = match req.config.mode {
+            LocationConfigMode::Detached | LocationConfigMode::Secondary => {
+                // Special case: when making major changes like detaching or demoting to secondary-only,
+                // we need to force the scheduling mode to Active, or nothing will happen.
+                Some(ShardSchedulingPolicy::Active)
+            }
+            LocationConfigMode::AttachedMulti
+            | LocationConfigMode::AttachedSingle
+            | LocationConfigMode::AttachedStale => {
+                // While attached, continue to respect whatever the existing scheduling mode is.
+                None
+            }
+        };
+
        let mut create = true;
        for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
            // Saw an existing shard: this is not a creation
@@ -2401,6 +2421,7 @@ impl Service {
                placement_policy: placement_policy.clone(),
                tenant_config: req.config.tenant_conf.clone(),
                generation: set_generation,
+                scheduling_policy,
            });
        }

@@ -2497,6 +2518,7 @@ impl Service {
                    placement_policy,
                    tenant_config,
                    generation,
+                    scheduling_policy,
                } in &updates
                {
                    self.persistence
@@ -2505,7 +2527,7 @@ impl Service {
                            Some(placement_policy.clone()),
                            Some(tenant_config.clone()),
                            *generation,
-                            None,
+                            *scheduling_policy,
                        )
                        .await?;
                }
@@ -2521,6 +2543,7 @@ impl Service {
                        placement_policy,
                        tenant_config,
                        generation: update_generation,
+                        scheduling_policy,
                    } in updates
                    {
                        let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
@@ -2539,6 +2562,10 @@ impl Service {
                            shard.generation = Some(generation);
                        }

+                        if let Some(scheduling_policy) = scheduling_policy {
+                            shard.set_scheduling_policy(scheduling_policy);
+                        }
+
                        shard.schedule(scheduler, &mut schedule_context)?;

                        let maybe_waiter = self.maybe_reconcile_shard(shard, nodes);
@@ -2992,9 +3019,17 @@ impl Service {

        let TenantPolicyRequest {
            placement,
-            scheduling,
+            mut scheduling,
        } = req;

+        if let Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) = placement {
+            // When someone configures a tenant to detach, we force the scheduling policy to enable
+            // this to take effect.
+            if scheduling.is_none() {
+                scheduling = Some(ShardSchedulingPolicy::Active);
+            }
+        }
+
        self.persistence
            .update_tenant_shard(
                TenantFilter::Tenant(tenant_id),
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -3230,3 +3230,55 @@ def test_multi_attached_timeline_creation(neon_env_builder: NeonEnvBuilder, migr
        # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown
        env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
        raise
+
+
+@run_only_on_default_postgres("Postgres version makes no difference here")
+def test_storage_controller_detached_stopped(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Test that detaching a tenant while it has scheduling policy set to Paused or Stop works
+    """
+
+    remote_storage_kind = s3_storage()
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+
+    neon_env_builder.num_pageservers = 1
+
+    env = neon_env_builder.init_configs()
+    env.start()
+    virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
+
+    tenant_id = TenantId.generate()
+    env.storage_controller.tenant_create(
+        tenant_id,
+        shard_count=1,
+    )
+
+    assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1
+
+    # Disable scheduling: ordinarily this would prevent the tenant's configuration being
+    # reconciled to pageservers, but this should be overridden when detaching.
+    env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy.*")
+    env.storage_controller.tenant_policy_update(
+        tenant_id,
+        {"scheduling": "Stop"},
+    )
+
+    env.storage_controller.consistency_check()
+
+    # Detach the tenant
+    virtual_ps_http.tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "Detached",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": None,
+        },
+    )
+
+    env.storage_controller.consistency_check()
+
+    # Confirm the detach happened
+    assert env.pageserver.http_client().tenant_list_locations()["tenant_shards"] == []