storage controller: tenant scheduling policy (#7262)
## Problem

In the event of bugs with scheduling or reconciliation, we need to be able to switch this off at a per-tenant granularity. This is intended to mitigate the risk of issues with https://github.com/neondatabase/neon/pull/7181, which makes scheduling more involved.

Closes: #7103

## Summary of changes

- Introduce a scheduling policy per tenant, with an API to set it
- Refactor persistence.rs helpers for updating tenants to be more general
- Add tests
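As a usage illustration (not part of this patch), the per-tenant policy endpoint introduced below can be driven with any HTTP client; the controller address, token, and tenant id here are placeholders:

```python
import requests

# Placeholders: point these at a real storage controller and an admin-scoped token.
CONTROLLER_API = "http://127.0.0.1:1234"
ADMIN_TOKEN = "<admin-scoped JWT>"
TENANT_ID = "<tenant id in hex>"

# Pause scheduling for one tenant; either field of TenantPolicyRequest may be omitted.
resp = requests.put(
    f"{CONTROLLER_API}/control/v1/tenant/{TENANT_ID}/policy",
    json={"scheduling": "Pause"},
    headers={"Authorization": f"Bearer {ADMIN_TOKEN}"},
)
resp.raise_for_status()

# Later, restore normal behaviour.
requests.put(
    f"{CONTROLLER_API}/control/v1/tenant/{TENANT_ID}/policy",
    json={"scheduling": "Active"},
    headers={"Authorization": f"Bearer {ADMIN_TOKEN}"},
).raise_for_status()
```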
@@ -0,0 +1,3 @@
+-- This file should undo anything in `up.sql`
+
+ALTER TABLE tenant_shards drop scheduling_policy;
@@ -0,0 +1,2 @@
+
+ALTER TABLE tenant_shards add scheduling_policy VARCHAR NOT NULL DEFAULT '"Active"';
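Note on the default: the column stores the serde_json encoding of `ShardSchedulingPolicy`, so the SQL default is the JSON string `"Active"` including its quotes. A quick illustration (not from the patch):

```python
import json

# The scheduling_policy column holds JSON-encoded values, so the SQL default
# '"Active"' is the JSON encoding of the string "Active" (quotes included).
assert json.dumps("Active") == '"Active"'
assert json.loads('"Active"') == "Active"
```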
@@ -34,7 +34,8 @@ use utils::{
 };
 
 use pageserver_api::controller_api::{
-    NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
+    NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest,
+    TenantShardMigrateRequest,
 };
 use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
 
@@ -478,6 +479,22 @@ async fn handle_tenant_shard_migrate(
     )
 }
 
+async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let update_req = json_request::<TenantPolicyRequest>(&mut req).await?;
+    let state = get_state(&req);
+
+    json_response(
+        StatusCode::OK,
+        state
+            .service
+            .tenant_update_policy(tenant_id, update_req)
+            .await?,
+    )
+}
+
 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
@@ -509,6 +526,14 @@ async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>,
     json_response(StatusCode::OK, state.service.consistency_check().await?)
 }
 
+async fn handle_reconcile_all(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let state = get_state(&req);
+
+    json_response(StatusCode::OK, state.service.reconcile_all_now().await?)
+}
+
 /// Status endpoint is just used for checking that our HTTP listener is up
 async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
     json_response(StatusCode::OK, ())
@@ -726,6 +751,9 @@ pub fn make_router(
                 RequestName("debug_v1_consistency_check"),
             )
         })
+        .post("/debug/v1/reconcile_all", |r| {
+            request_span(r, handle_reconcile_all)
+        })
         .put("/debug/v1/failpoints", |r| {
            request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
         })
@@ -765,6 +793,13 @@ pub fn make_router(
                 RequestName("control_v1_tenant_describe"),
             )
         })
+        .put("/control/v1/tenant/:tenant_id/policy", |r| {
+            named_request_span(
+                r,
+                handle_tenant_update_policy,
+                RequestName("control_v1_tenant_policy"),
+            )
+        })
        // Tenant operations
        // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
        // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
 
@@ -9,6 +9,7 @@ use camino::Utf8PathBuf;
 use diesel::pg::PgConnection;
 use diesel::prelude::*;
 use diesel::Connection;
+use pageserver_api::controller_api::ShardSchedulingPolicy;
 use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
 use pageserver_api::models::TenantConfig;
 use pageserver_api::shard::ShardConfigError;
@@ -107,6 +108,12 @@ pub(crate) enum AbortShardSplitStatus {
 
 pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
 
+/// Some methods can operate on either a whole tenant or a single shard
+pub(crate) enum TenantFilter {
+    Tenant(TenantId),
+    Shard(TenantShardId),
+}
+
 impl Persistence {
     // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under
     // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect.
@@ -140,7 +147,7 @@ impl Persistence {
     /// Wraps `with_conn` in order to collect latency and error metrics
     async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
     where
-        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
+        F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
         R: Send + 'static,
     {
         let latency = &METRICS_REGISTRY
@@ -168,7 +175,7 @@ impl Persistence {
     /// Call the provided function in a tokio blocking thread, with a Diesel database connection.
     async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
     where
-        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
+        F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
         R: Send + 'static,
     {
         let mut conn = self.connection_pool.get()?;
@@ -275,6 +282,11 @@ impl Persistence {
                 // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
                 shard.placement_policy = "{\"Attached\":0}".to_string();
             }
+
+            if shard.scheduling_policy.is_empty() {
+                shard.scheduling_policy =
+                    serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap();
+            }
         }
 
         let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
@@ -465,59 +477,45 @@ impl Persistence {
     /// that we only do the first time a tenant is set to an attached policy via /location_config.
     pub(crate) async fn update_tenant_shard(
         &self,
-        tenant_shard_id: TenantShardId,
-        input_placement_policy: PlacementPolicy,
-        input_config: TenantConfig,
+        tenant: TenantFilter,
+        input_placement_policy: Option<PlacementPolicy>,
+        input_config: Option<TenantConfig>,
         input_generation: Option<Generation>,
+        input_scheduling_policy: Option<ShardSchedulingPolicy>,
     ) -> DatabaseResult<()> {
         use crate::schema::tenant_shards::dsl::*;
 
         self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
-            let query = diesel::update(tenant_shards)
-                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
-                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
-                .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
+            let query = match tenant {
+                TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
+                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
+                    .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
+                    .into_boxed(),
+                TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(input_tenant_id.to_string()))
+                    .into_boxed(),
+            };
 
-            if let Some(input_generation) = input_generation {
-                // Update includes generation column
-                query
-                    .set((
-                        generation.eq(Some(input_generation.into().unwrap() as i32)),
-                        placement_policy
-                            .eq(serde_json::to_string(&input_placement_policy).unwrap()),
-                        config.eq(serde_json::to_string(&input_config).unwrap()),
-                    ))
-                    .execute(conn)?;
-            } else {
-                // Update does not include generation column
-                query
-                    .set((
-                        placement_policy
-                            .eq(serde_json::to_string(&input_placement_policy).unwrap()),
-                        config.eq(serde_json::to_string(&input_config).unwrap()),
-                    ))
-                    .execute(conn)?;
+            #[derive(AsChangeset)]
+            #[diesel(table_name = crate::schema::tenant_shards)]
+            struct ShardUpdate {
+                generation: Option<i32>,
+                placement_policy: Option<String>,
+                config: Option<String>,
+                scheduling_policy: Option<String>,
             }
 
-            Ok(())
-        })
-        .await?;
+            let update = ShardUpdate {
+                generation: input_generation.map(|g| g.into().unwrap() as i32),
+                placement_policy: input_placement_policy
+                    .map(|p| serde_json::to_string(&p).unwrap()),
+                config: input_config.map(|c| serde_json::to_string(&c).unwrap()),
+                scheduling_policy: input_scheduling_policy
+                    .map(|p| serde_json::to_string(&p).unwrap()),
+            };
 
-        Ok(())
-    }
-
-    pub(crate) async fn update_tenant_config(
-        &self,
-        input_tenant_id: TenantId,
-        input_config: TenantConfig,
-    ) -> DatabaseResult<()> {
-        use crate::schema::tenant_shards::dsl::*;
-
-        self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| {
-            diesel::update(tenant_shards)
-                .filter(tenant_id.eq(input_tenant_id.to_string()))
-                .set((config.eq(serde_json::to_string(&input_config).unwrap()),))
-                .execute(conn)?;
+            query.set(update).execute(conn)?;
 
             Ok(())
         })
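For orientation (an editorial illustration, not output from the patch): because `ShardUpdate` uses `Option` fields, Diesel's `AsChangeset` omits fields that are `None` from the generated `UPDATE`, so a scheduling-only change filtered by `TenantFilter::Tenant` reduces to roughly the first statement below, while `TenantFilter::Shard` adds the shard predicates:

```python
# Placeholders only; the real SQL is generated by Diesel.
scheduling_only_update = (
    "UPDATE tenant_shards "
    "SET scheduling_policy = '\"Pause\"' "
    "WHERE tenant_id = '<tenant id>'"
)
per_shard_filter = (
    "WHERE tenant_id = '<tenant id>' "
    "AND shard_number = <n> AND shard_count = <count>"
)
```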
@@ -728,6 +726,8 @@ pub(crate) struct TenantShardPersistence {
     pub(crate) splitting: SplitState,
     #[serde(default)]
     pub(crate) config: String,
+    #[serde(default)]
+    pub(crate) scheduling_policy: String,
 }
 
 impl TenantShardPersistence {
@@ -22,6 +22,7 @@ diesel::table! {
         placement_policy -> Varchar,
         splitting -> Int2,
         config -> Text,
+        scheduling_policy -> Varchar,
     }
 }
 
@@ -8,7 +8,9 @@ use std::{
 };
 
 use crate::{
-    id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError,
+    id_lock_map::IdLockMap,
+    persistence::{AbortShardSplitStatus, TenantFilter},
+    reconciler::ReconcileError,
 };
 use anyhow::Context;
 use control_plane::storage_controller::{
@@ -20,9 +22,10 @@ use hyper::StatusCode;
 use pageserver_api::{
     controller_api::{
         NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
-        TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
-        TenantDescribeResponseShard, TenantLocateResponse, TenantShardMigrateRequest,
-        TenantShardMigrateResponse, UtilizationScore,
+        ShardSchedulingPolicy, TenantCreateResponse, TenantCreateResponseShard,
+        TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse,
+        TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse,
+        UtilizationScore,
     },
     models::{SecondaryProgress, TenantConfigRequest},
 };
@@ -51,7 +54,6 @@ use utils::{
     generation::Generation,
     http::error::ApiError,
     id::{NodeId, TenantId, TimelineId},
-    seqwait::SeqWait,
     sync::gate::Gate,
 };
 
@@ -66,7 +68,6 @@ use crate::{
         IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
         ReconcilerWaiter, TenantState,
     },
-    Sequence,
 };
 
 // For operations that should be quick, like attaching a new tenant
@@ -957,30 +958,14 @@ impl Service {
         }
         for tsp in tenant_shard_persistence {
             let tenant_shard_id = tsp.get_tenant_shard_id()?;
-            let shard_identity = tsp.get_shard_identity()?;
 
             // We will populate intent properly later in [`Self::startup_reconcile`], initially populate
             // it with what we can infer: the node for which a generation was most recently issued.
             let mut intent = IntentState::new();
             if let Some(generation_pageserver) = tsp.generation_pageserver {
                 intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64)));
             }
 
-            let new_tenant = TenantState {
-                tenant_shard_id,
-                shard: shard_identity,
-                sequence: Sequence::initial(),
-                generation: tsp.generation.map(|g| Generation::new(g as u32)),
-                policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
-                intent,
-                observed: ObservedState::new(),
-                config: serde_json::from_str(&tsp.config).unwrap(),
-                reconciler: None,
-                splitting: tsp.splitting,
-                waiter: Arc::new(SeqWait::new(Sequence::initial())),
-                error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
-                last_error: Arc::default(),
-                pending_compute_notification: false,
-            };
+            let new_tenant = TenantState::from_persistent(tsp, intent)?;
 
             tenants.insert(tenant_shard_id, new_tenant);
         }
@@ -1104,6 +1089,8 @@ impl Service {
             placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
             config: serde_json::to_string(&TenantConfig::default()).unwrap(),
             splitting: SplitState::default(),
+            scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
+                .unwrap(),
         };
 
         match self.persistence.insert_tenant_shards(vec![tsp]).await {
@@ -1156,9 +1143,10 @@ impl Service {
        // when we reattaching a detached tenant.
         self.persistence
             .update_tenant_shard(
-                attach_req.tenant_shard_id,
-                PlacementPolicy::Attached(0),
-                conf,
+                TenantFilter::Shard(attach_req.tenant_shard_id),
+                Some(PlacementPolicy::Attached(0)),
+                Some(conf),
+                None,
                 None,
             )
             .await?;
@@ -1615,6 +1603,8 @@ impl Service {
                 placement_policy: serde_json::to_string(&placement_policy).unwrap(),
                 config: serde_json::to_string(&create_req.config).unwrap(),
                 splitting: SplitState::default(),
+                scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
+                    .unwrap(),
             })
             .collect();
 
@@ -1907,10 +1897,11 @@ impl Service {
             {
                 self.persistence
                     .update_tenant_shard(
-                        *tenant_shard_id,
-                        placement_policy.clone(),
-                        tenant_config.clone(),
+                        TenantFilter::Shard(*tenant_shard_id),
+                        Some(placement_policy.clone()),
+                        Some(tenant_config.clone()),
                         *generation,
+                        None,
                     )
                     .await?;
             }
@@ -1988,7 +1979,13 @@ impl Service {
         let config = req.config;
 
         self.persistence
-            .update_tenant_config(req.tenant_id, config.clone())
+            .update_tenant_shard(
+                TenantFilter::Tenant(req.tenant_id),
+                None,
+                Some(config.clone()),
+                None,
+                None,
+            )
             .await?;
 
         let waiters = {
@@ -2341,6 +2338,57 @@ impl Service {
         Ok(StatusCode::NOT_FOUND)
     }
 
+    /// Naming: this configures the storage controller's policies for a tenant, whereas [`Self::tenant_config_set`] is "set the TenantConfig"
+    /// for a tenant. The TenantConfig is passed through to pageservers, whereas this function modifies
+    /// the tenant's policies (configuration) within the storage controller
+    pub(crate) async fn tenant_update_policy(
+        &self,
+        tenant_id: TenantId,
+        req: TenantPolicyRequest,
+    ) -> Result<(), ApiError> {
+        // We require an exclusive lock, because we are updating persistent and in-memory state
+        let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await;
+
+        let TenantPolicyRequest {
+            placement,
+            scheduling,
+        } = req;
+
+        self.persistence
+            .update_tenant_shard(
+                TenantFilter::Tenant(tenant_id),
+                placement.clone(),
+                None,
+                None,
+                scheduling,
+            )
+            .await?;
+
+        let mut locked = self.inner.write().unwrap();
+        let (nodes, tenants, scheduler) = locked.parts_mut();
+        for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
+            if let Some(placement) = &placement {
+                shard.policy = placement.clone();
+
+                tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(),
+                               "Updated placement policy to {placement:?}");
+            }
+
+            if let Some(scheduling) = &scheduling {
+                shard.set_scheduling_policy(*scheduling);
+
+                tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(),
+                               "Updated scheduling policy to {scheduling:?}");
+            }
+
+            // In case scheduling is being switched back on, try it now.
+            shard.schedule(scheduler).ok();
+            self.maybe_reconcile_shard(shard, nodes);
+        }
+
+        Ok(())
+    }
+
     pub(crate) async fn tenant_timeline_create(
         &self,
         tenant_id: TenantId,
@@ -3250,6 +3298,10 @@ impl Service {
                 placement_policy: serde_json::to_string(&policy).unwrap(),
                 config: serde_json::to_string(&config).unwrap(),
                 splitting: SplitState::Splitting,
+
+                // Scheduling policies do not carry through to children
+                scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
+                    .unwrap(),
             });
         }
 
@@ -3970,6 +4022,28 @@ impl Service {
         reconciles_spawned
     }
 
+    /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but
+    /// also wait for any generated Reconcilers to complete. Calling this until it returns zero should
+    /// put the system into a quiescent state where future background reconciliations won't do anything.
+    pub(crate) async fn reconcile_all_now(&self) -> Result<usize, ReconcileWaitError> {
+        self.reconcile_all();
+
+        let waiters = {
+            let mut waiters = Vec::new();
+            let locked = self.inner.read().unwrap();
+            for (_tenant_shard_id, shard) in locked.tenants.iter() {
+                if let Some(waiter) = shard.get_waiter() {
+                    waiters.push(waiter);
+                }
+            }
+            waiters
+        };
+
+        let waiter_count = waiters.len();
+        self.await_waiters(waiters, RECONCILE_TIMEOUT).await?;
+        Ok(waiter_count)
+    }
+
     pub async fn shutdown(&self) {
         // Note that this already stops processing any results from reconciles: so
         // we do not expect that our [`TenantState`] objects will reach a neat
@@ -8,7 +8,7 @@ use crate::{
     metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
     persistence::TenantShardPersistence,
 };
-use pageserver_api::controller_api::PlacementPolicy;
+use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
 use pageserver_api::{
     models::{LocationConfig, LocationConfigMode, TenantConfig},
     shard::{ShardIdentity, TenantShardId},
@@ -116,6 +116,10 @@ pub(crate) struct TenantState {
     /// sending it. This is the mechanism by which compute notifications are included in the scope
     /// of state that we publish externally in an eventually consistent way.
     pub(crate) pending_compute_notification: bool,
+
+    // Support/debug tool: if something is going wrong or flapping with scheduling, this may
+    // be set to a non-active state to avoid making changes while the issue is fixed.
+    scheduling_policy: ShardSchedulingPolicy,
 }
 
 #[derive(Default, Clone, Debug, Serialize)]
@@ -370,6 +374,7 @@ impl TenantState {
             error_waiter: Arc::new(SeqWait::new(Sequence(0))),
             last_error: Arc::default(),
             pending_compute_notification: false,
+            scheduling_policy: ShardSchedulingPolicy::default(),
         }
     }
 
@@ -453,6 +458,16 @@ impl TenantState {
         // TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
         // change their attach location.
 
+        match self.scheduling_policy {
+            ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {}
+            ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => {
+                // Warn to make it obvious why other things aren't happening/working, if we skip scheduling
+                tracing::warn!(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
+                               "Scheduling is disabled by policy {:?}", self.scheduling_policy);
+                return Ok(());
+            }
+        }
+
         // Build the set of pageservers already in use by this tenant, to avoid scheduling
         // more work on the same pageservers we're already using.
         let mut modified = false;
@@ -668,6 +683,19 @@ impl TenantState {
             }
         }
 
+        // Pre-checks done: finally check whether we may actually do the work
+        match self.scheduling_policy {
+            ShardSchedulingPolicy::Active
+            | ShardSchedulingPolicy::Essential
+            | ShardSchedulingPolicy::Pause => {}
+            ShardSchedulingPolicy::Stop => {
+                // We only reach this point if there is work to do and we're going to skip
+                // doing it: warn it obvious why this tenant isn't doing what it ought to.
+                tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy);
+                return None;
+            }
+        }
+
         // Build list of nodes from which the reconciler should detach
         let mut detach = Vec::new();
         for node_id in self.observed.locations.keys() {
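Putting the two gates together (the `schedule()` check in the previous hunk and the reconcile pre-check in this one), the four policies behave as summarized below; this table is an editorial illustration, not code from the patch:

```python
# (may be (re)scheduled, may be reconciled) for each ShardSchedulingPolicy,
# per the schedule() gate and the reconcile pre-check in the hunks above.
POLICY_EFFECT = {
    "Active": (True, True),
    "Essential": (True, True),  # scheduling allowed; non-essential optimizations are meant to be skipped
    "Pause": (False, True),     # keep current locations, but still reconcile them
    "Stop": (False, False),     # no scheduling and no location_conf calls at all
}
```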
@@ -804,6 +832,22 @@ impl TenantState {
         })
     }
 
+    /// Get a waiter for any reconciliation in flight, but do not start reconciliation
+    /// if it is not already running
+    pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {
+        if self.reconciler.is_some() {
+            Some(ReconcilerWaiter {
+                tenant_shard_id: self.tenant_shard_id,
+                seq_wait: self.waiter.clone(),
+                error_seq_wait: self.error_waiter.clone(),
+                error: self.last_error.clone(),
+                seq: self.sequence,
+            })
+        } else {
+            None
+        }
+    }
+
     /// Called when a ReconcileResult has been emitted and the service is updating
     /// our state: if the result is from a sequence >= my ReconcileHandle, then drop
     /// the handle to indicate there is no longer a reconciliation in progress.
@@ -829,6 +873,36 @@ impl TenantState {
         debug_assert!(!self.intent.all_pageservers().contains(&node_id));
     }
 
+    pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) {
+        self.scheduling_policy = p;
+    }
+
+    pub(crate) fn from_persistent(
+        tsp: TenantShardPersistence,
+        intent: IntentState,
+    ) -> anyhow::Result<Self> {
+        let tenant_shard_id = tsp.get_tenant_shard_id()?;
+        let shard_identity = tsp.get_shard_identity()?;
+
+        Ok(Self {
+            tenant_shard_id,
+            shard: shard_identity,
+            sequence: Sequence::initial(),
+            generation: tsp.generation.map(|g| Generation::new(g as u32)),
+            policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
+            intent,
+            observed: ObservedState::new(),
+            config: serde_json::from_str(&tsp.config).unwrap(),
+            reconciler: None,
+            splitting: tsp.splitting,
+            waiter: Arc::new(SeqWait::new(Sequence::initial())),
+            error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
+            last_error: Arc::default(),
+            pending_compute_notification: false,
+            scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
+        })
+    }
+
     pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
         TenantShardPersistence {
             tenant_id: self.tenant_shard_id.tenant_id.to_string(),
@@ -840,6 +914,7 @@ impl TenantState {
             placement_policy: serde_json::to_string(&self.policy).unwrap(),
             config: serde_json::to_string(&self.config).unwrap(),
             splitting: SplitState::default(),
+            scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
         }
     }
 }
@@ -980,4 +1055,25 @@ pub(crate) mod tests {
         tenant_state.intent.clear(&mut scheduler);
         Ok(())
     }
+
+    #[test]
+    fn scheduling_mode() -> anyhow::Result<()> {
+        let nodes = make_test_nodes(3);
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
+
+        // In pause mode, schedule() shouldn't do anything
+        tenant_state.scheduling_policy = ShardSchedulingPolicy::Pause;
+        assert!(tenant_state.schedule(&mut scheduler).is_ok());
+        assert!(tenant_state.intent.all_pageservers().is_empty());
+
+        // In active mode, schedule() works
+        tenant_state.scheduling_policy = ShardSchedulingPolicy::Active;
+        assert!(tenant_state.schedule(&mut scheduler).is_ok());
+        assert!(!tenant_state.intent.all_pageservers().is_empty());
+
+        tenant_state.intent.clear(&mut scheduler);
+        Ok(())
+    }
 }
@@ -42,6 +42,12 @@ pub struct NodeConfigureRequest {
     pub scheduling: Option<NodeSchedulingPolicy>,
 }
 
+#[derive(Serialize, Deserialize)]
+pub struct TenantPolicyRequest {
+    pub placement: Option<PlacementPolicy>,
+    pub scheduling: Option<ShardSchedulingPolicy>,
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantLocateResponseShard {
     pub shard_id: TenantShardId,
@@ -170,6 +176,32 @@ impl FromStr for NodeAvailability {
     }
 }
 
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
+pub enum ShardSchedulingPolicy {
+    // Normal mode: the tenant's scheduled locations may be updated at will, including
+    // for non-essential optimization.
+    Active,
+
+    // Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy.
+    // For example, this still permits a node's attachment location to change to a secondary in
+    // response to a node failure, or to assign a new secondary if a node was removed.
+    Essential,
+
+    // No scheduling: leave the shard running wherever it currently is. Even if the shard is
+    // unavailable, it will not be rescheduled to another node.
+    Pause,
+
+    // No reconciling: we will make no location_conf API calls to pageservers at all. If the
+    // shard is unavailable, it stays that way. If a node fails, this shard doesn't get failed over.
+    Stop,
+}
+
+impl Default for ShardSchedulingPolicy {
+    fn default() -> Self {
+        Self::Active
+    }
+}
+
 #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
 pub enum NodeSchedulingPolicy {
     Active,
@@ -2116,6 +2116,7 @@ class NeonStorageController(MetricsGetter):
         shard_count: Optional[int] = None,
         shard_stripe_size: Optional[int] = None,
         tenant_config: Optional[Dict[Any, Any]] = None,
+        placement_policy: Optional[str] = None,
     ):
         """
         Use this rather than pageserver_api() when you need to include shard parameters
@@ -2135,6 +2136,8 @@ class NeonStorageController(MetricsGetter):
             for k, v in tenant_config.items():
                 body[k] = v
 
+        body["placement_policy"] = placement_policy
+
         response = self.request(
             "POST",
             f"{self.env.storage_controller_api}/v1/tenant",
@@ -2193,6 +2196,34 @@ class NeonStorageController(MetricsGetter):
         log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}")
         assert self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id
 
+    def tenant_policy_update(self, tenant_id: TenantId, body: dict[str, Any]):
+        log.info(f"tenant_policy_update({tenant_id}, {body})")
+        self.request(
+            "PUT",
+            f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/policy",
+            json=body,
+            headers=self.headers(TokenScope.ADMIN),
+        )
+
+    def reconcile_all(self):
+        r = self.request(
+            "POST",
+            f"{self.env.storage_controller_api}/debug/v1/reconcile_all",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        r.raise_for_status()
+        n = r.json()
+        log.info(f"reconcile_all waited for {n} shards")
+        return n
+
+    def reconcile_until_idle(self, timeout_secs=30):
+        start_at = time.time()
+        n = 1
+        while n > 0:
+            n = self.reconcile_all()
+            if time.time() - start_at > timeout_secs:
+                raise RuntimeError("Timeout in reconcile_until_idle")
+
     def consistency_check(self):
         """
         Throw an exception if the service finds any inconsistencies in its state
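The new `reconcile_until_idle` helper is not exercised by the regression test below; a hypothetical use in a test (sketch only, assuming the usual `neon_env_builder` fixture and `init_start()` entry point) would look like:

```python
from fixtures.neon_fixtures import NeonEnvBuilder


def test_quiesce_example(neon_env_builder: NeonEnvBuilder):
    # Hypothetical sketch: after making placement or policy changes, drain every
    # pending reconcile, then check the controller's state is internally consistent.
    env = neon_env_builder.init_start()
    env.storage_controller.reconcile_until_idle(timeout_secs=60)
    env.storage_controller.consistency_check()
```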
@@ -1015,3 +1015,98 @@ def test_sharding_service_re_attach(neon_env_builder: NeonEnvBuilder):
         "storage_controller_reconcile_complete_total", filter={"status": "ok"}
     )
     assert reconciles_after_restart == reconciles_before_restart
+
+
+def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBuilder):
+    """
+    Check that emergency hooks for disabling rogue tenants' reconcilers work as expected.
+    """
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+
+    env.storage_controller.allowed_errors.extend(
+        [
+            # We will intentionally cause reconcile errors
+            ".*Reconcile error.*",
+            # Message from using a scheduling policy
+            ".*Scheduling is disabled by policy.*",
+            ".*Skipping reconcile for policy.*",
+            # Message from a node being offline
+            ".*Call to node .* management API .* failed",
+        ]
+    )
+
+    # Stop pageserver so that reconcile cannot complete
+    env.pageserver.stop()
+
+    env.storage_controller.tenant_create(tenant_id, placement_policy="Detached")
+
+    # Try attaching it: we should see reconciles failing
+    env.storage_controller.tenant_policy_update(
+        tenant_id,
+        {
+            "placement": {"Attached": 0},
+        },
+    )
+
+    def reconcile_errors() -> int:
+        return int(
+            env.storage_controller.get_metric_value(
+                "storage_controller_reconcile_complete_total", filter={"status": "error"}
+            )
+            or 0
+        )
+
+    def reconcile_ok() -> int:
+        return int(
+            env.storage_controller.get_metric_value(
+                "storage_controller_reconcile_complete_total", filter={"status": "ok"}
+            )
+            or 0
+        )
+
+    def assert_errors_gt(n) -> int:
+        e = reconcile_errors()
+        assert e > n
+        return e
+
+    errs = wait_until(10, 1, lambda: assert_errors_gt(0))
+
+    # Try reconciling again, it should fail again
+    with pytest.raises(StorageControllerApiException):
+        env.storage_controller.reconcile_all()
+    errs = wait_until(10, 1, lambda: assert_errors_gt(errs))
+
+    # Configure the tenant to disable reconciles
+    env.storage_controller.tenant_policy_update(
+        tenant_id,
+        {
+            "scheduling": "Stop",
+        },
+    )
+
+    # Try reconciling again, it should not cause an error (silently skip)
+    env.storage_controller.reconcile_all()
+    assert reconcile_errors() == errs
+
+    # Start the pageserver and re-enable reconciles
+    env.pageserver.start()
+    env.storage_controller.tenant_policy_update(
+        tenant_id,
+        {
+            "scheduling": "Active",
+        },
+    )
+
+    def assert_ok_gt(n) -> int:
+        o = reconcile_ok()
+        assert o > n
+        return o
+
+    # We should see a successful reconciliation
+    wait_until(10, 1, lambda: assert_ok_gt(0))
+
+    # And indeed the tenant should be attached
+    assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1