Merge remote-tracking branch 'origin/main' into communicator-rewrite

2026-05-22 07:30:37 +00:00 · 2025-07-05 16:59:51 +03:00
parent f3a6c0d8ff b568189f7b
commit e14bb4be39
141 changed files with 5475 additions and 2033 deletions
--- a/storage_controller/src/background_node_operations.rs
+++ b/storage_controller/src/background_node_operations.rs
@@ -6,6 +6,11 @@ use utils::id::NodeId;

 pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 64;

+#[derive(Copy, Clone)]
+pub(crate) struct Delete {
+    pub(crate) node_id: NodeId,
+}
+
 #[derive(Copy, Clone)]
 pub(crate) struct Drain {
    pub(crate) node_id: NodeId,
@@ -18,6 +23,7 @@ pub(crate) struct Fill {

 #[derive(Copy, Clone)]
 pub(crate) enum Operation {
+    Delete(Delete),
    Drain(Drain),
    Fill(Fill),
 }
@@ -30,6 +36,8 @@ pub(crate) enum OperationError {
    FinalizeError(Cow<'static, str>),
    #[error("Operation cancelled")]
    Cancelled,
+    #[error("Impossible constraint error: {0}")]
+    ImpossibleConstraint(Cow<'static, str>),
 }

 pub(crate) struct OperationHandler {
@@ -38,6 +46,12 @@ pub(crate) struct OperationHandler {
    pub(crate) cancel: CancellationToken,
 }

+impl Display for Delete {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "delete {}", self.node_id)
+    }
+}
+
 impl Display for Drain {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "drain {}", self.node_id)
@@ -53,6 +67,7 @@ impl Display for Fill {
 impl Display for Operation {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
+            Operation::Delete(op) => write!(f, "{op}"),
            Operation::Drain(op) => write!(f, "{op}"),
            Operation::Fill(op) => write!(f, "{op}"),
        }
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -919,7 +919,7 @@ async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError
    json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
 }

-async fn handle_node_delete(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_node_delete_old(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

    let req = match maybe_forward(req).await {
@@ -931,7 +931,10 @@ async fn handle_node_delete(req: Request<Body>) -> Result<Response<Body>, ApiErr

    let state = get_state(&req);
    let node_id: NodeId = parse_request_param(&req, "node_id")?;
-    json_response(StatusCode::OK, state.service.node_delete(node_id).await?)
+    json_response(
+        StatusCode::OK,
+        state.service.node_delete_old(node_id).await?,
+    )
 }

 async fn handle_tombstone_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -1051,6 +1054,42 @@ async fn handle_get_leader(req: Request<Body>) -> Result<Response<Body>, ApiErro
    json_response(StatusCode::OK, leader)
 }

+async fn handle_node_delete(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Admin)?;
+
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
+    let state = get_state(&req);
+    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+    json_response(
+        StatusCode::OK,
+        state.service.start_node_delete(node_id).await?,
+    )
+}
+
+async fn handle_cancel_node_delete(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Infra)?;
+
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
+    let state = get_state(&req);
+    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+    json_response(
+        StatusCode::ACCEPTED,
+        state.service.cancel_node_delete(node_id).await?,
+    )
+}
+
 async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Infra)?;

@@ -2221,8 +2260,14 @@ pub fn make_router(
        .post("/control/v1/node", |r| {
            named_request_span(r, handle_node_register, RequestName("control_v1_node"))
        })
+        // This endpoint is deprecated and will be removed in a future version.
+        // Use PUT /control/v1/node/:node_id/delete instead.
        .delete("/control/v1/node/:node_id", |r| {
-            named_request_span(r, handle_node_delete, RequestName("control_v1_node_delete"))
+            named_request_span(
+                r,
+                handle_node_delete_old,
+                RequestName("control_v1_node_delete"),
+            )
        })
        .get("/control/v1/node", |r| {
            named_request_span(r, handle_node_list, RequestName("control_v1_node"))
@@ -2247,6 +2292,20 @@ pub fn make_router(
        .get("/control/v1/leader", |r| {
            named_request_span(r, handle_get_leader, RequestName("control_v1_get_leader"))
        })
+        .put("/control/v1/node/:node_id/delete", |r| {
+            named_request_span(
+                r,
+                handle_node_delete,
+                RequestName("control_v1_start_node_delete"),
+            )
+        })
+        .delete("/control/v1/node/:node_id/delete", |r| {
+            named_request_span(
+                r,
+                handle_cancel_node_delete,
+                RequestName("control_v1_cancel_node_delete"),
+            )
+        })
        .put("/control/v1/node/:node_id/drain", |r| {
            named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain"))
        })
--- a/storage_controller/src/lib.rs
+++ b/storage_controller/src/lib.rs
@@ -6,13 +6,13 @@ extern crate hyper0 as hyper;
 mod auth;
 mod background_node_operations;
 mod compute_hook;
-mod drain_utils;
 mod heartbeater;
 pub mod http;
 mod id_lock_map;
 mod leadership;
 pub mod metrics;
 mod node;
+mod operation_utils;
 mod pageserver_client;
 mod peer_client;
 pub mod persistence;
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -76,6 +76,9 @@ pub(crate) struct StorageControllerMetricGroup {
    /// How many shards would like to reconcile but were blocked by concurrency limits
    pub(crate) storage_controller_pending_reconciles: measured::Gauge,

+    /// How many shards are keep-failing and will be ignored when considering to run optimizations
+    pub(crate) storage_controller_keep_failing_reconciles: measured::Gauge,
+
    /// HTTP request status counters for handled requests
    pub(crate) storage_controller_http_request_status:
        measured::CounterVec<HttpRequestStatusLabelGroupSet>,
--- a/storage_controller/src/node.rs
+++ b/storage_controller/src/node.rs
@@ -201,6 +201,7 @@ impl Node {

        match self.scheduling {
            NodeSchedulingPolicy::Active => MaySchedule::Yes(utilization),
+            NodeSchedulingPolicy::Deleting => MaySchedule::No,
            NodeSchedulingPolicy::Draining => MaySchedule::No,
            NodeSchedulingPolicy::Filling => MaySchedule::Yes(utilization),
            NodeSchedulingPolicy::Pause => MaySchedule::No,
--- a/storage_controller/src/operation_utils.rs
+++ b/storage_controller/src/operation_utils.rs
@@ -10,63 +10,19 @@ use crate::node::Node;
 use crate::scheduler::Scheduler;
 use crate::tenant_shard::TenantShard;

-pub(crate) struct TenantShardIterator<F> {
-    tenants_accessor: F,
-    inspected_all_shards: bool,
-    last_inspected_shard: Option<TenantShardId>,
-}
-
-/// A simple iterator which can be used in tandem with [`crate::service::Service`]
-/// to iterate over all known tenant shard ids without holding the lock on the
-/// service state at all times.
-impl<F> TenantShardIterator<F>
-where
-    F: Fn(Option<TenantShardId>) -> Option<TenantShardId>,
-{
-    pub(crate) fn new(tenants_accessor: F) -> Self {
-        Self {
-            tenants_accessor,
-            inspected_all_shards: false,
-            last_inspected_shard: None,
-        }
-    }
-
-    /// Returns the next tenant shard id if one exists
-    pub(crate) fn next(&mut self) -> Option<TenantShardId> {
-        if self.inspected_all_shards {
-            return None;
-        }
-
-        match (self.tenants_accessor)(self.last_inspected_shard) {
-            Some(tid) => {
-                self.last_inspected_shard = Some(tid);
-                Some(tid)
-            }
-            None => {
-                self.inspected_all_shards = true;
-                None
-            }
-        }
-    }
-
-    /// Returns true when the end of the iterator is reached and false otherwise
-    pub(crate) fn finished(&self) -> bool {
-        self.inspected_all_shards
-    }
-}
-
 /// Check that the state of the node being drained is as expected:
-/// node is present in memory and scheduling policy is set to [`NodeSchedulingPolicy::Draining`]
+/// node is present in memory and scheduling policy is set to expected_policy
 pub(crate) fn validate_node_state(
    node_id: &NodeId,
    nodes: Arc<HashMap<NodeId, Node>>,
+    expected_policy: NodeSchedulingPolicy,
 ) -> Result<(), OperationError> {
    let node = nodes.get(node_id).ok_or(OperationError::NodeStateChanged(
        format!("node {node_id} was removed").into(),
    ))?;

    let current_policy = node.get_scheduling();
-    if !matches!(current_policy, NodeSchedulingPolicy::Draining) {
+    if current_policy != expected_policy {
        // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think
        // about it
        return Err(OperationError::NodeStateChanged(
@@ -182,55 +138,3 @@ impl TenantShardDrain {
        }
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use std::sync::Arc;
-
-    use utils::id::TenantId;
-    use utils::shard::{ShardCount, ShardNumber, TenantShardId};
-
-    use super::TenantShardIterator;
-
-    #[test]
-    fn test_tenant_shard_iterator() {
-        let tenant_id = TenantId::generate();
-        let shard_count = ShardCount(8);
-
-        let mut tenant_shards = Vec::default();
-        for i in 0..shard_count.0 {
-            tenant_shards.push((
-                TenantShardId {
-                    tenant_id,
-                    shard_number: ShardNumber(i),
-                    shard_count,
-                },
-                (),
-            ))
-        }
-
-        let tenant_shards = Arc::new(tenant_shards);
-
-        let mut tid_iter = TenantShardIterator::new({
-            let tenants = tenant_shards.clone();
-            move |last_inspected_shard: Option<TenantShardId>| {
-                let entry = match last_inspected_shard {
-                    Some(skip_past) => {
-                        let mut cursor = tenants.iter().skip_while(|(tid, _)| *tid != skip_past);
-                        cursor.nth(1)
-                    }
-                    None => tenants.first(),
-                };
-
-                entry.map(|(tid, _)| tid).copied()
-            }
-        });
-
-        let mut iterated_over = Vec::default();
-        while let Some(tid) = tid_iter.next() {
-            iterated_over.push((tid, ()));
-        }
-
-        assert_eq!(iterated_over, *tenant_shards);
-    }
-}
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -635,18 +635,23 @@ impl Persistence {
        let updated = self
            .with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
                Box::pin(async move {
-                    // Check if the node is not marked as deleted
-                    let deleted_node: i64 = nodes
+                    let node: Option<NodePersistence> = nodes
                        .filter(node_id.eq(input_node_id.0 as i64))
-                        .filter(lifecycle.eq(String::from(NodeLifecycle::Deleted)))
-                        .count()
-                        .get_result(conn)
-                        .await?;
-                    if deleted_node > 0 {
-                        return Err(DatabaseError::Logical(format!(
-                            "Node {input_node_id} is marked as deleted, re-attach is not allowed"
-                        )));
-                    }
+                        .first::<NodePersistence>(conn)
+                        .await
+                        .optional()?;
+
+                    // Check if the node is not marked as deleted
+                    match node {
+                        Some(node) if matches!(NodeLifecycle::from_str(&node.lifecycle), Ok(NodeLifecycle::Deleted)) => {
+                            return Err(DatabaseError::Logical(format!(
+                                "Node {input_node_id} is marked as deleted, re-attach is not allowed"
+                            )));
+                        }
+                        _ => {
+                            // go through
+                        }
+                    };

                    let rows_updated = diesel::update(tenant_shards)
                        .filter(generation_pageserver.eq(input_node_id.0 as i64))
@@ -664,21 +669,23 @@ impl Persistence {
                        .load(conn)
                        .await?;

-                    // If the node went through a drain and restart phase before re-attaching,
-                    // then reset it's node scheduling policy to active.
-                    diesel::update(nodes)
-                        .filter(node_id.eq(input_node_id.0 as i64))
-                        .filter(
-                            scheduling_policy
-                                .eq(String::from(NodeSchedulingPolicy::PauseForRestart))
-                                .or(scheduling_policy
-                                    .eq(String::from(NodeSchedulingPolicy::Draining)))
-                                .or(scheduling_policy
-                                    .eq(String::from(NodeSchedulingPolicy::Filling))),
-                        )
-                        .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active)))
-                        .execute(conn)
-                        .await?;
+                    if let Some(node) = node {
+                        let old_scheduling_policy =
+                            NodeSchedulingPolicy::from_str(&node.scheduling_policy).unwrap();
+                        let new_scheduling_policy = match old_scheduling_policy {
+                            NodeSchedulingPolicy::Active => NodeSchedulingPolicy::Active,
+                            NodeSchedulingPolicy::PauseForRestart => NodeSchedulingPolicy::Active,
+                            NodeSchedulingPolicy::Draining => NodeSchedulingPolicy::Active,
+                            NodeSchedulingPolicy::Filling => NodeSchedulingPolicy::Active,
+                            NodeSchedulingPolicy::Pause => NodeSchedulingPolicy::Pause,
+                            NodeSchedulingPolicy::Deleting => NodeSchedulingPolicy::Pause,
+                        };
+                        diesel::update(nodes)
+                            .filter(node_id.eq(input_node_id.0 as i64))
+                            .set(scheduling_policy.eq(String::from(new_scheduling_policy)))
+                            .execute(conn)
+                            .await?;
+                    }

                    Ok(updated)
                })
@@ -1388,6 +1395,48 @@ impl Persistence {
        .await
    }

+    /// Activate the given safekeeper, ensuring that there is no TOCTOU.
+    /// Returns `Some` if the safekeeper has indeed been activating (or already active). Other states return `None`.
+    pub(crate) async fn activate_safekeeper(&self, id_: i64) -> Result<Option<()>, DatabaseError> {
+        use crate::schema::safekeepers::dsl::*;
+
+        self.with_conn(move |conn| {
+            Box::pin(async move {
+                #[derive(Insertable, AsChangeset)]
+                #[diesel(table_name = crate::schema::safekeepers)]
+                struct UpdateSkSchedulingPolicy<'a> {
+                    id: i64,
+                    scheduling_policy: &'a str,
+                }
+                let scheduling_policy_active = String::from(SkSchedulingPolicy::Active);
+                let scheduling_policy_activating = String::from(SkSchedulingPolicy::Activating);
+
+                let rows_affected = diesel::update(
+                    safekeepers.filter(id.eq(id_)).filter(
+                        scheduling_policy
+                            .eq(scheduling_policy_activating)
+                            .or(scheduling_policy.eq(&scheduling_policy_active)),
+                    ),
+                )
+                .set(scheduling_policy.eq(&scheduling_policy_active))
+                .execute(conn)
+                .await?;
+
+                if rows_affected == 0 {
+                    return Ok(Some(()));
+                }
+                if rows_affected != 1 {
+                    return Err(DatabaseError::Logical(format!(
+                        "unexpected number of rows ({rows_affected})",
+                    )));
+                }
+
+                Ok(Some(()))
+            })
+        })
+        .await
+    }
+
    /// Persist timeline. Returns if the timeline was newly inserted. If it wasn't, we haven't done any writes.
    pub(crate) async fn insert_timeline(&self, entry: TimelinePersistence) -> DatabaseResult<bool> {
        use crate::schema::timelines;
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -1,8 +1,8 @@
 pub mod chaos_injector;
-mod context_iterator;
 pub mod feature_flag;
 pub(crate) mod safekeeper_reconciler;
 mod safekeeper_service;
+mod tenant_shard_iterator;

 use std::borrow::Cow;
 use std::cmp::Ordering;
@@ -16,7 +16,6 @@ use std::sync::{Arc, OnceLock};
 use std::time::{Duration, Instant, SystemTime};

 use anyhow::Context;
-use context_iterator::TenantShardContextIterator;
 use control_plane::storage_controller::{
    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
 };
@@ -31,8 +30,8 @@ use pageserver_api::controller_api::{
    AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability,
    NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy,
    ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
-    TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
-    TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
+    SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard,
+    TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
    TenantShardMigrateRequest, TenantShardMigrateResponse,
 };
 use pageserver_api::models::{
@@ -55,6 +54,7 @@ use pageserver_client::{BlockUnblock, mgmt_api};
 use reqwest::{Certificate, StatusCode};
 use safekeeper_api::models::SafekeeperUtilization;
 use safekeeper_reconciler::SafekeeperReconcilers;
+use tenant_shard_iterator::{TenantShardExclusiveIterator, create_shared_shard_iterator};
 use tokio::sync::TryAcquireError;
 use tokio::sync::mpsc::error::TrySendError;
 use tokio_util::sync::CancellationToken;
@@ -68,10 +68,9 @@ use utils::sync::gate::{Gate, GateGuard};
 use utils::{failpoint_support, pausable_failpoint};

 use crate::background_node_operations::{
-    Drain, Fill, MAX_RECONCILES_PER_OPERATION, Operation, OperationError, OperationHandler,
+    Delete, Drain, Fill, MAX_RECONCILES_PER_OPERATION, Operation, OperationError, OperationHandler,
 };
 use crate::compute_hook::{self, ComputeHook, NotifyError};
-use crate::drain_utils::{self, TenantShardDrain, TenantShardIterator};
 use crate::heartbeater::{Heartbeater, PageserverState, SafekeeperState};
 use crate::id_lock_map::{
    IdLockMap, TracingExclusiveGuard, trace_exclusive_lock, trace_shared_lock,
@@ -79,6 +78,7 @@ use crate::id_lock_map::{
 use crate::leadership::Leadership;
 use crate::metrics;
 use crate::node::{AvailabilityTransition, Node};
+use crate::operation_utils::{self, TenantShardDrain};
 use crate::pageserver_client::PageserverClient;
 use crate::peer_client::GlobalObservedState;
 use crate::persistence::split_state::SplitState;
@@ -105,7 +105,7 @@ use crate::timeline_import::{
    TimelineImportFinalizeError, TimelineImportState, UpcallClient,
 };

-const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500);
+const WAITER_OPERATION_POLL_TIMEOUT: Duration = Duration::from_millis(500);

 // For operations that should be quick, like attaching a new tenant
 const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
@@ -210,6 +210,10 @@ pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
 pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256;
 pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32;

+// Number of consecutive reconciliation errors, occured for one shard,
+// after which the shard is ignored when considering to run optimizations.
+const MAX_CONSECUTIVE_RECONCILIATION_ERRORS: usize = 5;
+
 // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately.
 // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly
 // than they're being pushed onto the queue.
@@ -577,7 +581,9 @@ impl From<ReconcileWaitError> for ApiError {
 impl From<OperationError> for ApiError {
    fn from(value: OperationError) -> Self {
        match value {
-            OperationError::NodeStateChanged(err) | OperationError::FinalizeError(err) => {
+            OperationError::NodeStateChanged(err)
+            | OperationError::FinalizeError(err)
+            | OperationError::ImpossibleConstraint(err) => {
                ApiError::InternalServerError(anyhow::anyhow!(err))
            }
            OperationError::Cancelled => ApiError::Conflict("Operation was cancelled".into()),
@@ -702,6 +708,36 @@ struct ShardMutationLocations {
 #[derive(Default, Clone)]
 struct TenantMutationLocations(BTreeMap<TenantShardId, ShardMutationLocations>);

+struct ReconcileAllResult {
+    spawned_reconciles: usize,
+    keep_failing_reconciles: usize,
+    has_delayed_reconciles: bool,
+}
+
+impl ReconcileAllResult {
+    fn new(
+        spawned_reconciles: usize,
+        keep_failing_reconciles: usize,
+        has_delayed_reconciles: bool,
+    ) -> Self {
+        assert!(
+            spawned_reconciles >= keep_failing_reconciles,
+            "It is impossible to have more keep-failing reconciles than spawned reconciles"
+        );
+        Self {
+            spawned_reconciles,
+            keep_failing_reconciles,
+            has_delayed_reconciles,
+        }
+    }
+
+    /// We can run optimizations only if we don't have any delayed reconciles and
+    /// all spawned reconciles are also keep-failing reconciles.
+    fn can_run_optimizations(&self) -> bool {
+        !self.has_delayed_reconciles && self.spawned_reconciles == self.keep_failing_reconciles
+    }
+}
+
 impl Service {
    pub fn get_config(&self) -> &Config {
        &self.config
@@ -899,7 +935,7 @@ impl Service {
        // which require it: under normal circumstances this should only include tenants that were in some
        // transient state before we restarted, or any tenants whose compute hooks failed above.
        tracing::info!("Checking for shards in need of reconciliation...");
-        let reconcile_tasks = self.reconcile_all();
+        let reconcile_all_result = self.reconcile_all();
        // We will not wait for these reconciliation tasks to run here: we're now done with startup and
        // normal operations may proceed.

@@ -947,8 +983,9 @@ impl Service {
            }
        }

+        let spawned_reconciles = reconcile_all_result.spawned_reconciles;
        tracing::info!(
-            "Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"
+            "Startup complete, spawned {spawned_reconciles} reconciliation tasks ({shard_count} shards total)"
        );
    }

@@ -1199,8 +1236,8 @@ impl Service {
        while !self.reconcilers_cancel.is_cancelled() {
            tokio::select! {
              _ = interval.tick() => {
-                let reconciles_spawned = self.reconcile_all();
-                if reconciles_spawned == 0 {
+                let reconcile_all_result = self.reconcile_all();
+                if reconcile_all_result.can_run_optimizations() {
                    // Run optimizer only when we didn't find any other work to do
                    self.optimize_all().await;
                }
@@ -1214,7 +1251,7 @@ impl Service {
    }
    /// Heartbeat all storage nodes once in a while.
    #[instrument(skip_all)]
-    async fn spawn_heartbeat_driver(&self) {
+    async fn spawn_heartbeat_driver(self: &Arc<Self>) {
        self.startup_complete.clone().wait().await;

        let mut interval = tokio::time::interval(self.config.heartbeat_interval);
@@ -1341,18 +1378,51 @@ impl Service {
                }
            }
            if let Ok(deltas) = res_sk {
-                let mut locked = self.inner.write().unwrap();
-                let mut safekeepers = (*locked.safekeepers).clone();
-                for (id, state) in deltas.0 {
-                    let Some(sk) = safekeepers.get_mut(&id) else {
-                        tracing::info!(
-                            "Couldn't update safekeeper safekeeper state for id {id} from heartbeat={state:?}"
-                        );
-                        continue;
-                    };
-                    sk.set_availability(state);
+                let mut to_activate = Vec::new();
+                {
+                    let mut locked = self.inner.write().unwrap();
+                    let mut safekeepers = (*locked.safekeepers).clone();
+
+                    for (id, state) in deltas.0 {
+                        let Some(sk) = safekeepers.get_mut(&id) else {
+                            tracing::info!(
+                                "Couldn't update safekeeper safekeeper state for id {id} from heartbeat={state:?}"
+                            );
+                            continue;
+                        };
+                        if sk.scheduling_policy() == SkSchedulingPolicy::Activating
+                            && let SafekeeperState::Available { .. } = state
+                        {
+                            to_activate.push(id);
+                        }
+                        sk.set_availability(state);
+                    }
+                    locked.safekeepers = Arc::new(safekeepers);
+                }
+                for sk_id in to_activate {
+                    // TODO this can race with set_scheduling_policy (can create disjoint DB <-> in-memory state)
+                    tracing::info!("Activating safekeeper {sk_id}");
+                    match self.persistence.activate_safekeeper(sk_id.0 as i64).await {
+                        Ok(Some(())) => {}
+                        Ok(None) => {
+                            tracing::info!(
+                                "safekeeper {sk_id} has been removed from db or has different scheduling policy than active or activating"
+                            );
+                        }
+                        Err(e) => {
+                            tracing::warn!("couldn't apply activation of {sk_id} to db: {e}");
+                            continue;
+                        }
+                    }
+                    if let Err(e) = self
+                        .set_safekeeper_scheduling_policy_in_mem(sk_id, SkSchedulingPolicy::Active)
+                        .await
+                    {
+                        tracing::info!("couldn't activate safekeeper {sk_id} in memory: {e}");
+                        continue;
+                    }
+                    tracing::info!("Activation of safekeeper {sk_id} done");
                }
-                locked.safekeepers = Arc::new(safekeepers);
            }
        }
    }
@@ -1408,6 +1478,7 @@ impl Service {

        match result.result {
            Ok(()) => {
+                tenant.consecutive_errors_count = 0;
                tenant.apply_observed_deltas(deltas);
                tenant.waiter.advance(result.sequence);
            }
@@ -1426,6 +1497,8 @@ impl Service {
                    }
                }

+                tenant.consecutive_errors_count = tenant.consecutive_errors_count.saturating_add(1);
+
                // Ordering: populate last_error before advancing error_seq,
                // so that waiters will see the correct error after waiting.
                tenant.set_last_error(result.sequence, e);
@@ -2343,6 +2416,7 @@ impl Service {
                NodeSchedulingPolicy::PauseForRestart
                    | NodeSchedulingPolicy::Draining
                    | NodeSchedulingPolicy::Filling
+                    | NodeSchedulingPolicy::Deleting
            );

            let mut new_nodes = (**nodes).clone();
@@ -6984,7 +7058,7 @@ impl Service {
    /// If a node has any work on it, it will be rescheduled: this is "clean" in the sense
    /// that we don't leave any bad state behind in the storage controller, but unclean
    /// in the sense that we are not carefully draining the node.
-    pub(crate) async fn node_delete(&self, node_id: NodeId) -> Result<(), ApiError> {
+    pub(crate) async fn node_delete_old(&self, node_id: NodeId) -> Result<(), ApiError> {
        let _node_lock =
            trace_exclusive_lock(&self.node_op_locks, node_id, NodeOperations::Delete).await;

@@ -7018,7 +7092,7 @@ impl Service {
            }

            for (_tenant_id, mut schedule_context, shards) in
-                TenantShardContextIterator::new(tenants, ScheduleMode::Normal)
+                TenantShardExclusiveIterator::new(tenants, ScheduleMode::Normal)
            {
                for shard in shards {
                    if shard.deref_node(node_id) {
@@ -7087,6 +7161,171 @@ impl Service {
        Ok(())
    }

+    pub(crate) async fn delete_node(
+        self: &Arc<Self>,
+        node_id: NodeId,
+        policy_on_start: NodeSchedulingPolicy,
+        cancel: CancellationToken,
+    ) -> Result<(), OperationError> {
+        let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal).build();
+
+        let mut waiters: Vec<ReconcilerWaiter> = Vec::new();
+        let mut tid_iter = create_shared_shard_iterator(self.clone());
+
+        while !tid_iter.finished() {
+            if cancel.is_cancelled() {
+                match self
+                    .node_configure(node_id, None, Some(policy_on_start))
+                    .await
+                {
+                    Ok(()) => return Err(OperationError::Cancelled),
+                    Err(err) => {
+                        return Err(OperationError::FinalizeError(
+                            format!(
+                                "Failed to finalise delete cancel of {} by setting scheduling policy to {}: {}",
+                                node_id, String::from(policy_on_start), err
+                            )
+                            .into(),
+                        ));
+                    }
+                }
+            }
+
+            operation_utils::validate_node_state(
+                &node_id,
+                self.inner.read().unwrap().nodes.clone(),
+                NodeSchedulingPolicy::Deleting,
+            )?;
+
+            while waiters.len() < MAX_RECONCILES_PER_OPERATION {
+                let tid = match tid_iter.next() {
+                    Some(tid) => tid,
+                    None => {
+                        break;
+                    }
+                };
+
+                let mut locked = self.inner.write().unwrap();
+                let (nodes, tenants, scheduler) = locked.parts_mut();
+
+                let tenant_shard = match tenants.get_mut(&tid) {
+                    Some(tenant_shard) => tenant_shard,
+                    None => {
+                        // Tenant shard was deleted by another operation. Skip it.
+                        continue;
+                    }
+                };
+
+                match tenant_shard.get_scheduling_policy() {
+                    ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {
+                        // A migration during delete is classed as 'essential' because it is required to
+                        // uphold our availability goals for the tenant: this shard is elegible for migration.
+                    }
+                    ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => {
+                        // If we have been asked to avoid rescheduling this shard, then do not migrate it during a deletion
+                        tracing::warn!(
+                            tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
+                            "Skip migration during deletion because shard scheduling policy {:?} disallows it",
+                            tenant_shard.get_scheduling_policy(),
+                        );
+                        continue;
+                    }
+                }
+
+                if tenant_shard.deref_node(node_id) {
+                    // TODO(ephemeralsad): we should process all shards in a tenant at once, so
+                    // we can avoid settling the tenant unevenly.
+                    let mut schedule_context = ScheduleContext::new(ScheduleMode::Normal);
+                    if let Err(e) = tenant_shard.schedule(scheduler, &mut schedule_context) {
+                        tracing::error!(
+                            "Refusing to delete node, shard {} can't be rescheduled: {e}",
+                            tenant_shard.tenant_shard_id
+                        );
+                        return Err(OperationError::ImpossibleConstraint(e.to_string().into()));
+                    } else {
+                        tracing::info!(
+                            "Rescheduled shard {} away from node during deletion",
+                            tenant_shard.tenant_shard_id
+                        )
+                    }
+
+                    let waiter = self.maybe_configured_reconcile_shard(
+                        tenant_shard,
+                        nodes,
+                        reconciler_config,
+                    );
+                    if let Some(some) = waiter {
+                        waiters.push(some);
+                    }
+                }
+            }
+
+            waiters = self
+                .await_waiters_remainder(waiters, WAITER_OPERATION_POLL_TIMEOUT)
+                .await;
+
+            failpoint_support::sleep_millis_async!("sleepy-delete-loop", &cancel);
+        }
+
+        while !waiters.is_empty() {
+            if cancel.is_cancelled() {
+                match self
+                    .node_configure(node_id, None, Some(policy_on_start))
+                    .await
+                {
+                    Ok(()) => return Err(OperationError::Cancelled),
+                    Err(err) => {
+                        return Err(OperationError::FinalizeError(
+                            format!(
+                                "Failed to finalise drain cancel of {} by setting scheduling policy to {}: {}",
+                                node_id, String::from(policy_on_start), err
+                            )
+                            .into(),
+                        ));
+                    }
+                }
+            }
+
+            tracing::info!("Awaiting {} pending delete reconciliations", waiters.len());
+
+            waiters = self
+                .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
+                .await;
+        }
+
+        self.persistence
+            .set_tombstone(node_id)
+            .await
+            .map_err(|e| OperationError::FinalizeError(e.to_string().into()))?;
+
+        {
+            let mut locked = self.inner.write().unwrap();
+            let (nodes, _, scheduler) = locked.parts_mut();
+
+            scheduler.node_remove(node_id);
+
+            let mut nodes_mut = (**nodes).clone();
+            if let Some(mut removed_node) = nodes_mut.remove(&node_id) {
+                // Ensure that any reconciler holding an Arc<> to this node will
+                // drop out when trying to RPC to it (setting Offline state sets the
+                // cancellation token on the Node object).
+                removed_node.set_availability(NodeAvailability::Offline);
+            }
+            *nodes = Arc::new(nodes_mut);
+
+            metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_pageserver_nodes
+                .set(nodes.len() as i64);
+            metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_https_pageserver_nodes
+                .set(nodes.values().filter(|n| n.has_https_port()).count() as i64);
+        }
+
+        Ok(())
+    }
+
    pub(crate) async fn node_list(&self) -> Result<Vec<Node>, ApiError> {
        let nodes = {
            self.inner
@@ -7475,7 +7714,7 @@ impl Service {
                let mut tenants_affected: usize = 0;

                for (_tenant_id, mut schedule_context, shards) in
-                    TenantShardContextIterator::new(tenants, ScheduleMode::Normal)
+                    TenantShardExclusiveIterator::new(tenants, ScheduleMode::Normal)
                {
                    for tenant_shard in shards {
                        let tenant_shard_id = tenant_shard.tenant_shard_id;
@@ -7646,6 +7885,142 @@ impl Service {
        self.node_configure(node_id, availability, scheduling).await
    }

+    pub(crate) async fn start_node_delete(
+        self: &Arc<Self>,
+        node_id: NodeId,
+    ) -> Result<(), ApiError> {
+        let (ongoing_op, node_policy, schedulable_nodes_count) = {
+            let locked = self.inner.read().unwrap();
+            let nodes = &locked.nodes;
+            let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
+                anyhow::anyhow!("Node {} not registered", node_id).into(),
+            ))?;
+            let schedulable_nodes_count = nodes
+                .iter()
+                .filter(|(_, n)| matches!(n.may_schedule(), MaySchedule::Yes(_)))
+                .count();
+
+            (
+                locked
+                    .ongoing_operation
+                    .as_ref()
+                    .map(|ongoing| ongoing.operation),
+                node.get_scheduling(),
+                schedulable_nodes_count,
+            )
+        };
+
+        if let Some(ongoing) = ongoing_op {
+            return Err(ApiError::PreconditionFailed(
+                format!("Background operation already ongoing for node: {ongoing}").into(),
+            ));
+        }
+
+        if schedulable_nodes_count == 0 {
+            return Err(ApiError::PreconditionFailed(
+                "No other schedulable nodes to move shards".into(),
+            ));
+        }
+
+        match node_policy {
+            NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Pause => {
+                self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Deleting))
+                    .await?;
+
+                let cancel = self.cancel.child_token();
+                let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
+                let policy_on_start = node_policy;
+
+                self.inner.write().unwrap().ongoing_operation = Some(OperationHandler {
+                    operation: Operation::Delete(Delete { node_id }),
+                    cancel: cancel.clone(),
+                });
+
+                let span = tracing::info_span!(parent: None, "delete_node", %node_id);
+
+                tokio::task::spawn(
+                    {
+                        let service = self.clone();
+                        let cancel = cancel.clone();
+                        async move {
+                            let _gate_guard = gate_guard;
+
+                            scopeguard::defer! {
+                                let prev = service.inner.write().unwrap().ongoing_operation.take();
+
+                                if let Some(Operation::Delete(removed_delete)) = prev.map(|h| h.operation) {
+                                    assert_eq!(removed_delete.node_id, node_id, "We always take the same operation");
+                                } else {
+                                    panic!("We always remove the same operation")
+                                }
+                            }
+
+                            tracing::info!("Delete background operation starting");
+                            let res = service
+                                .delete_node(node_id, policy_on_start, cancel)
+                                .await;
+                            match res {
+                                Ok(()) => {
+                                    tracing::info!(
+                                        "Delete background operation completed successfully"
+                                    );
+                                }
+                                Err(OperationError::Cancelled) => {
+                                    tracing::info!("Delete background operation was cancelled");
+                                }
+                                Err(err) => {
+                                    tracing::error!(
+                                        "Delete background operation encountered: {err}"
+                                    )
+                                }
+                            }
+                        }
+                    }
+                    .instrument(span),
+                );
+            }
+            NodeSchedulingPolicy::Deleting => {
+                return Err(ApiError::Conflict(format!(
+                    "Node {node_id} has delete in progress"
+                )));
+            }
+            policy => {
+                return Err(ApiError::PreconditionFailed(
+                    format!("Node {node_id} cannot be deleted due to {policy:?} policy").into(),
+                ));
+            }
+        }
+
+        Ok(())
+    }
+
+    pub(crate) async fn cancel_node_delete(
+        self: &Arc<Self>,
+        node_id: NodeId,
+    ) -> Result<(), ApiError> {
+        {
+            let locked = self.inner.read().unwrap();
+            let nodes = &locked.nodes;
+            nodes.get(&node_id).ok_or(ApiError::NotFound(
+                anyhow::anyhow!("Node {} not registered", node_id).into(),
+            ))?;
+        }
+
+        if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
+            if let Operation::Delete(delete) = op_handler.operation {
+                if delete.node_id == node_id {
+                    tracing::info!("Cancelling background delete operation for node {node_id}");
+                    op_handler.cancel.cancel();
+                    return Ok(());
+                }
+            }
+        }
+
+        Err(ApiError::PreconditionFailed(
+            format!("Node {node_id} has no delete in progress").into(),
+        ))
+    }
+
    pub(crate) async fn start_node_drain(
        self: &Arc<Self>,
        node_id: NodeId,
@@ -8026,7 +8401,7 @@ impl Service {
    /// Returns how many reconciliation tasks were started, or `1` if no reconciles were
    /// spawned but some _would_ have been spawned if `reconciler_concurrency` units where
    /// available.  A return value of 0 indicates that everything is fully reconciled already.
-    fn reconcile_all(&self) -> usize {
+    fn reconcile_all(&self) -> ReconcileAllResult {
        let mut locked = self.inner.write().unwrap();
        let (nodes, tenants, scheduler) = locked.parts_mut();
        let pageservers = nodes.clone();
@@ -8034,13 +8409,16 @@ impl Service {
        // This function is an efficient place to update lazy statistics, since we are walking
        // all tenants.
        let mut pending_reconciles = 0;
+        let mut keep_failing_reconciles = 0;
        let mut az_violations = 0;

        // If we find any tenants to drop from memory, stash them to offload after
        // we're done traversing the map of tenants.
        let mut drop_detached_tenants = Vec::new();

-        let mut reconciles_spawned = 0;
+        let mut spawned_reconciles = 0;
+        let mut has_delayed_reconciles = false;
+
        for shard in tenants.values_mut() {
            // Accumulate scheduling statistics
            if let (Some(attached), Some(preferred)) =
@@ -8060,18 +8438,32 @@ impl Service {
                // If there is something delayed, then return a nonzero count so that
                // callers like reconcile_all_now do not incorrectly get the impression
                // that the system is in a quiescent state.
-                reconciles_spawned = std::cmp::max(1, reconciles_spawned);
+                has_delayed_reconciles = true;
                pending_reconciles += 1;
                continue;
            }

            // Eventual consistency: if an earlier reconcile job failed, and the shard is still
            // dirty, spawn another one
+            let consecutive_errors_count = shard.consecutive_errors_count;
            if self
                .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal)
                .is_some()
            {
-                reconciles_spawned += 1;
+                spawned_reconciles += 1;
+
+                // Count shards that are keep-failing. We still want to reconcile them
+                // to avoid a situation where a shard is stuck.
+                // But we don't want to consider them when deciding to run optimizations.
+                if consecutive_errors_count >= MAX_CONSECUTIVE_RECONCILIATION_ERRORS {
+                    tracing::warn!(
+                        tenant_id=%shard.tenant_shard_id.tenant_id,
+                        shard_id=%shard.tenant_shard_id.shard_slug(),
+                        "Shard reconciliation is keep-failing: {} errors",
+                        consecutive_errors_count
+                    );
+                    keep_failing_reconciles += 1;
+                }
            } else if shard.delayed_reconcile {
                // Shard wanted to reconcile but for some reason couldn't.
                pending_reconciles += 1;
@@ -8110,7 +8502,16 @@ impl Service {
            .storage_controller_pending_reconciles
            .set(pending_reconciles as i64);

-        reconciles_spawned
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_keep_failing_reconciles
+            .set(keep_failing_reconciles as i64);
+
+        ReconcileAllResult::new(
+            spawned_reconciles,
+            keep_failing_reconciles,
+            has_delayed_reconciles,
+        )
    }

    /// `optimize` in this context means identifying shards which have valid scheduled locations, but
@@ -8196,7 +8597,7 @@ impl Service {
        // to ignore the utilisation component of the score.

        for (_tenant_id, schedule_context, shards) in
-            TenantShardContextIterator::new(tenants, ScheduleMode::Speculative)
+            TenantShardExclusiveIterator::new(tenants, ScheduleMode::Speculative)
        {
            for shard in shards {
                if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS {
@@ -8783,13 +9184,13 @@ impl Service {
    /// also wait for any generated Reconcilers to complete.  Calling this until it returns zero should
    /// put the system into a quiescent state where future background reconciliations won't do anything.
    pub(crate) async fn reconcile_all_now(&self) -> Result<usize, ReconcileWaitError> {
-        let reconciles_spawned = self.reconcile_all();
-        let reconciles_spawned = if reconciles_spawned == 0 {
+        let reconcile_all_result = self.reconcile_all();
+        let mut spawned_reconciles = reconcile_all_result.spawned_reconciles;
+        if reconcile_all_result.can_run_optimizations() {
            // Only optimize when we are otherwise idle
-            self.optimize_all().await
-        } else {
-            reconciles_spawned
-        };
+            let optimization_reconciles = self.optimize_all().await;
+            spawned_reconciles += optimization_reconciles;
+        }

        let waiters = {
            let mut waiters = Vec::new();
@@ -8826,11 +9227,11 @@ impl Service {

        tracing::info!(
            "{} reconciles in reconcile_all, {} waiters",
-            reconciles_spawned,
+            spawned_reconciles,
            waiter_count
        );

-        Ok(std::cmp::max(waiter_count, reconciles_spawned))
+        Ok(std::cmp::max(waiter_count, spawned_reconciles))
    }

    async fn stop_reconciliations(&self, reason: StopReconciliationsReason) {
@@ -8923,25 +9324,7 @@ impl Service {

        let mut waiters = Vec::new();

-        let mut tid_iter = TenantShardIterator::new({
-            let service = self.clone();
-            move |last_inspected_shard: Option<TenantShardId>| {
-                let locked = &service.inner.read().unwrap();
-                let tenants = &locked.tenants;
-                let entry = match last_inspected_shard {
-                    Some(skip_past) => {
-                        // Skip to the last seen tenant shard id
-                        let mut cursor = tenants.iter().skip_while(|(tid, _)| **tid != skip_past);
-
-                        // Skip past the last seen
-                        cursor.nth(1)
-                    }
-                    None => tenants.first_key_value(),
-                };
-
-                entry.map(|(tid, _)| tid).copied()
-            }
-        });
+        let mut tid_iter = create_shared_shard_iterator(self.clone());

        while !tid_iter.finished() {
            if cancel.is_cancelled() {
@@ -8961,7 +9344,11 @@ impl Service {
                }
            }

-            drain_utils::validate_node_state(&node_id, self.inner.read().unwrap().nodes.clone())?;
+            operation_utils::validate_node_state(
+                &node_id,
+                self.inner.read().unwrap().nodes.clone(),
+                NodeSchedulingPolicy::Draining,
+            )?;

            while waiters.len() < MAX_RECONCILES_PER_OPERATION {
                let tid = match tid_iter.next() {
@@ -9041,7 +9428,7 @@ impl Service {
            }

            waiters = self
-                .await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT)
+                .await_waiters_remainder(waiters, WAITER_OPERATION_POLL_TIMEOUT)
                .await;

            failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel);
@@ -9335,7 +9722,7 @@ impl Service {
            }

            waiters = self
-                .await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT)
+                .await_waiters_remainder(waiters, WAITER_OPERATION_POLL_TIMEOUT)
                .await;
        }

--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -236,40 +236,30 @@ impl Service {
        F: std::future::Future<Output = mgmt_api::Result<T>> + Send + 'static,
        T: Sync + Send + 'static,
    {
+        let target_sk_count = safekeepers.len();
+
+        if target_sk_count == 0 {
+            return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "timeline configured without any safekeepers"
+            )));
+        }
+
+        if target_sk_count < self.config.timeline_safekeeper_count {
+            tracing::warn!(
+                "running a quorum operation with {} safekeepers, which is less than configured {} safekeepers per timeline",
+                target_sk_count,
+                self.config.timeline_safekeeper_count
+            );
+        }
+
        let results = self
            .tenant_timeline_safekeeper_op(safekeepers, op, timeout)
            .await?;

        // Now check if quorum was reached in results.

-        let target_sk_count = safekeepers.len();
-        let quorum_size = match target_sk_count {
-            0 => {
-                return Err(ApiError::InternalServerError(anyhow::anyhow!(
-                    "timeline configured without any safekeepers",
-                )));
-            }
-            1 | 2 => {
-                #[cfg(feature = "testing")]
-                {
-                    // In test settings, it is allowed to have one or two safekeepers
-                    target_sk_count
-                }
-                #[cfg(not(feature = "testing"))]
-                {
-                    // The region is misconfigured: we need at least three safekeepers to be configured
-                    // in order to schedule work to them
-                    tracing::warn!(
-                        "couldn't find at least 3 safekeepers for timeline, found: {:?}",
-                        target_sk_count
-                    );
-                    return Err(ApiError::InternalServerError(anyhow::anyhow!(
-                        "couldn't find at least 3 safekeepers to put timeline to"
-                    )));
-                }
-            }
-            _ => target_sk_count / 2 + 1,
-        };
+        let quorum_size = target_sk_count / 2 + 1;
+
        let success_count = results.iter().filter(|res| res.is_ok()).count();
        if success_count < quorum_size {
            // Failure
@@ -815,7 +805,7 @@ impl Service {
                        Safekeeper::from_persistence(
                            crate::persistence::SafekeeperPersistence::from_upsert(
                                record,
-                                SkSchedulingPolicy::Pause,
+                                SkSchedulingPolicy::Activating,
                            ),
                            CancellationToken::new(),
                            use_https,
@@ -856,27 +846,36 @@ impl Service {
            .await?;
        let node_id = NodeId(id as u64);
        // After the change has been persisted successfully, update the in-memory state
-        {
-            let mut locked = self.inner.write().unwrap();
-            let mut safekeepers = (*locked.safekeepers).clone();
-            let sk = safekeepers
-                .get_mut(&node_id)
-                .ok_or(DatabaseError::Logical("Not found".to_string()))?;
-            sk.set_scheduling_policy(scheduling_policy);
+        self.set_safekeeper_scheduling_policy_in_mem(node_id, scheduling_policy)
+            .await
+    }

-            match scheduling_policy {
-                SkSchedulingPolicy::Active => {
-                    locked
-                        .safekeeper_reconcilers
-                        .start_reconciler(node_id, self);
-                }
-                SkSchedulingPolicy::Decomissioned | SkSchedulingPolicy::Pause => {
-                    locked.safekeeper_reconcilers.stop_reconciler(node_id);
-                }
+    pub(crate) async fn set_safekeeper_scheduling_policy_in_mem(
+        self: &Arc<Service>,
+        node_id: NodeId,
+        scheduling_policy: SkSchedulingPolicy,
+    ) -> Result<(), DatabaseError> {
+        let mut locked = self.inner.write().unwrap();
+        let mut safekeepers = (*locked.safekeepers).clone();
+        let sk = safekeepers
+            .get_mut(&node_id)
+            .ok_or(DatabaseError::Logical("Not found".to_string()))?;
+        sk.set_scheduling_policy(scheduling_policy);
+
+        match scheduling_policy {
+            SkSchedulingPolicy::Active => {
+                locked
+                    .safekeeper_reconcilers
+                    .start_reconciler(node_id, self);
+            }
+            SkSchedulingPolicy::Decomissioned
+            | SkSchedulingPolicy::Pause
+            | SkSchedulingPolicy::Activating => {
+                locked.safekeeper_reconcilers.stop_reconciler(node_id);
            }
-
-            locked.safekeepers = Arc::new(safekeepers);
        }
+
+        locked.safekeepers = Arc::new(safekeepers);
        Ok(())
    }

--- a/storage_controller/src/service/tenant_shard_iterator.rs
+++ b/storage_controller/src/service/tenant_shard_iterator.rs
@@ -1,4 +1,5 @@
 use std::collections::BTreeMap;
+use std::sync::Arc;

 use utils::id::TenantId;
 use utils::shard::TenantShardId;
@@ -6,16 +7,21 @@ use utils::shard::TenantShardId;
 use crate::scheduler::{ScheduleContext, ScheduleMode};
 use crate::tenant_shard::TenantShard;

+use super::Service;
+
+/// Exclusive iterator over all tenant shards.
+/// It is used to iterate over consistent tenants state at specific point in time.
+///
 /// When making scheduling decisions, it is useful to have the ScheduleContext for a whole
 /// tenant while considering the individual shards within it.  This iterator is a helper
 /// that gathers all the shards in a tenant and then yields them together with a ScheduleContext
 /// for the tenant.
-pub(super) struct TenantShardContextIterator<'a> {
+pub(super) struct TenantShardExclusiveIterator<'a> {
    schedule_mode: ScheduleMode,
    inner: std::collections::btree_map::IterMut<'a, TenantShardId, TenantShard>,
 }

-impl<'a> TenantShardContextIterator<'a> {
+impl<'a> TenantShardExclusiveIterator<'a> {
    pub(super) fn new(
        tenants: &'a mut BTreeMap<TenantShardId, TenantShard>,
        schedule_mode: ScheduleMode,
@@ -27,7 +33,7 @@ impl<'a> TenantShardContextIterator<'a> {
    }
 }

-impl<'a> Iterator for TenantShardContextIterator<'a> {
+impl<'a> Iterator for TenantShardExclusiveIterator<'a> {
    type Item = (TenantId, ScheduleContext, Vec<&'a mut TenantShard>);

    fn next(&mut self) -> Option<Self::Item> {
@@ -52,13 +58,93 @@ impl<'a> Iterator for TenantShardContextIterator<'a> {
    }
 }

+/// Shared iterator over all tenant shards.
+/// It is used to iterate over all tenants without blocking another code, working with tenants
+///
+/// A simple iterator which can be used in tandem with [`crate::service::Service`]
+/// to iterate over all known tenant shard ids without holding the lock on the
+/// service state at all times.
+pub(crate) struct TenantShardSharedIterator<F> {
+    tenants_accessor: F,
+    inspected_all_shards: bool,
+    last_inspected_shard: Option<TenantShardId>,
+}
+
+impl<F> TenantShardSharedIterator<F>
+where
+    F: Fn(Option<TenantShardId>) -> Option<TenantShardId>,
+{
+    pub(crate) fn new(tenants_accessor: F) -> Self {
+        Self {
+            tenants_accessor,
+            inspected_all_shards: false,
+            last_inspected_shard: None,
+        }
+    }
+
+    pub(crate) fn finished(&self) -> bool {
+        self.inspected_all_shards
+    }
+}
+
+impl<F> Iterator for TenantShardSharedIterator<F>
+where
+    F: Fn(Option<TenantShardId>) -> Option<TenantShardId>,
+{
+    // TODO(ephemeralsad): consider adding schedule context to the iterator
+    type Item = TenantShardId;
+
+    /// Returns the next tenant shard id if one exists
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.inspected_all_shards {
+            return None;
+        }
+
+        match (self.tenants_accessor)(self.last_inspected_shard) {
+            Some(tid) => {
+                self.last_inspected_shard = Some(tid);
+                Some(tid)
+            }
+            None => {
+                self.inspected_all_shards = true;
+                None
+            }
+        }
+    }
+}
+
+pub(crate) fn create_shared_shard_iterator(
+    service: Arc<Service>,
+) -> TenantShardSharedIterator<impl Fn(Option<TenantShardId>) -> Option<TenantShardId>> {
+    let tenants_accessor = move |last_inspected_shard: Option<TenantShardId>| {
+        let locked = &service.inner.read().unwrap();
+        let tenants = &locked.tenants;
+        let entry = match last_inspected_shard {
+            Some(skip_past) => {
+                // Skip to the last seen tenant shard id
+                let mut cursor = tenants.iter().skip_while(|(tid, _)| **tid != skip_past);
+
+                // Skip past the last seen
+                cursor.nth(1)
+            }
+            None => tenants.first_key_value(),
+        };
+
+        entry.map(|(tid, _)| tid).copied()
+    };
+
+    TenantShardSharedIterator::new(tenants_accessor)
+}
+
 #[cfg(test)]
 mod tests {
    use std::collections::BTreeMap;
    use std::str::FromStr;
+    use std::sync::Arc;

    use pageserver_api::controller_api::PlacementPolicy;
-    use utils::shard::{ShardCount, ShardNumber};
+    use utils::id::TenantId;
+    use utils::shard::{ShardCount, ShardNumber, TenantShardId};

    use super::*;
    use crate::scheduler::test_utils::make_test_nodes;
@@ -66,7 +152,7 @@ mod tests {
    use crate::tenant_shard::tests::make_test_tenant_with_id;

    #[test]
-    fn test_context_iterator() {
+    fn test_exclusive_shard_iterator() {
        // Hand-crafted tenant IDs to ensure they appear in the expected order when put into
        // a btreemap & iterated
        let mut t_1_shards = make_test_tenant_with_id(
@@ -106,7 +192,7 @@ mod tests {
            shard.schedule(&mut scheduler, &mut context).unwrap();
        }

-        let mut iter = TenantShardContextIterator::new(&mut tenants, ScheduleMode::Speculative);
+        let mut iter = TenantShardExclusiveIterator::new(&mut tenants, ScheduleMode::Speculative);
        let (tenant_id, context, shards) = iter.next().unwrap();
        assert_eq!(tenant_id, t1_id);
        assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
@@ -132,4 +218,46 @@ mod tests {
            shard.intent.clear(&mut scheduler);
        }
    }
+
+    #[test]
+    fn test_shared_shard_iterator() {
+        let tenant_id = TenantId::generate();
+        let shard_count = ShardCount(8);
+
+        let mut tenant_shards = Vec::default();
+        for i in 0..shard_count.0 {
+            tenant_shards.push((
+                TenantShardId {
+                    tenant_id,
+                    shard_number: ShardNumber(i),
+                    shard_count,
+                },
+                (),
+            ))
+        }
+
+        let tenant_shards = Arc::new(tenant_shards);
+
+        let tid_iter = TenantShardSharedIterator::new({
+            let tenants = tenant_shards.clone();
+            move |last_inspected_shard: Option<TenantShardId>| {
+                let entry = match last_inspected_shard {
+                    Some(skip_past) => {
+                        let mut cursor = tenants.iter().skip_while(|(tid, _)| *tid != skip_past);
+                        cursor.nth(1)
+                    }
+                    None => tenants.first(),
+                };
+
+                entry.map(|(tid, _)| tid).copied()
+            }
+        });
+
+        let mut iterated_over = Vec::default();
+        for tid in tid_iter {
+            iterated_over.push((tid, ()));
+        }
+
+        assert_eq!(iterated_over, *tenant_shards);
+    }
 }
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -131,6 +131,15 @@ pub(crate) struct TenantShard {
    #[serde(serialize_with = "read_last_error")]
    pub(crate) last_error: std::sync::Arc<std::sync::Mutex<Option<Arc<ReconcileError>>>>,

+    /// Number of consecutive reconciliation errors that have occurred for this shard.
+    ///
+    /// When this count reaches MAX_CONSECUTIVE_RECONCILIATION_ERRORS, the tenant shard
+    /// will be countered as keep-failing in `reconcile_all` calculations. This will lead to
+    /// allowing optimizations to run even with some failing shards.
+    ///
+    /// The counter is reset to 0 after a successful reconciliation.
+    pub(crate) consecutive_errors_count: usize,
+
    /// If we have a pending compute notification that for some reason we weren't able to send,
    /// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes
    /// and trigger a Reconciler run.  This is the mechanism by which compute notifications are included in the scope
@@ -594,6 +603,7 @@ impl TenantShard {
            waiter: Arc::new(SeqWait::new(Sequence(0))),
            error_waiter: Arc::new(SeqWait::new(Sequence(0))),
            last_error: Arc::default(),
+            consecutive_errors_count: 0,
            pending_compute_notification: false,
            scheduling_policy: ShardSchedulingPolicy::default(),
            preferred_node: None,
@@ -1859,6 +1869,7 @@ impl TenantShard {
            waiter: Arc::new(SeqWait::new(Sequence::initial())),
            error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
            last_error: Arc::default(),
+            consecutive_errors_count: 0,
            pending_compute_notification: false,
            delayed_reconcile: false,
            scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),