storage controller: background reconcile, graceful shutdown, better logging (#6709)

## Problem

Now that the storage controller is working end to end, we can start burning
down the list of robustness issues.

## Summary of changes

- Add a background task that periodically calls `reconcile_all`. This
ensures that if earlier operations couldn't succeed (e.g. because a node
was unavailable), we will eventually retry. This is a naive initial
implementation that can start an unlimited number of reconcile tasks:
limiting reconcile concurrency is a later item in #6342. A sketch of the
loop is shown after this list.
- Add a number of tracing spans in key locations: each background task,
each reconciler task.
- Add a top-level CancellationToken and Gate, and use these to implement
a graceful shutdown that waits for tasks to shut down. This is not
bulletproof yet, because within these tasks we have remote HTTP calls
that aren't wrapped in cancellation/timeouts, but it creates the
structure, and if we don't shut down promptly then k8s will kill us.
See the shutdown sketch after this list.
- To protect shard splits from background reconciliation, expose the `SplitState`
in memory and use it to guard any APIs that require an attached tenant.
See the guard sketch after this list.
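
The periodic retry loop is small. Here is a minimal sketch, assuming a `Service`
that carries the top-level `CancellationToken` and exposes a `reconcile_all()`
method as in this change; the 20-second period matches the diff below, and
everything else is simplified for illustration:

```rust
use std::time::Duration;
use tokio_util::sync::CancellationToken;

struct Service {
    cancel: CancellationToken,
}

impl Service {
    fn reconcile_all(&self) {
        // In the real controller this walks all tenant shards and spawns a
        // Reconciler for any shard whose observed state differs from intent.
    }

    /// Long-running background task: wake up every period and retry any
    /// reconciliation that previously failed, until shutdown is requested.
    async fn background_reconcile(&self) {
        const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20);
        let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD);
        while !self.cancel.is_cancelled() {
            tokio::select! {
                _ = interval.tick() => { self.reconcile_all(); }
                _ = self.cancel.cancelled() => return,
            }
        }
    }
}
```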
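
The graceful-shutdown wiring is a two-step handshake: background tasks and
reconcilers take a guard from a shared `Gate` before doing work, and `shutdown()`
fires the `CancellationToken` and then waits for the gate to close. A hedged
sketch, assuming the `Gate` type from the repo's `utils::sync::gate` module
(names simplified):

```rust
use std::sync::Arc;
use tokio_util::sync::CancellationToken;
use utils::sync::gate::Gate;

struct Service {
    cancel: CancellationToken,
    gate: Gate,
}

impl Service {
    /// Background tasks take a gate guard before doing work so that shutdown
    /// can wait for them; if the gate is already closed, we are shutting down
    /// and should not start at all.
    fn spawn_background_task(this: Arc<Self>) {
        tokio::spawn(async move {
            let Ok(_gate_guard) = this.gate.enter() else {
                return;
            };
            // Placeholder for real work; anything long-running must poll
            // `this.cancel` so that shutdown is not blocked indefinitely.
            this.cancel.cancelled().await;
        });
    }

    async fn shutdown(&self) {
        // Step 1: tell every task and reconciler to stop.
        self.cancel.cancel();
        // Step 2: wait for all gate holders to drop their guards.
        self.gate.close().await;
    }
}
```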
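
The split guard itself is just a check across every shard of the tenant before
acting. A simplified sketch of that shape (the real check returns
`ApiError::ResourceUnavailable`, as in the diff below; the types here are pared
down for illustration):

```rust
/// Mirrors `persistence::split_state::SplitState` in the real code.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum SplitState {
    Idle,
    Splitting,
}

struct TenantShardState {
    splitting: SplitState,
}

/// Guard used by APIs that require an attached tenant: refuse to proceed while
/// any shard of the tenant is mid-split, so the split logic is not interfered
/// with by background reconciliation or timeline creation/deletion.
fn ensure_not_splitting(shards: &[TenantShardState]) -> Result<(), String> {
    for shard in shards {
        if !matches!(shard.splitting, SplitState::Idle) {
            return Err("Tenant shards are currently splitting".to_string());
        }
    }
    Ok(())
}
```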
Author: John Spray
Date: 2024-02-16 13:00:53 +00:00 (committed by GitHub)
Parent: 568bc1fde3
Commit: f2e5212fed
9 changed files with 370 additions and 156 deletions

View File

@@ -4,6 +4,11 @@ version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
# Enables test-only APIs and behaviors
testing = []
[dependencies]
anyhow.workspace = true
aws-config.workspace = true

View File

@@ -155,7 +155,7 @@ impl ComputeHook {
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,);
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
endpoint.reconfigure(compute_pageservers.clone()).await?;
}
}
@@ -177,7 +177,7 @@ impl ComputeHook {
req
};
tracing::debug!(
tracing::info!(
"Sending notify request to {} ({:?})",
url,
reconfigure_request
@@ -266,7 +266,7 @@ impl ComputeHook {
/// periods, but we don't retry forever. The **caller** is responsible for handling failures and
/// ensuring that they eventually call again to ensure that the compute is eventually notified of
/// the proper pageserver nodes for a tenant.
#[tracing::instrument(skip_all, fields(tenant_shard_id, node_id))]
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))]
pub(super) async fn notify(
&self,
tenant_shard_id: TenantShardId,
@@ -298,7 +298,7 @@ impl ComputeHook {
let Some(reconfigure_request) = reconfigure_request else {
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
// until it does.
tracing::debug!("Tenant isn't yet ready to emit a notification",);
tracing::info!("Tenant isn't yet ready to emit a notification");
return Ok(());
};

View File

@@ -37,6 +37,12 @@ impl std::fmt::Display for Sequence {
}
}
impl std::fmt::Debug for Sequence {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
impl MonotonicCounter<Sequence> for Sequence {
fn cnt_advance(&mut self, v: Sequence) {
assert!(*self <= v);

View File

@@ -15,6 +15,7 @@ use diesel::Connection;
use metrics::launch_timestamp::LaunchTimestamp;
use std::sync::Arc;
use tokio::signal::unix::SignalKind;
use tokio_util::sync::CancellationToken;
use utils::auth::{JwtAuth, SwappableJwtAuth};
use utils::logging::{self, LogFormat};
@@ -237,15 +238,23 @@ async fn async_main() -> anyhow::Result<()> {
let auth = secrets
.public_key
.map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
let router = make_router(service, auth)
let router = make_router(service.clone(), auth)
.build()
.map_err(|err| anyhow!(err))?;
let router_service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?.serve(router_service);
// Start HTTP server
let server_shutdown = CancellationToken::new();
let server = hyper::Server::from_tcp(http_listener)?
.serve(router_service)
.with_graceful_shutdown({
let server_shutdown = server_shutdown.clone();
async move {
server_shutdown.cancelled().await;
}
});
tracing::info!("Serving on {0}", args.listen);
tokio::task::spawn(server);
let server_task = tokio::task::spawn(server);
// Wait until we receive a signal
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
@@ -266,5 +275,16 @@ async fn async_main() -> anyhow::Result<()> {
}
}
// Stop HTTP server first, so that we don't have to service requests
// while shutting down Service
server_shutdown.cancel();
if let Err(e) = server_task.await {
tracing::error!("Error joining HTTP server task: {e}")
}
tracing::info!("Joined HTTP server task");
service.shutdown().await;
tracing::info!("Service shutdown complete");
std::process::exit(0);
}

View File

@@ -13,6 +13,7 @@ use tokio_util::sync::CancellationToken;
use utils::generation::Generation;
use utils::id::{NodeId, TimelineId};
use utils::lsn::Lsn;
use utils::sync::gate::GateGuard;
use crate::compute_hook::{ComputeHook, NotifyError};
use crate::node::Node;
@@ -53,6 +54,10 @@ pub(super) struct Reconciler {
/// the tenant is changed.
pub(crate) cancel: CancellationToken,
/// Reconcilers are registered with a Gate so that during a graceful shutdown we
/// can wait for all the reconcilers to respond to their cancellation tokens.
pub(crate) _gate_guard: GateGuard,
/// Access to persistent storage for updating generation numbers
pub(crate) persistence: Arc<Persistence>,
}

View File

@@ -77,12 +77,11 @@ impl Scheduler {
return Err(ScheduleError::ImpossibleConstraint);
}
for (node_id, count) in &tenant_counts {
tracing::info!("tenant_counts[{node_id}]={count}");
}
let node_id = tenant_counts.first().unwrap().0;
tracing::info!("scheduler selected node {node_id}");
tracing::info!(
"scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
);
*self.tenant_counts.get_mut(&node_id).unwrap() += 1;
Ok(node_id)
}

View File

@@ -30,6 +30,7 @@ use pageserver_api::{
};
use pageserver_client::mgmt_api;
use tokio_util::sync::CancellationToken;
use tracing::instrument;
use utils::{
backoff,
completion::Barrier,
@@ -37,6 +38,7 @@ use utils::{
http::error::ApiError,
id::{NodeId, TenantId, TimelineId},
seqwait::SeqWait,
sync::gate::Gate,
};
use crate::{
@@ -124,6 +126,12 @@ pub struct Service {
config: Config,
persistence: Arc<Persistence>,
// Process shutdown will fire this token
cancel: CancellationToken,
// Background tasks will hold this gate
gate: Gate,
/// This waits for initial reconciliation with pageservers to complete. Until this barrier
/// passes, it isn't safe to do any actions that mutate tenants.
pub(crate) startup_complete: Barrier,
@@ -144,8 +152,9 @@ impl Service {
&self.config
}
/// TODO: don't allow other API calls until this is done, don't start doing any background housekeeping
/// until this is done.
/// Called once on startup, this function attempts to contact all pageservers to build an up-to-date
/// view of the world, and determine which pageservers are responsive.
#[instrument(skip_all)]
async fn startup_reconcile(&self) {
// For all tenant shards, a vector of observed states on nodes (where None means
// indeterminate, same as in [`ObservedStateLocation`])
@@ -153,9 +162,6 @@ impl Service {
let mut nodes_online = HashSet::new();
// TODO: give Service a cancellation token for clean shutdown
let cancel = CancellationToken::new();
// TODO: issue these requests concurrently
{
let nodes = {
@@ -190,7 +196,7 @@ impl Service {
1,
5,
"Location config listing",
&cancel,
&self.cancel,
)
.await;
let Some(list_response) = list_response else {
@@ -331,7 +337,7 @@ impl Service {
let stream = futures::stream::iter(compute_notifications.into_iter())
.map(|(tenant_shard_id, node_id)| {
let compute_hook = compute_hook.clone();
let cancel = cancel.clone();
let cancel = self.cancel.clone();
async move {
if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await {
tracing::error!(
@@ -368,8 +374,98 @@ impl Service {
tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
}
/// Long running background task that periodically wakes up and looks for shards that need
/// reconciliation. Reconciliation is fallible, so any reconciliation tasks that fail during
/// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible
/// for those retries.
#[instrument(skip_all)]
async fn background_reconcile(&self) {
self.startup_complete.clone().wait().await;
const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20);
let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD);
while !self.cancel.is_cancelled() {
tokio::select! {
_ = interval.tick() => { self.reconcile_all(); }
_ = self.cancel.cancelled() => return
}
}
}
#[instrument(skip_all)]
async fn process_results(
&self,
mut result_rx: tokio::sync::mpsc::UnboundedReceiver<ReconcileResult>,
) {
loop {
// Wait for the next result, or for cancellation
let result = tokio::select! {
r = result_rx.recv() => {
match r {
Some(result) => {result},
None => {break;}
}
}
_ = self.cancel.cancelled() => {
break;
}
};
tracing::info!(
"Reconcile result for sequence {}, ok={}",
result.sequence,
result.result.is_ok()
);
let mut locked = self.inner.write().unwrap();
let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else {
// A reconciliation result might race with removing a tenant: drop results for
// tenants that aren't in our map.
continue;
};
// Usually generation should only be updated via this path, so the max() isn't
// needed, but it is used to handle out-of-band updates via. e.g. test hook.
tenant.generation = std::cmp::max(tenant.generation, result.generation);
// If the reconciler signals that it failed to notify compute, set this state on
// the shard so that a future [`TenantState::maybe_reconcile`] will try again.
tenant.pending_compute_notification = result.pending_compute_notification;
match result.result {
Ok(()) => {
for (node_id, loc) in &result.observed.locations {
if let Some(conf) = &loc.conf {
tracing::info!("Updating observed location {}: {:?}", node_id, conf);
} else {
tracing::info!("Setting observed location {} to None", node_id,)
}
}
tenant.observed = result.observed;
tenant.waiter.advance(result.sequence);
}
Err(e) => {
tracing::warn!(
"Reconcile error on tenant {}: {}",
tenant.tenant_shard_id,
e
);
// Ordering: populate last_error before advancing error_seq,
// so that waiters will see the correct error after waiting.
*(tenant.last_error.lock().unwrap()) = format!("{e}");
tenant.error_waiter.advance(result.sequence);
for (node_id, o) in result.observed.locations {
tenant.observed.locations.insert(node_id, o);
}
}
}
}
}
pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
tracing::info!("Loading nodes from database...");
let nodes = persistence.list_nodes().await?;
@@ -418,6 +514,7 @@ impl Service {
observed: ObservedState::new(),
config: serde_json::from_str(&tsp.config).unwrap(),
reconciler: None,
splitting: tsp.splitting,
waiter: Arc::new(SeqWait::new(Sequence::initial())),
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
last_error: Arc::default(),
@@ -439,73 +536,35 @@ impl Service {
config,
persistence,
startup_complete: startup_complete.clone(),
cancel: CancellationToken::new(),
gate: Gate::default(),
});
let result_task_this = this.clone();
tokio::task::spawn(async move {
while let Some(result) = result_rx.recv().await {
tracing::info!(
"Reconcile result for sequence {}, ok={}",
result.sequence,
result.result.is_ok()
);
let mut locked = result_task_this.inner.write().unwrap();
let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else {
// A reconciliation result might race with removing a tenant: drop results for
// tenants that aren't in our map.
continue;
};
// Usually generation should only be updated via this path, so the max() isn't
// needed, but it is used to handle out-of-band updates via. e.g. test hook.
tenant.generation = std::cmp::max(tenant.generation, result.generation);
// If the reconciler signals that it failed to notify compute, set this state on
// the shard so that a future [`TenantState::maybe_reconcile`] will try again.
tenant.pending_compute_notification = result.pending_compute_notification;
match result.result {
Ok(()) => {
for (node_id, loc) in &result.observed.locations {
if let Some(conf) = &loc.conf {
tracing::info!(
"Updating observed location {}: {:?}",
node_id,
conf
);
} else {
tracing::info!("Setting observed location {} to None", node_id,)
}
}
tenant.observed = result.observed;
tenant.waiter.advance(result.sequence);
}
Err(e) => {
tracing::warn!(
"Reconcile error on tenant {}: {}",
tenant.tenant_shard_id,
e
);
// Ordering: populate last_error before advancing error_seq,
// so that waiters will see the correct error after waiting.
*(tenant.last_error.lock().unwrap()) = format!("{e}");
tenant.error_waiter.advance(result.sequence);
for (node_id, o) in result.observed.locations {
tenant.observed.locations.insert(node_id, o);
}
}
}
// Block shutdown until we're done (we must respect self.cancel)
if let Ok(_gate) = result_task_this.gate.enter() {
result_task_this.process_results(result_rx).await
}
});
let startup_reconcile_this = this.clone();
tokio::task::spawn(async move {
// Block the [`Service::startup_complete`] barrier until we're done
let _completion = startup_completion;
tokio::task::spawn({
let this = this.clone();
// We will block the [`Service::startup_complete`] barrier until [`Self::startup_reconcile`]
// is done.
let startup_completion = startup_completion.clone();
async move {
// Block shutdown until we're done (we must respect self.cancel)
let Ok(_gate) = this.gate.enter() else {
return;
};
startup_reconcile_this.startup_reconcile().await
this.startup_reconcile().await;
drop(startup_completion);
this.background_reconcile().await;
}
});
Ok(this)
@@ -620,6 +679,28 @@ impl Service {
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
);
// Trick the reconciler into not doing anything for this tenant: this helps
// tests that manually configure a tenant on the pageserver, and then call this
// attach hook: they don't want background reconciliation to modify what they
// did to the pageserver.
#[cfg(feature = "testing")]
{
if let Some(node_id) = attach_req.node_id {
tenant_state.observed.locations = HashMap::from([(
node_id,
ObservedStateLocation {
conf: Some(attached_location_conf(
tenant_state.generation,
&tenant_state.shard,
&tenant_state.config,
)),
},
)]);
} else {
tenant_state.observed.locations.clear();
}
}
Ok(AttachHookResponse {
gen: attach_req
.node_id
@@ -868,6 +949,8 @@ impl Service {
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
)
})
.collect::<Vec<_>>();
@@ -970,6 +1053,8 @@ impl Service {
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
);
if let Some(waiter) = maybe_waiter {
waiters.push(waiter);
@@ -1059,6 +1144,8 @@ impl Service {
}
pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result<StatusCode, ApiError> {
self.ensure_attached_wait(tenant_id).await?;
// TODO: refactor into helper
let targets = {
let locked = self.inner.read().unwrap();
@@ -1080,8 +1167,6 @@ impl Service {
targets
};
// TODO: error out if the tenant is not attached anywhere.
// Phase 1: delete on the pageservers
let mut any_pending = false;
for (tenant_shard_id, node) in targets {
@@ -1417,9 +1502,6 @@ impl Service {
let mut policy = None;
let mut shard_ident = None;
// TODO: put a cancellation token on Service for clean shutdown
let cancel = CancellationToken::new();
// A parent shard which will be split
struct SplitTarget {
parent_id: TenantShardId,
@@ -1591,6 +1673,18 @@ impl Service {
}
}
// Now that I have persisted the splitting state, apply it in-memory. This is infallible, so
// callers may assume that if splitting is set in memory, then it was persisted, and if splitting
// is not set in memory, then it was not persisted.
{
let mut locked = self.inner.write().unwrap();
for target in &targets {
if let Some(parent_shard) = locked.tenants.get_mut(&target.parent_id) {
parent_shard.splitting = SplitState::Splitting;
}
}
}
// FIXME: we have now committed the shard split state to the database, so any subsequent
// failure needs to roll it back. We will later wrap this function in logic to roll back
// the split if it fails.
@@ -1650,7 +1744,7 @@ impl Service {
.complete_shard_split(tenant_id, old_shard_count)
.await?;
// Replace all the shards we just split with their children
// Replace all the shards we just split with their children: this phase is infallible.
let mut response = TenantShardSplitResponse {
new_shards: Vec::new(),
};
@@ -1698,6 +1792,10 @@ impl Service {
child_state.generation = generation;
child_state.config = config.clone();
// The child's TenantState::splitting is intentionally left at the default value of Idle,
// as at this point in the split process we have succeeded and this part is infallible:
// we will never need to do any special recovery from this state.
child_locations.push((child, pageserver));
locked.tenants.insert(child, child_state);
@@ -1709,7 +1807,7 @@ impl Service {
// Send compute notifications for all the new shards
let mut failed_notifications = Vec::new();
for (child_id, child_ps) in child_locations {
if let Err(e) = compute_hook.notify(child_id, child_ps, &cancel).await {
if let Err(e) = compute_hook.notify(child_id, child_ps, &self.cancel).await {
tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})",
child_id, child_ps);
failed_notifications.push(child_id);
@@ -1785,6 +1883,8 @@ impl Service {
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
)
};
@@ -1986,6 +2086,8 @@ impl Service {
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
);
}
}
@@ -2007,6 +2109,8 @@ impl Service {
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
);
}
}
@@ -2046,6 +2150,8 @@ impl Service {
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
) {
waiters.push(waiter);
}
@@ -2057,6 +2163,17 @@ impl Service {
let ensure_waiters = {
let locked = self.inner.write().unwrap();
// Check if the tenant is splitting: in this case, even if it is attached,
// we must act as if it is not: this blocks e.g. timeline creation/deletion
// operations during the split.
for (_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) {
if !matches!(shard.splitting, SplitState::Idle) {
return Err(ApiError::ResourceUnavailable(
"Tenant shards are currently splitting".into(),
));
}
}
self.ensure_attached_schedule(locked, tenant_id)
.map_err(ApiError::InternalServerError)?
};
@@ -2088,8 +2205,25 @@ impl Service {
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
)
})
.count()
}
pub async fn shutdown(&self) {
// Note that this already stops processing any results from reconciles: so
// we do not expect that our [`TenantState`] objects will reach a neat
// final state.
self.cancel.cancel();
// The cancellation tokens in [`crate::reconciler::Reconciler`] are children
// of our cancellation token, so we do not need to explicitly cancel each of
// them.
// Background tasks and reconcilers hold gate guards: this waits for them all
// to complete.
self.gate.close().await;
}
}

View File

@@ -7,16 +7,18 @@ use pageserver_api::{
};
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use tracing::{instrument, Instrument};
use utils::{
generation::Generation,
id::NodeId,
seqwait::{SeqWait, SeqWaitError},
sync::gate::Gate,
};
use crate::{
compute_hook::ComputeHook,
node::Node,
persistence::Persistence,
persistence::{split_state::SplitState, Persistence},
reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
scheduler::{ScheduleError, Scheduler},
service, PlacementPolicy, Sequence,
@@ -58,6 +60,11 @@ pub(crate) struct TenantState {
/// cancellation token has been fired)
pub(crate) reconciler: Option<ReconcilerHandle>,
/// If a tenant is being split, then all shards with that TenantId will have a
/// SplitState set, this acts as a guard against other operations such as background
/// reconciliation, and timeline creation.
pub(crate) splitting: SplitState,
/// Optionally wait for reconciliation to complete up to a particular
/// sequence number.
pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
@@ -238,6 +245,7 @@ impl TenantState {
observed: ObservedState::default(),
config: TenantConfig::default(),
reconciler: None,
splitting: SplitState::Idle,
sequence: Sequence(1),
waiter: Arc::new(SeqWait::new(Sequence(0))),
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
@@ -415,6 +423,8 @@ impl TenantState {
false
}
#[allow(clippy::too_many_arguments)]
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub(crate) fn maybe_reconcile(
&mut self,
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
@@ -422,6 +432,8 @@ impl TenantState {
compute_hook: &Arc<ComputeHook>,
service_config: &service::Config,
persistence: &Arc<Persistence>,
gate: &Gate,
cancel: &CancellationToken,
) -> Option<ReconcilerWaiter> {
// If there are any ambiguous observed states, and the nodes they refer to are available,
// we should reconcile to clean them up.
@@ -443,6 +455,14 @@ impl TenantState {
return None;
}
// If we are currently splitting, then never start a reconciler task: the splitting logic
// requires that shards are not interfered with while it runs. Do this check here rather than
// up top, so that we only log this message if we would otherwise have done a reconciliation.
if !matches!(self.splitting, SplitState::Idle) {
tracing::info!("Refusing to reconcile, splitting in progress");
return None;
}
// Reconcile already in flight for the current sequence?
if let Some(handle) = &self.reconciler {
if handle.sequence == self.sequence {
@@ -460,7 +480,12 @@ impl TenantState {
// doing our sequence's work.
let old_handle = self.reconciler.take();
let cancel = CancellationToken::new();
let Ok(gate_guard) = gate.enter() else {
// Shutting down, don't start a reconciler
return None;
};
let reconciler_cancel = cancel.child_token();
let mut reconciler = Reconciler {
tenant_shard_id: self.tenant_shard_id,
shard: self.shard,
@@ -471,59 +496,66 @@ impl TenantState {
pageservers: pageservers.clone(),
compute_hook: compute_hook.clone(),
service_config: service_config.clone(),
cancel: cancel.clone(),
_gate_guard: gate_guard,
cancel: reconciler_cancel.clone(),
persistence: persistence.clone(),
compute_notify_failure: false,
};
let reconcile_seq = self.sequence;
tracing::info!("Spawning Reconciler for sequence {}", self.sequence);
tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence);
let must_notify = self.pending_compute_notification;
let join_handle = tokio::task::spawn(async move {
// Wait for any previous reconcile task to complete before we start
if let Some(old_handle) = old_handle {
old_handle.cancel.cancel();
if let Err(e) = old_handle.handle.await {
// We can't do much with this other than log it: the task is done, so
// we may proceed with our work.
tracing::error!("Unexpected join error waiting for reconcile task: {e}");
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
tenant_id=%reconciler.tenant_shard_id.tenant_id,
shard_id=%reconciler.tenant_shard_id.shard_slug());
let join_handle = tokio::task::spawn(
async move {
// Wait for any previous reconcile task to complete before we start
if let Some(old_handle) = old_handle {
old_handle.cancel.cancel();
if let Err(e) = old_handle.handle.await {
// We can't do much with this other than log it: the task is done, so
// we may proceed with our work.
tracing::error!("Unexpected join error waiting for reconcile task: {e}");
}
}
// Early check for cancellation before doing any work
// TODO: wrap all remote API operations in cancellation check
// as well.
if reconciler.cancel.is_cancelled() {
return;
}
// Attempt to make observed state match intent state
let result = reconciler.reconcile().await;
// If we know we had a pending compute notification from some previous action, send a notification irrespective
// of whether the above reconcile() did any work
if result.is_ok() && must_notify {
// If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
reconciler.compute_notify().await.ok();
}
result_tx
.send(ReconcileResult {
sequence: reconcile_seq,
result,
tenant_shard_id: reconciler.tenant_shard_id,
generation: reconciler.generation,
observed: reconciler.observed,
pending_compute_notification: reconciler.compute_notify_failure,
})
.ok();
}
// Early check for cancellation before doing any work
// TODO: wrap all remote API operations in cancellation check
// as well.
if reconciler.cancel.is_cancelled() {
return;
}
// Attempt to make observed state match intent state
let result = reconciler.reconcile().await;
// If we know we had a pending compute notification from some previous action, send a notification irrespective
// of whether the above reconcile() did any work
if result.is_ok() && must_notify {
// If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
reconciler.compute_notify().await.ok();
}
result_tx
.send(ReconcileResult {
sequence: reconcile_seq,
result,
tenant_shard_id: reconciler.tenant_shard_id,
generation: reconciler.generation,
observed: reconciler.observed,
pending_compute_notification: reconciler.compute_notify_failure,
})
.ok();
});
.instrument(reconciler_span),
);
self.reconciler = Some(ReconcilerHandle {
sequence: self.sequence,
handle: join_handle,
cancel,
cancel: reconciler_cancel,
});
Some(ReconcilerWaiter {

View File

@@ -20,6 +20,7 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
NeonPageserver,
PgBin,
S3Scrubber,
last_flush_lsn_upload,
@@ -62,7 +63,7 @@ def generate_uploads_and_deletions(
tenant_id: Optional[TenantId] = None,
timeline_id: Optional[TimelineId] = None,
data: Optional[str] = None,
pageserver_id: Optional[int] = None,
pageserver: NeonPageserver,
):
"""
Using the environment's default tenant + timeline, generate a load pattern
@@ -77,14 +78,16 @@ def generate_uploads_and_deletions(
timeline_id = env.initial_timeline
assert timeline_id is not None
ps_http = env.pageserver.http_client()
ps_http = pageserver.http_client()
with env.endpoints.create_start(
"main", tenant_id=tenant_id, pageserver_id=pageserver_id
"main", tenant_id=tenant_id, pageserver_id=pageserver.id
) as endpoint:
if init:
endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
last_flush_lsn_upload(
env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
)
def churn(data):
endpoint.safe_psql_many(
@@ -105,7 +108,9 @@ def generate_uploads_and_deletions(
# We are waiting for uploads as well as local flush, in order to avoid leaving the system
# in a state where there are "future layers" in remote storage that will generate deletions
# after a restart.
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
last_flush_lsn_upload(
env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
)
ps_http.timeline_checkpoint(tenant_id, timeline_id)
# Compaction should generate some GC-eligible layers
@@ -205,7 +210,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
env.neon_cli.create_tenant(
tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline
)
generate_uploads_and_deletions(env, pageserver_id=env.pageserver.id)
generate_uploads_and_deletions(env, pageserver=env.pageserver)
def parse_generation_suffix(key):
m = re.match(".+-([0-9a-zA-Z]{8})$", key)
@@ -233,7 +238,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
# Starting without the override that disabled control_plane_api
env.pageserver.start()
generate_uploads_and_deletions(env, pageserver_id=env.pageserver.id, init=False)
generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False)
legacy_objects: list[str] = []
suffixed_objects = []
@@ -277,13 +282,16 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_pageserver_remote_storage(
RemoteStorageKind.MOCK_S3,
)
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
some_other_pageserver = 1234
attached_to_id = env.attachment_service.locate(env.initial_tenant)[0]["node_id"]
main_pageserver = env.get_pageserver(attached_to_id)
other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0]
ps_http = env.pageserver.http_client()
ps_http = main_pageserver.http_client()
generate_uploads_and_deletions(env)
generate_uploads_and_deletions(env, pageserver=main_pageserver)
# Flush: pending deletions should all complete
assert_deletion_queue(ps_http, lambda n: n > 0)
@@ -296,14 +304,14 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"]
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
env.pageserver.allowed_errors.extend(
main_pageserver.allowed_errors.extend(
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
)
# Now advance the generation in the control plane: subsequent validations
# from the running pageserver will fail. No more deletions should happen.
env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver)
generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id)
env.attachment_service.attach_hook_issue(env.initial_tenant, other_pageserver.id)
generate_uploads_and_deletions(env, init=False, pageserver=main_pageserver)
assert_deletion_queue(ps_http, lambda n: n > 0)
queue_depth_before = get_deletion_queue_depth(ps_http)
@@ -355,9 +363,14 @@ def test_deletion_queue_recovery(
neon_env_builder.enable_pageserver_remote_storage(
RemoteStorageKind.MOCK_S3,
)
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
ps_http = env.pageserver.http_client()
attached_to_id = env.attachment_service.locate(env.initial_tenant)[0]["node_id"]
main_pageserver = env.get_pageserver(attached_to_id)
other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0]
ps_http = main_pageserver.http_client()
failpoints = [
# Prevent deletion lists from being executed, to build up some backlog of deletions
@@ -374,7 +387,7 @@ def test_deletion_queue_recovery(
ps_http.configure_failpoints(failpoints)
generate_uploads_and_deletions(env)
generate_uploads_and_deletions(env, pageserver=main_pageserver)
# There should be entries in the deletion queue
assert_deletion_queue(ps_http, lambda n: n > 0)
@@ -401,7 +414,7 @@ def test_deletion_queue_recovery(
# also wait to see the header hit the disk: this seems paranoid but the race
# can really happen on a heavily overloaded test machine.
def assert_header_written():
assert (env.pageserver.workdir / "deletion" / "header-01").exists()
assert (main_pageserver.workdir / "deletion" / "header-01").exists()
wait_until(20, 1, assert_header_written)
@@ -411,13 +424,13 @@ def test_deletion_queue_recovery(
before_restart_depth = get_deletion_queue_validated(ps_http)
log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
env.pageserver.stop(immediate=True)
main_pageserver.stop(immediate=True)
if keep_attachment == KeepAttachment.LOSE:
some_other_pageserver = 101010
some_other_pageserver = other_pageserver.id
env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver)
env.pageserver.start()
main_pageserver.start()
def assert_deletions_submitted(n: int):
assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n
@@ -440,7 +453,7 @@ def test_deletion_queue_recovery(
# validated before restart.
assert get_deletion_queue_executed(ps_http) == before_restart_depth
else:
env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
main_pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
# If we lost the attachment, we should have dropped our pre-restart deletions.
assert get_deletion_queue_dropped(ps_http) == before_restart_depth
@@ -449,8 +462,8 @@ def test_deletion_queue_recovery(
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
# Restart again
env.pageserver.stop(immediate=True)
env.pageserver.start()
main_pageserver.stop(immediate=True)
main_pageserver.start()
# No deletion lists should be recovered: this demonstrates that deletion lists
# were cleaned up after being executed or dropped in the previous process lifetime.
@@ -469,7 +482,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
ps_http = env.pageserver.http_client()
generate_uploads_and_deletions(env, pageserver_id=env.pageserver.id)
generate_uploads_and_deletions(env, pageserver=env.pageserver)
env.pageserver.allowed_errors.extend(
[
@@ -486,7 +499,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
# Remember how many validations had happened before the control plane went offline
validated = get_deletion_queue_validated(ps_http)
generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id)
generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver)
# The running pageserver should stop progressing deletions
time.sleep(10)
@@ -502,7 +515,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
)
# The pageserver should provide service to clients
generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id)
generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver)
# The pageserver should neither validate nor execute any deletions, it should have
# loaded the DeletionLists from before though
@@ -523,7 +536,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
env.pageserver.start()
generate_uploads_and_deletions(env, init=False, pageserver_id=env.pageserver.id)
generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver)
ps_http.deletion_queue_flush(execute=True)
assert get_deletion_queue_depth(ps_http) == 0
assert get_deletion_queue_validated(ps_http) > 0
@@ -561,7 +574,7 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder):
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
generate_uploads_and_deletions(env)
generate_uploads_and_deletions(env, pageserver=env.pageserver)
read_all(env, tenant_id, timeline_id)
evict_all_layers(env, tenant_id, timeline_id)