Compare commits

...

19 Commits

Author SHA1 Message Date
John Spray 151815c2d3 tests: failure cases for shard splitting 2024-03-08 19:16:12 +00:00
John Spray b04cf3ea8b neon_local: timeout on endpoint reconfigures 2024-03-08 19:16:12 +00:00
John Spray 3d228968ef control_plane: failpoints HTTP endpoint 2024-03-08 19:16:12 +00:00
John Spray f812904eff pageserver: add shard splitting failpoints 2024-03-08 19:16:12 +00:00
John Spray 8a3b1e0876 storage controller: reconcile nodes on activation 2024-03-08 19:16:12 +00:00
John Spray ef994de66b storage controller: add node locks 2024-03-08 19:16:12 +00:00
John Spray 665cb8c398 Reconciler: fix maybe_refresh_observed for 404 case 2024-03-08 19:16:12 +00:00
John Spray 1f9e32734e pageserver: fix location_config getter 2024-03-08 19:16:12 +00:00
John Spray 0f0606da3c pageserver: fix api-5xx failpoints 2024-03-08 19:16:12 +00:00
John Spray 9bdbe2d630 storage controller: error handling for shard split 2024-03-08 19:16:12 +00:00
John Spray acd397a5f4 storage controller: add machinery for per-tenant locks 2024-03-08 17:18:06 +00:00
John Spray 25bd74fd6a tests: controller rolling failure test 2024-03-08 17:18:06 +00:00
John Spray 223810fd79 control_plane: revise compute_hook locking (don't serialize all calls) 2024-03-08 17:18:06 +00:00
John Spray 2baac6f6e6 neon_local: handle 503 in pageserver /status 2024-03-08 17:18:06 +00:00
John Spray 27815678e7 control_plane: basic balancer API 2024-03-08 17:18:06 +00:00
John Spray 59523444cc pageserver: add generic failpoint for HTTP API 503 2024-03-08 17:18:06 +00:00
John Spray 7a1f83854f control_plane: enable heatmap uploads for shards with secondary locations 2024-03-08 17:18:06 +00:00
John Spray da22557383 pageserver: fix leaving tenant in bad state on split failure 2024-03-08 17:18:06 +00:00
John Spray 7ee3b59ba8 tests: mention pageserver Id in log complaints 2024-03-08 17:18:06 +00:00
18 changed files with 1657 additions and 151 deletions

Cargo.lock generated
View File

@@ -282,8 +282,10 @@ dependencies = [
"control_plane",
"diesel",
"diesel_migrations",
"fail",
"futures",
"git-version",
"hex",
"humantime",
"hyper",
"metrics",

View File

@@ -19,8 +19,10 @@ aws-config.workspace = true
aws-sdk-secretsmanager.workspace = true
camino.workspace = true
clap.workspace = true
fail.workspace = true
futures.workspace = true
git-version.workspace = true
hex.workspace = true
hyper.workspace = true
humantime.workspace = true
once_cell.workspace = true

View File

@@ -1,3 +1,4 @@
use std::sync::Arc;
use std::{collections::HashMap, time::Duration};
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
@@ -23,10 +24,13 @@ struct ShardedComputeHookTenant {
stripe_size: ShardStripeSize,
shard_count: ShardCount,
shards: Vec<(ShardNumber, NodeId)>,
// Async lock used for ensuring that remote compute hook calls are ordered identically to updates to this structure
lock: Arc<tokio::sync::Mutex<()>>,
}
enum ComputeHookTenant {
Unsharded(NodeId),
Unsharded((NodeId, Arc<tokio::sync::Mutex<()>>)),
Sharded(ShardedComputeHookTenant),
}
@@ -38,9 +42,17 @@ impl ComputeHookTenant {
shards: vec![(tenant_shard_id.shard_number, node_id)],
stripe_size,
shard_count: tenant_shard_id.shard_count,
lock: Arc::default(),
})
} else {
Self::Unsharded(node_id)
Self::Unsharded((node_id, Arc::default()))
}
}
fn get_lock(&self) -> &Arc<tokio::sync::Mutex<()>> {
match self {
Self::Unsharded((_node_id, lock)) => lock,
Self::Sharded(sharded_tenant) => &sharded_tenant.lock,
}
}
@@ -53,7 +65,9 @@ impl ComputeHookTenant {
node_id: NodeId,
) {
match self {
Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
Self::Unsharded((existing_node_id, _lock))
if tenant_shard_id.shard_count.count() == 1 =>
{
*existing_node_id = node_id
}
Self::Sharded(sharded_tenant)
@@ -122,9 +136,15 @@ pub(crate) enum NotifyError {
}
impl ComputeHookTenant {
fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
match self {
Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
fn maybe_reconfigure(
&self,
tenant_id: TenantId,
) -> Option<(
ComputeHookNotifyRequest,
impl std::future::Future<Output = tokio::sync::OwnedMutexGuard<()>>,
)> {
let request = match self {
Self::Unsharded((node_id, _lock)) => Some(ComputeHookNotifyRequest {
tenant_id,
shards: vec![ComputeHookNotifyRequestShard {
shard_number: ShardNumber(0),
@@ -158,7 +178,9 @@ impl ComputeHookTenant {
);
None
}
}
};
request.map(|r| (r, self.get_lock().clone().lock_owned()))
}
}
@@ -167,8 +189,11 @@ impl ComputeHookTenant {
/// the compute connection string.
pub(super) struct ComputeHook {
config: Config,
state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
state: std::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
authorization_header: Option<String>,
// This lock is only used in testing environments, to serialize calls into neon_local
neon_local_lock: tokio::sync::Mutex<()>,
}
impl ComputeHook {
@@ -182,6 +207,7 @@ impl ComputeHook {
state: Default::default(),
config,
authorization_header,
neon_local_lock: Default::default(),
}
}
@@ -190,6 +216,10 @@ impl ComputeHook {
&self,
reconfigure_request: ComputeHookNotifyRequest,
) -> anyhow::Result<()> {
// neon_local updates are not safe to call concurrently, use a lock to serialize
// all calls to this function
let _locked = self.neon_local_lock.lock().await;
let env = match LocalEnv::load_config() {
Ok(e) => e,
Err(e) => {
@@ -340,30 +370,38 @@ impl ComputeHook {
stripe_size: ShardStripeSize,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let mut locked = self.state.lock().await;
let reconfigure_request = {
let mut locked = self.state.lock().unwrap();
use std::collections::hash_map::Entry;
let tenant = match locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
tenant_shard_id,
stripe_size,
node_id,
)),
Entry::Occupied(e) => {
let tenant = e.into_mut();
tenant.update(tenant_shard_id, stripe_size, node_id);
tenant
}
use std::collections::hash_map::Entry;
let tenant = match locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
tenant_shard_id,
stripe_size,
node_id,
)),
Entry::Occupied(e) => {
let tenant = e.into_mut();
tenant.update(tenant_shard_id, stripe_size, node_id);
tenant
}
};
tenant.maybe_reconfigure(tenant_shard_id.tenant_id)
};
let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
let Some(reconfigure_request) = reconfigure_request else {
let Some((reconfigure_request, lock_fut)) = reconfigure_request else {
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
// until it does.
tracing::info!("Tenant isn't yet ready to emit a notification");
return Ok(());
};
// Finish acquiring the tenant's async lock: this future was created inside the self.state
// lock above, so we are guaranteed to get this lock in the same order as callers took
// that lock. This ordering is essential: the cloud control plane must end up with the
// same end state for the tenant that we see.
let _guard = lock_fut.await;
if let Some(notify_url) = &self.config.compute_hook_url {
self.do_notify(notify_url, reconfigure_request, cancel)
.await
@@ -405,6 +443,7 @@ pub(crate) mod tests {
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.0
.shards
.len(),
1
@@ -412,6 +451,7 @@ pub(crate) mod tests {
assert!(tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.0
.stripe_size
.is_none());
@@ -445,6 +485,7 @@ pub(crate) mod tests {
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.0
.shards
.len(),
2
@@ -453,6 +494,7 @@ pub(crate) mod tests {
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.0
.stripe_size,
Some(ShardStripeSize(32768))
);
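
The ordering comment above is the key subtlety in this file: maybe_reconfigure now returns the notify request together with a lock future that was created while the synchronous state lock was held, and the caller awaits that future only after the state lock is released. A minimal standalone sketch of the same pattern, with hypothetical names, assuming tokio:

use std::sync::{Arc, Mutex};

struct State {
    value: u64,
    // Async lock whose acquisition order must match the order of state updates
    send_lock: Arc<tokio::sync::Mutex<()>>,
}

async fn update_then_notify(state: &Mutex<State>, new_value: u64) {
    let (snapshot, lock_fut) = {
        let mut locked = state.lock().unwrap();
        locked.value = new_value;
        // Create the owned-lock future while still holding the sync lock,
        // mirroring maybe_reconfigure's (request, lock_owned future) return
        (locked.value, locked.send_lock.clone().lock_owned())
    };
    // Await outside the sync lock: remote calls happen in update order
    let _guard = lock_fut.await;
    println!("notifying with value {snapshot}");
}

#[tokio::main]
async fn main() {
    let state = Mutex::new(State {
        value: 0,
        send_lock: Arc::default(),
    });
    update_then_notify(&state, 42).await;
}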

View File

@@ -10,7 +10,9 @@ use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio_util::sync::CancellationToken;
use utils::auth::{Scope, SwappableJwtAuth};
use utils::failpoint_support::failpoints_handler;
use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
use utils::http::request::{must_get_query_param, parse_request_param};
use utils::id::{TenantId, TimelineId};
@@ -438,6 +440,24 @@ async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiEr
state.service.tenants_dump()
}
async fn handle_balance_all(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
service.balance_all()?;
json_response(StatusCode::OK, ())
}
async fn handle_balance_attached(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
service.balance_attached()?;
json_response(StatusCode::OK, ())
}
async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
@@ -554,6 +574,9 @@ pub fn make_router(
.post("/debug/v1/consistency_check", |r| {
request_span(r, handle_consistency_check)
})
.put("/debug/v1/failpoints", |r| {
request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
})
.get("/control/v1/tenant/:tenant_id/locate", |r| {
tenant_service_handler(r, handle_tenant_locate)
})
@@ -572,6 +595,12 @@ pub fn make_router(
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
tenant_service_handler(r, handle_tenant_shard_split)
})
.post("/control/v1/balance/all", |r| {
tenant_service_handler(r, handle_balance_all)
})
.post("/control/v1/balance/attached", |r| {
tenant_service_handler(r, handle_balance_attached)
})
// Tenant operations
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
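
For reference, the body accepted by the new /debug/v1/failpoints route is a JSON list of name/actions pairs (the same shape the Python fixture later in this diff sends). A hedged sketch of driving it with reqwest, assuming the json feature and a hypothetical controller address:

use serde_json::json;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let client = reqwest::Client::new();
    client
        .put("http://127.0.0.1:1234/debug/v1/failpoints")
        // "return(1)" makes the named failpoint fire on every evaluation
        .json(&json!([{"name": "shard-split-post-begin", "actions": "return(1)"}]))
        .send()
        .await?
        .error_for_status()?;
    Ok(())
}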

View File

@@ -0,0 +1,54 @@
use std::{collections::HashMap, sync::Arc};
/// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't
/// want to embed a lock in each one, or if your locking granularity is different to your object granularity.
/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking
/// is needed at a tenant-wide granularity.
pub(crate) struct IdLockMap<T>
where
T: Eq + PartialEq + std::hash::Hash,
{
/// A synchronous lock for getting/setting the async locks that our callers will wait on.
entities: std::sync::Mutex<std::collections::HashMap<T, Arc<tokio::sync::RwLock<()>>>>,
}
impl<T> IdLockMap<T>
where
T: Eq + PartialEq + std::hash::Hash,
{
pub(crate) fn shared(
&self,
key: T,
) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockReadGuard<()>> {
let mut locked = self.entities.lock().unwrap();
let entry = locked.entry(key).or_default();
entry.clone().read_owned()
}
pub(crate) fn exclusive(
&self,
key: T,
) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockWriteGuard<()>> {
let mut locked = self.entities.lock().unwrap();
let entry = locked.entry(key).or_default();
entry.clone().write_owned()
}
/// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do
/// periodic housekeeping to avoid the map growing indefinitely
pub(crate) fn housekeeping(&self) {
let mut locked = self.entities.lock().unwrap();
locked.retain(|_k, lock| lock.try_write().is_err())
}
}
impl<T> Default for IdLockMap<T>
where
T: Eq + PartialEq + std::hash::Hash,
{
fn default() -> Self {
Self {
entities: std::sync::Mutex::new(HashMap::new()),
}
}
}
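
A minimal usage sketch, assuming the IdLockMap above is in scope and tokio is available: exclusive for operations that must own a whole tenant (e.g. a shard split), shared for routine per-shard work, and housekeeping() called periodically to drop idle entries:

#[tokio::main]
async fn main() {
    let locks: IdLockMap<u64> = Default::default();

    // Tenant-wide exclusive lock, e.g. held for the duration of a shard split
    let split_guard = locks.exclusive(1).await;

    // A shared lock on a different tenant is independent and acquires immediately
    let _read_guard = locks.shared(2).await;

    // Entries whose locks are currently held are retained; idle ones are dropped
    locks.housekeeping();

    drop(split_guard);
}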

View File

@@ -4,6 +4,7 @@ use utils::seqwait::MonotonicCounter;
mod auth;
mod compute_hook;
pub mod http;
mod id_lock_map;
pub mod metrics;
mod node;
pub mod persistence;

View File

@@ -11,6 +11,9 @@ use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
use pageserver_api::models::TenantConfig;
use pageserver_api::shard::ShardConfigError;
use pageserver_api::shard::ShardIdentity;
use pageserver_api::shard::ShardStripeSize;
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
use serde::{Deserialize, Serialize};
use utils::generation::Generation;
@@ -72,6 +75,14 @@ pub(crate) enum DatabaseError {
Logical(String),
}
#[must_use]
pub(crate) enum AbortShardSplitStatus {
/// We aborted the split in the database by reverting to the parent shards
Aborted,
/// The split had already been persisted.
Complete,
}
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
impl Persistence {
@@ -570,6 +581,42 @@ impl Persistence {
})
.await
}
/// Used when the remote part of a shard split failed: we will revert the database state to have only
/// the parent shards, with SplitState::Idle.
pub(crate) async fn abort_shard_split(
&self,
split_tenant_id: TenantId,
new_shard_count: ShardCount,
) -> DatabaseResult<AbortShardSplitStatus> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| -> DatabaseResult<AbortShardSplitStatus> {
let aborted = conn.transaction(|conn| -> QueryResult<AbortShardSplitStatus> {
// Clear the splitting state on parent shards
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.ne(new_shard_count.literal() as i32))
.set((splitting.eq(0),))
.execute(conn)?;
// Parent shards are already gone: we cannot abort.
if updated == 0 {
return Ok(AbortShardSplitStatus::Complete);
}
// Erase child shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(new_shard_count.literal() as i32))
.execute(conn)?;
Ok(AbortShardSplitStatus::Aborted)
})?;
Ok(aborted)
})
.await
}
}
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
@@ -604,6 +651,28 @@ pub(crate) struct TenantShardPersistence {
pub(crate) config: String,
}
impl TenantShardPersistence {
pub(crate) fn get_shard_identity(&self) -> Result<ShardIdentity, ShardConfigError> {
if self.shard_count == 0 {
Ok(ShardIdentity::unsharded())
} else {
Ok(ShardIdentity::new(
ShardNumber(self.shard_number as u8),
ShardCount::new(self.shard_count as u8),
ShardStripeSize(self.shard_stripe_size as u32),
)?)
}
}
pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
Ok(TenantShardId {
tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
shard_number: ShardNumber(self.shard_number as u8),
shard_count: ShardCount::new(self.shard_count as u8),
})
}
}
/// Parts of [`crate::node::Node`] that are stored durably
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)]
#[diesel(table_name = crate::schema::nodes)]
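
Because AbortShardSplitStatus is #[must_use], callers have to handle both outcomes explicitly. A sketch of a hypothetical caller (not the controller's actual abort path), assuming the types above are in scope:

async fn handle_split_failure(
    persistence: &Persistence,
    tenant_id: TenantId,
    new_shard_count: ShardCount,
) -> DatabaseResult<()> {
    match persistence
        .abort_shard_split(tenant_id, new_shard_count)
        .await?
    {
        AbortShardSplitStatus::Aborted => {
            // Parent shards were reverted to SplitState::Idle and child rows erased
        }
        AbortShardSplitStatus::Complete => {
            // The split had already been persisted: fail forward, keep the children
        }
    }
    Ok(())
}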

View File

@@ -1,5 +1,6 @@
use crate::persistence::Persistence;
use crate::service;
use hyper::StatusCode;
use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
};
@@ -18,6 +19,8 @@ use crate::compute_hook::{ComputeHook, NotifyError};
use crate::node::Node;
use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
const DEFAULT_HEATMAP_PERIOD: &str = "60s";
/// Object with the lifetime of the background reconcile task that is created
/// for tenants which have a difference between their intent and observed states.
pub(super) struct Reconciler {
@@ -485,17 +488,29 @@ impl Reconciler {
)
.await
{
Some(Ok(observed)) => observed,
Some(Ok(observed)) => Some(observed),
Some(Err(mgmt_api::Error::ApiError(status, _msg)))
if status == StatusCode::NOT_FOUND =>
{
None
}
Some(Err(e)) => return Err(e.into()),
None => return Err(ReconcileError::Cancel),
};
tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}");
self.observed.locations.insert(
attached_node.get_id(),
ObservedStateLocation {
conf: observed_conf,
},
);
match observed_conf {
Some(conf) => {
// Pageserver returned a state: update it in observed. This may still be an indeterminate (None) state,
// if internally the pageserver's TenantSlot was being mutated (e.g. some long-running API call is still in progress)
self.observed
.locations
.insert(attached_node.get_id(), ObservedStateLocation { conf });
}
None => {
// Pageserver returned 404: we have confirmation that there is no state for this shard on that pageserver.
self.observed.locations.remove(&attached_node.get_id());
}
}
}
Ok(())
@@ -525,7 +540,12 @@ impl Reconciler {
)));
};
let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
let mut wanted_conf = attached_location_conf(
generation,
&self.shard,
&self.config,
!self.intent.secondary.is_empty(),
);
match self.observed.locations.get(&node.get_id()) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
// Nothing to do
@@ -662,10 +682,26 @@ impl Reconciler {
}
}
/// We tweak the externally-set TenantConfig while configuring
/// locations, using our awareness of whether secondary locations
/// are in use to automatically enable/disable heatmap uploads.
fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig {
let mut config = config.clone();
if has_secondaries {
if config.heatmap_period.is_none() {
config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string());
}
} else {
config.heatmap_period = None;
}
config
}
pub(crate) fn attached_location_conf(
generation: Generation,
shard: &ShardIdentity,
config: &TenantConfig,
has_secondaries: bool,
) -> LocationConfig {
LocationConfig {
mode: LocationConfigMode::AttachedSingle,
@@ -674,7 +710,7 @@ pub(crate) fn attached_location_conf(
shard_number: shard.number.0,
shard_count: shard.count.literal(),
shard_stripe_size: shard.stripe_size.0,
tenant_conf: config.clone(),
tenant_conf: ha_aware_config(config, has_secondaries),
}
}
@@ -689,6 +725,6 @@ pub(crate) fn secondary_location_conf(
shard_number: shard.number.0,
shard_count: shard.count.literal(),
shard_stripe_size: shard.stripe_size.0,
tenant_conf: config.clone(),
tenant_conf: ha_aware_config(config, true),
}
}
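
A small sketch of the resulting behaviour, assuming TenantConfig implements Default with heatmap_period unset: shards with secondaries get the 60s default unless the user set a period explicitly, and shards without secondaries have heatmap uploads disabled:

fn main() {
    let base = TenantConfig::default();

    // Secondaries present, no explicit period: defaults to "60s"
    let with_secondary = ha_aware_config(&base, true);
    assert_eq!(with_secondary.heatmap_period.as_deref(), Some("60s"));

    // No secondaries: heatmap uploads are switched off regardless of the period
    let attached_only = ha_aware_config(&base, false);
    assert_eq!(attached_only.heatmap_period, None);
}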

File diff suppressed because it is too large

View File

@@ -577,7 +577,12 @@ impl TenantState {
.generation
.expect("Attempted to enter attached state without a generation");
let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
let wanted_conf = attached_location_conf(
generation,
&self.shard,
&self.config,
!self.intent.secondary.is_empty(),
);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
Some(_) | None => {

View File

@@ -774,7 +774,10 @@ impl Endpoint {
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
}
let client = reqwest::Client::new();
let client = reqwest::Client::builder()
.timeout(Duration::from_secs(30))
.build()
.unwrap();
let response = client
.post(format!(
"http://{}:{}/configure",

View File

@@ -17,6 +17,7 @@ use std::time::Duration;
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use futures::SinkExt;
use hyper::StatusCode;
use pageserver_api::controller_api::NodeRegisterRequest;
use pageserver_api::models::{
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
@@ -262,6 +263,11 @@ impl PageServerNode {
match st {
Ok(()) => Ok(true),
Err(mgmt_api::Error::ReceiveBody(_)) => Ok(false),
Err(mgmt_api::Error::ApiError(status, _msg))
if status == StatusCode::SERVICE_UNAVAILABLE =>
{
Ok(false)
}
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
}
},

View File

@@ -2103,6 +2103,16 @@ where
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
{
if request.uri() != &"/v1/failpoints".parse::<Uri>().unwrap() {
fail::fail_point!("api-503", |_| Err(ApiError::ResourceUnavailable(
"failpoint".into()
)));
fail::fail_point!("api-500", |_| Err(ApiError::InternalServerError(
anyhow::anyhow!("failpoint")
)));
}
// Spawn a new task to handle the request, to protect the handler from unexpected
// async cancellations. Most pageserver functions are not async cancellation safe.
// We arm a drop-guard, so that if Hyper drops the Future, we signal the task
@@ -2247,7 +2257,7 @@ pub fn make_router(
.get("/v1/location_config", |r| {
api_handler(r, list_location_config_handler)
})
.get("/v1/location_config/:tenant_id", |r| {
.get("/v1/location_config/:tenant_shard_id", |r| {
api_handler(r, get_location_config_handler)
})
.put(
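
The api-500/api-503 failpoints above are configured with the fail crate's action strings, which the tests later in this diff send over HTTP ("return(1)", "off", "5%1000*return(1)"). A hedged in-process sketch of the same syntax, assuming the fail crate with failpoints enabled:

fn main() {
    // Fire on every evaluation of the failpoint
    fail::cfg("api-503", "return(1)").unwrap();

    // Fire with 5% probability, at most 1000 times (as the rolling-failure test uses)
    fail::cfg("api-503", "5%1000*return(1)").unwrap();

    // Disable the failpoint again
    fail::cfg("api-503", "off").unwrap();
}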

View File

@@ -1440,6 +1440,31 @@ impl TenantManager {
tenant_shard_id: TenantShardId,
new_shard_count: ShardCount,
ctx: &RequestContext,
) -> anyhow::Result<Vec<TenantShardId>> {
let r = self
.do_shard_split(tenant_shard_id, new_shard_count, ctx)
.await;
if r.is_err() {
// Shard splitting might have left the original shard in a partially shut down state (it
// stops the shard's remote timeline client). Reset it to ensure we leave things in
// a working state.
if self.get(tenant_shard_id).is_some() {
tracing::warn!("Resetting {tenant_shard_id} after shard split failure");
if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await {
// Log this error because our return value will still be the original error, not this one.
tracing::warn!("Failed to reset {tenant_shard_id}: {e}");
}
}
}
r
}
pub(crate) async fn do_shard_split(
&self,
tenant_shard_id: TenantShardId,
new_shard_count: ShardCount,
ctx: &RequestContext,
) -> anyhow::Result<Vec<TenantShardId>> {
let tenant = get_tenant(tenant_shard_id, true)?;
@@ -1466,6 +1491,10 @@ impl TenantManager {
.join(",")
);
fail::fail_point!("shard-split-pre-prepare", |_| Err(anyhow::anyhow!(
"failpoint"
)));
// Phase 1: Write out child shards' remote index files, in the parent tenant's current generation
if let Err(e) = tenant.split_prepare(&child_shards).await {
// If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might
@@ -1475,6 +1504,10 @@ impl TenantManager {
return Err(e);
}
fail::fail_point!("shard-split-post-prepare", |_| Err(anyhow::anyhow!(
"failpoint"
)));
self.resources.deletion_queue_client.flush_advisory();
// Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
@@ -1496,11 +1529,16 @@ impl TenantManager {
anyhow::bail!("Detached parent shard in the middle of split!")
}
};
fail::fail_point!("shard-split-pre-hardlink", |_| Err(anyhow::anyhow!(
"failpoint"
)));
// Optimization: hardlink layers from the parent into the children, so that they don't have to
// re-download & duplicate the data referenced in their initial IndexPart
self.shard_split_hardlink(parent, child_shards.clone())
.await?;
fail::fail_point!("shard-split-post-hardlink", |_| Err(anyhow::anyhow!(
"failpoint"
)));
// Take a snapshot of where the parent's WAL ingest had got to: we will wait for
// child shards to reach this point.
@@ -1537,6 +1575,10 @@ impl TenantManager {
.await?;
}
fail::fail_point!("shard-split-post-child-conf", |_| Err(anyhow::anyhow!(
"failpoint"
)));
// Phase 4: wait for child shards' WAL ingest to catch up to target LSN
for child_shard_id in &child_shards {
let child_shard_id = *child_shard_id;
@@ -1569,6 +1611,10 @@ impl TenantManager {
timeline.timeline_id,
target_lsn
);
fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
"failpoint"
)));
if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
// Failure here might mean shutdown, in any case this part is an optimization
// and we shouldn't hold up the split operation.
@@ -1614,6 +1660,10 @@ impl TenantManager {
},
);
fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
"failpoint"
)));
parent_slot_guard.drop_old_value()?;
// Phase 6: Release the InProgress on the parent shard

View File

@@ -1518,6 +1518,7 @@ class NeonCli(AbstractNeonCli):
conf: Optional[Dict[str, Any]] = None,
shard_count: Optional[int] = None,
shard_stripe_size: Optional[int] = None,
placement_policy: Optional[str] = None,
set_default: bool = False,
) -> Tuple[TenantId, TimelineId]:
"""
@@ -1551,6 +1552,9 @@ class NeonCli(AbstractNeonCli):
if shard_stripe_size is not None:
args.extend(["--shard-stripe-size", str(shard_stripe_size)])
if placement_policy is not None:
args.extend(["--placement-policy", str(placement_policy)])
res = self.raw_cli(args)
res.check_returncode()
return tenant_id, timeline_id
@@ -2168,6 +2172,37 @@ class NeonAttachmentService(MetricsGetter):
)
log.info("Attachment service passed consistency check")
def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
if isinstance(config_strings, tuple):
pairs = [config_strings]
else:
pairs = config_strings
log.info(f"Requesting config failpoints: {repr(pairs)}")
res = self.request(
"PUT",
f"{self.env.attachment_service_api}/debug/v1/failpoints",
json=[{"name": name, "actions": actions} for name, actions in pairs],
headers=self.headers(TokenScope.ADMIN),
)
log.info(f"Got failpoints request response code {res.status_code}")
res.raise_for_status()
def balance_all(self):
self.request(
"POST",
f"{self.env.attachment_service_api}/control/v1/balance/all",
headers=self.headers(TokenScope.ADMIN),
)
def balance_attached(self):
self.request(
"POST",
f"{self.env.attachment_service_api}/control/v1/balance/attached",
headers=self.headers(TokenScope.ADMIN),
)
def __enter__(self) -> "NeonAttachmentService":
return self
@@ -2322,16 +2357,16 @@ class NeonPageserver(PgProtocol):
def assert_no_errors(self):
logfile = self.workdir / "pageserver.log"
if not logfile.exists():
log.warning(f"Skipping log check: {logfile} does not exist")
log.warning(f"Skipping log check on pageserver {self.id}: {logfile} does not exist")
return
with logfile.open("r") as f:
errors = scan_pageserver_log_for_errors(f, self.allowed_errors)
for _lineno, error in errors:
log.info(f"not allowed error: {error.strip()}")
log.info(f"not allowed error (pageserver {self.id}): {error.strip()}")
assert not errors
assert not errors, f"Pageserver {self.id}: {errors}"
def assert_no_metric_errors(self):
"""

View File

@@ -1,3 +1,4 @@
import threading
from typing import Optional
from fixtures.log_helper import log
@@ -11,6 +12,10 @@ from fixtures.neon_fixtures import (
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.types import TenantId, TimelineId
# neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex
# to ensure we don't do that: this enables running lots of Workloads in parallel safely.
ENDPOINT_LOCK = threading.Lock()
class Workload:
"""
@@ -41,17 +46,30 @@ class Workload:
self._endpoint: Optional[Endpoint] = None
def reconfigure(self):
"""
Request the endpoint to reconfigure based on location reported by storage controller
"""
if self._endpoint is not None:
with ENDPOINT_LOCK:
self._endpoint.reconfigure()
def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint:
if self._endpoint is None:
self._endpoint = self.env.endpoints.create(
self.branch_name,
tenant_id=self.tenant_id,
pageserver_id=pageserver_id,
endpoint_id="ep-workload",
)
self._endpoint.start(pageserver_id=pageserver_id)
else:
self._endpoint.reconfigure(pageserver_id=pageserver_id)
# We may be running alongside other Workloads for different tenants. Full TTID is
# obnoxiously long for use here, but a cut-down version is still unique enough for tests.
endpoint_id = f"ep-workload-{str(self.tenant_id)[0:4]}-{str(self.timeline_id)[0:4]}"
with ENDPOINT_LOCK:
if self._endpoint is None:
self._endpoint = self.env.endpoints.create(
self.branch_name,
tenant_id=self.tenant_id,
pageserver_id=pageserver_id,
endpoint_id=endpoint_id,
)
self._endpoint.start(pageserver_id=pageserver_id)
else:
self._endpoint.reconfigure(pageserver_id=pageserver_id)
connstring = self._endpoint.safe_psql(
"SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'"
@@ -94,7 +112,7 @@ class Workload:
else:
return False
def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True):
def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True, ingest=True):
assert self.expect_rows >= n
max_iters = 10
@@ -132,22 +150,28 @@ class Workload:
]
)
for tenant_shard_id, pageserver in tenant_get_shards(
self.env, self.tenant_id, pageserver_id
):
last_flush_lsn = wait_for_last_flush_lsn(
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
ps_http = pageserver.http_client()
wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
if ingest:
# Wait for written data to be ingested by the pageserver
for tenant_shard_id, pageserver in tenant_get_shards(
self.env, self.tenant_id, pageserver_id
):
last_flush_lsn = wait_for_last_flush_lsn(
self.env,
endpoint,
self.tenant_id,
self.timeline_id,
pageserver_id=pageserver_id,
)
ps_http = pageserver.http_client()
wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
if upload:
# force a checkpoint to trigger upload
ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id)
wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
else:
log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
if upload:
# Wait for written data to be uploaded to S3 (force a checkpoint to trigger upload)
ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id)
wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
else:
log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
def validate(self, pageserver_id: Optional[int] = None):
endpoint = self.endpoint(pageserver_id)

View File

@@ -1,13 +1,17 @@
import os
from typing import Optional
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
AttachmentServiceApiException,
NeonEnv,
NeonEnvBuilder,
tenant_get_shards,
)
from fixtures.remote_storage import s3_storage
from fixtures.types import Lsn, TenantShardId, TimelineId
from fixtures.utils import wait_until
from fixtures.workload import Workload
@@ -400,3 +404,245 @@ def test_sharding_ingest(
# Each shard may emit up to one huge layer, because initdb ingest doesn't respect checkpoint_distance.
assert huge_layer_count <= shard_count
class Failure:
pageserver_id: Optional[int]
def apply(self, env: NeonEnv):
raise NotImplementedError()
def clear(self, env: NeonEnv):
"""
Clear the failure, in a way that should enable the system to proceed
to a totally clean state (all nodes online and reconciled)
"""
raise NotImplementedError()
def expect_available(self):
raise NotImplementedError()
def can_mitigate(self):
"""Whether Self.mitigate is available for use"""
return False
def mitigate(self, env: NeonEnv):
"""
Mitigate the failure in a way that should allow shard split to
complete and service to resume, but does not guarantee to leave
the whole world in a clean state (e.g. an Offline node might have
junk LocationConfigs on it)
"""
raise NotImplementedError()
def fails_forward(self):
"""
If true, this failure results in a state that eventually completes the split.
"""
return False
class PageserverFailpoint(Failure):
def __init__(self, failpoint, pageserver_id, mitigate):
self.failpoint = failpoint
self.pageserver_id = pageserver_id
self._mitigate = mitigate
def apply(self, env: NeonEnv):
pageserver = env.get_pageserver(self.pageserver_id)
pageserver.allowed_errors.extend(
[".*failpoint.*", ".*Resetting.*after shard split failure.*"]
)
pageserver.http_client().configure_failpoints((self.failpoint, "return(1)"))
def clear(self, env: NeonEnv):
pageserver = env.get_pageserver(self.pageserver_id)
pageserver.http_client().configure_failpoints((self.failpoint, "off"))
if self._mitigate:
env.attachment_service.node_configure(self.pageserver_id, {"availability": "Active"})
def expect_available(self):
return True
def can_mitigate(self):
return self._mitigate
def mitigate(self, env):
env.attachment_service.node_configure(self.pageserver_id, {"availability": "Offline"})
class StorageControllerFailpoint(Failure):
def __init__(self, failpoint):
self.failpoint = failpoint
self.pageserver_id = None
def apply(self, env: NeonEnv):
env.attachment_service.configure_failpoints((self.failpoint, "return(1)"))
def clear(self, env: NeonEnv):
env.attachment_service.configure_failpoints((self.failpoint, "off"))
def expect_available(self):
return True
def can_mitigate(self):
return False
def fails_forward(self):
# Edge case: the very last failpoint that simulates a DB connection error, where
# the abort path will fail-forward and result in a complete split.
return self.failpoint == "shard-split-post-complete"
class NodeKill(Failure):
def __init__(self, pageserver_id, mitigate):
self.pageserver_id = pageserver_id
self._mitigate = mitigate
def apply(self, env: NeonEnv):
pageserver = env.get_pageserver(self.pageserver_id)
pageserver.stop(immediate=True)
def clear(self, env: NeonEnv):
pageserver = env.get_pageserver(self.pageserver_id)
pageserver.start()
def expect_available(self):
return False
def mitigate(self, env):
env.attachment_service.node_configure(self.pageserver_id, {"availability": "Offline"})
@pytest.mark.parametrize(
"failure",
[
PageserverFailpoint("api-500", 1, False),
NodeKill(1, False),
PageserverFailpoint("api-500", 1, True),
NodeKill(1, True),
PageserverFailpoint("shard-split-pre-prepare", 1, False),
PageserverFailpoint("shard-split-post-prepare", 1, False),
PageserverFailpoint("shard-split-pre-hardlink", 1, False),
PageserverFailpoint("shard-split-post-hardlink", 1, False),
PageserverFailpoint("shard-split-post-child-conf", 1, False),
PageserverFailpoint("shard-split-lsn-wait", 1, False),
PageserverFailpoint("shard-split-pre-finish", 1, False),
StorageControllerFailpoint("shard-split-validation"),
StorageControllerFailpoint("shard-split-post-begin"),
StorageControllerFailpoint("shard-split-post-remote"),
StorageControllerFailpoint("shard-split-post-complete"),
],
)
def test_sharding_split_failures(neon_env_builder: NeonEnvBuilder, failure: Failure):
neon_env_builder.num_pageservers = 4
initial_shard_count = 2
split_shard_count = 4
env = neon_env_builder.init_start(initial_tenant_shard_count=initial_shard_count)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# Make sure the node we're failing has a shard on it, otherwise the test isn't testing anything
assert (
failure.pageserver_id is None
or len(
env.get_pageserver(failure.pageserver_id)
.http_client()
.tenant_list_locations()["tenant_shards"]
)
> 0
)
workload = Workload(env, tenant_id, timeline_id)
workload.init()
workload.write_rows(100)
# Apply the failure (e.g. one pageserver returns 500 on all requests), then attempt a split
# TODO: also test with a long-blocking failure: controller should time out its request and then
# clean up in a well defined way.
failure.apply(env)
with pytest.raises(AttachmentServiceApiException):
env.attachment_service.tenant_shard_split(tenant_id, shard_count=4)
# We expect that the overall operation will fail, but some split requests
# will have succeeded: the net result should be to return to a clean state, including
# detaching any child shards.
def assert_rolled_back(exclude_ps_id=None) -> None:
count = 0
for ps in env.pageservers:
if exclude_ps_id is not None and ps.id == exclude_ps_id:
continue
locations = ps.http_client().tenant_list_locations()["tenant_shards"]
for loc in locations:
tenant_shard_id = TenantShardId.parse(loc[0])
log.info(f"Shard {tenant_shard_id} seen on node {ps.id}")
assert tenant_shard_id.shard_count == initial_shard_count
count += 1
assert count == initial_shard_count
def assert_split_done(exclude_ps_id=None) -> None:
count = 0
for ps in env.pageservers:
if exclude_ps_id is not None and ps.id == exclude_ps_id:
continue
locations = ps.http_client().tenant_list_locations()["tenant_shards"]
for loc in locations:
tenant_shard_id = TenantShardId.parse(loc[0])
log.info(f"Shard {tenant_shard_id} seen on node {ps.id}")
assert tenant_shard_id.shard_count == split_shard_count
count += 1
assert count == split_shard_count
def finish_split():
# Having failed+rolled back, we should be able to split again
# No failures this time; it will succeed
env.attachment_service.tenant_shard_split(tenant_id, shard_count=split_shard_count)
workload.churn_rows(10)
workload.validate()
if failure.expect_available():
# Even though the split failed partway through, this should not have interrupted
# clients. Disable waiting for pageservers in the workload helper, because our
# failpoints may prevent API access.
# This only applies to failure modes that leave the pageserver's page_service API available.
workload.churn_rows(10, upload=False, ingest=False)
workload.validate()
if failure.fails_forward():
# A failure type which results in eventual completion of the split
wait_until(30, 1, assert_split_done)
elif failure.can_mitigate():
# Mitigation phase: we expect to be able to proceed with a successful shard split
failure.mitigate(env)
# The split should appear to be rolled back from the point of view of all pageservers
# apart from the one that is offline
wait_until(30, 1, lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id))
finish_split()
wait_until(30, 1, lambda: assert_split_done(exclude_ps_id=failure.pageserver_id))
# Having cleared the failure, everything should converge to a pristine state
failure.clear(env)
wait_until(30, 1, assert_split_done)
else:
# Once we restore the faulty pageserver's API to good health, rollback should
# eventually complete.
failure.clear(env)
wait_until(30, 1, assert_rolled_back)
# Having rolled back, the tenant should be working
workload.churn_rows(10)
workload.validate()
# Splitting again should work, since we cleared the failure
finish_split()
assert_split_done()
env.attachment_service.consistency_check()

View File

@@ -1,3 +1,5 @@
import concurrent.futures
import random
import time
from collections import defaultdict
from datetime import datetime, timezone
@@ -23,8 +25,9 @@ from fixtures.pageserver.utils import (
)
from fixtures.pg_version import PgVersion
from fixtures.remote_storage import RemoteStorageKind, s3_storage
from fixtures.types import TenantId, TimelineId
from fixtures.types import TenantId, TenantShardId, TimelineId
from fixtures.utils import run_pg_bench_small, wait_until
from fixtures.workload import Workload
from mypy_boto3_s3.type_defs import (
ObjectTypeDef,
)
@@ -770,3 +773,186 @@ def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder):
assert "pitr_interval" not in readback_ps.tenant_specific_overrides
env.attachment_service.consistency_check()
def test_storcon_rolling_failures(
neon_env_builder: NeonEnvBuilder, httpserver: HTTPServer, httpserver_listen_address
):
neon_env_builder.num_pageservers = 8
(host, port) = httpserver_listen_address
neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify-attach"
workloads: dict[TenantId, Workload] = {}
# Do neon_local endpoint reconfiguration in the background so that we can
# accept a healthy rate of calls into notify-attach.
reconfigure_threads = concurrent.futures.ThreadPoolExecutor(max_workers=1)
def handler(request: Request):
"""
Although the storage controller can use neon_local directly, doing so causes problems when
the test is also concurrently modifying endpoints. Instead, configure the storage controller
to send notifications up to this test code, which routes all endpoint updates
through Workload, whose mutex makes this safe.
"""
assert request.json is not None
body: dict[str, Any] = request.json
log.info(f"notify-attach request: {body}")
try:
workload = workloads[TenantId(body["tenant_id"])]
except KeyError:
pass
else:
# This causes the endpoint to query storage controller for its location, which
# is redundant since we already have it here, but this avoids extending the
# neon_local CLI to take full lists of locations
reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[no-any-return]
return Response(status=200)
httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler)
env = neon_env_builder.init_start()
for ps in env.pageservers:
# We will do unclean detaches
ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
n_tenants = 32
tenants = [(env.initial_tenant, env.initial_timeline)]
for i in range(0, n_tenants - 1):
tenant_id = TenantId.generate()
timeline_id = TimelineId.generate()
shard_count = [1, 2, 4][i % 3]
env.neon_cli.create_tenant(
tenant_id, timeline_id, shard_count=shard_count, placement_policy='{"Double":1}'
)
tenants.append((tenant_id, timeline_id))
# Background pain:
# - TODO: some fraction of pageserver API requests hang
# (this requires implementing wrap of location_conf calls with proper timeline/cancel)
# - TODO: continuous tenant/timeline creation/destruction over a different ID range than
# the ones we're using for availability checks.
rng = random.Random(0xDEADBEEF)
for tenant_id, timeline_id in tenants:
workload = Workload(env, tenant_id, timeline_id)
workloads[tenant_id] = workload
def node_evacuated(node_id: int):
counts = get_node_shard_counts(env, [t[0] for t in tenants])
assert counts[node_id] == 0
def attachments_active():
for tid, _tlid in tenants:
for shard in env.attachment_service.locate(tid):
psid = shard["node_id"]
tsid = TenantShardId.parse(shard["shard_id"])
status = env.get_pageserver(psid).http_client().tenant_status(tenant_id=tsid)
assert status["state"]["slug"] == "Active"
log.info(f"Shard {tsid} active on node {psid}")
failpoints = ("api-503", "5%1000*return(1)")
failpoints_str = f"{failpoints[0]}={failpoints[1]}"
for ps in env.pageservers:
ps.http_client().configure_failpoints(failpoints)
def for_all_workloads(callback, timeout=60):
futs = []
with concurrent.futures.ThreadPoolExecutor() as pool:
for _tenant_id, workload in workloads.items():
futs.append(pool.submit(callback, workload))
for f in futs:
f.result(timeout=timeout)
def clean_fail_restore():
"""
Clean shutdown of a node: mark it offline in storage controller, wait for new attachment
locations to activate, then SIGTERM it.
- Endpoints should not fail any queries
- New attach locations should activate within bounded time.
"""
victim = rng.choice(env.pageservers)
env.attachment_service.node_configure(victim.id, {"availability": "Offline"})
wait_until(10, 1, lambda node_id=victim.id: node_evacuated(node_id)) # type: ignore[misc]
wait_until(10, 1, attachments_active)
victim.stop(immediate=False)
traffic()
victim.start(extra_env_vars={"FAILPOINTS": failpoints_str})
# Revert shards to attach at their original locations
env.attachment_service.balance_attached()
wait_until(10, 1, attachments_active)
def hard_fail_restore():
"""
Simulate an unexpected death of a pageserver node
"""
victim = rng.choice(env.pageservers)
victim.stop(immediate=True)
# TODO: once we implement heartbeats detecting node failures, remove this
# explicit marking offline and rely on storage controller to detect it itself.
env.attachment_service.node_configure(victim.id, {"availability": "Offline"})
wait_until(10, 1, lambda node_id=victim.id: node_evacuated(node_id)) # type: ignore[misc]
wait_until(10, 1, attachments_active)
traffic()
victim.start(extra_env_vars={"FAILPOINTS": failpoints_str})
env.attachment_service.balance_attached()
wait_until(10, 1, attachments_active)
def traffic():
"""
Check that all tenants are working for postgres clients
"""
def exercise_one(workload):
workload.churn_rows(100)
workload.validate()
for_all_workloads(exercise_one)
def init_one(workload):
workload.init()
workload.write_rows(100)
for_all_workloads(init_one, timeout=60)
for i in range(0, 20):
mode = rng.choice([0, 1, 2])
log.info(f"Iteration {i}, mode {mode}")
if mode == 0:
# Traffic interval: sometimes, instead of a failure, just let the clients
# write a load of data. This avoids chaos tests ending up with unrealistically
# small quantities of data in flight.
traffic()
elif mode == 1:
clean_fail_restore()
elif mode == 2:
hard_fail_restore()
# Fail and restart: hard-kill one node. Notify the storage controller that it is offline.
# Success criteria:
# - New attach locations should activate within bounded time
# - TODO: once we do heartbeating, we should not have to explicitly mark the node offline
# TODO: fail and remove: fail a node, and remove it from the cluster.
# Success criteria:
# - Endpoints should not fail any queries
# - New attach locations should activate within bounded time
# - New secondary locations should fill up with data within bounded time
# TODO: somehow need to wait for reconciles to complete before doing consistency check
# (or make the check wait).
# Do consistency check on every iteration, not just at the end: this makes it more obvious
# which change caused an issue.
env.attachment_service.consistency_check()