From 44f42627dd32f29a3be9cfcd2d8c487c89642dc8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 14 Mar 2024 09:11:57 +0000 Subject: [PATCH] pageserver/controller: error handling for shard splitting (#7074) ## Problem Shard splits worked, but weren't safe against failures (e.g. node crash during split) yet. Related: #6676 ## Summary of changes - Introduce async rwlocks at the scope of Tenant and Node: - exclusive tenant lock is used to protect splits - exclusive node lock is used to protect new reconciliation process that happens when setting node active - exclusive locks used in both cases when doing persistent updates (e.g. node scheduling conf) where the update to DB & in-memory state needs to be atomic. - Add failpoints to shard splitting in control plane and pageserver code. - Implement error handling in control plane for shard splits: this detaches child chards and ensures parent shards are re-attached. - Crash-safety for storage controller restarts requires little effort: we already reconcile with nodes over a storage controller restart, so as long as we reset any incomplete splits in the DB on restart (added in this PR), things are implicitly cleaned up. - Implement reconciliation with offline nodes before they transition to active: - (in this context reconciliation means something like startup_reconcile, not literally the Reconciler) - This covers cases where split abort cannot reach a node to clean it up: the cleanup will eventually happen when the node is marked active, as part of reconciliation. - This also covers the case where a node was unavailable when the storage controller started, but becomes available later: previously this allowed it to skip the startup reconcile. - Storage controller now terminates on panics. We only use panics for true "should never happen" assertions, and these cases can leave us in an un-usable state if we keep running (e.g. panicking in a shard split). In the unlikely event that we get into a crashloop as a result, we'll rely on kubernetes to back us off. - Add `test_sharding_split_failures` which exercises a variety of failure cases during shard split. --- Cargo.lock | 2 + control_plane/attachment_service/Cargo.toml | 2 + control_plane/attachment_service/src/http.rs | 5 + .../attachment_service/src/id_lock_map.rs | 54 ++ control_plane/attachment_service/src/lib.rs | 1 + control_plane/attachment_service/src/main.rs | 6 + control_plane/attachment_service/src/node.rs | 35 +- .../attachment_service/src/persistence.rs | 78 ++ .../attachment_service/src/reconciler.rs | 56 +- .../attachment_service/src/service.rs | 787 +++++++++++++++--- .../attachment_service/src/tenant_state.rs | 7 +- pageserver/src/http/routes.rs | 10 + pageserver/src/tenant/mgr.rs | 56 +- test_runner/conftest.py | 1 + test_runner/fixtures/compute_reconfigure.py | 62 ++ test_runner/fixtures/neon_fixtures.py | 17 + test_runner/fixtures/workload.py | 76 +- test_runner/regress/test_sharding.py | 340 +++++++- 18 files changed, 1445 insertions(+), 150 deletions(-) create mode 100644 control_plane/attachment_service/src/id_lock_map.rs create mode 100644 test_runner/fixtures/compute_reconfigure.py diff --git a/Cargo.lock b/Cargo.lock index 7fd9053f62..45397eb4a2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -282,8 +282,10 @@ dependencies = [ "control_plane", "diesel", "diesel_migrations", + "fail", "futures", "git-version", + "hex", "humantime", "hyper", "metrics", diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index a5fad7216c..f78f56c480 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -19,8 +19,10 @@ aws-config.workspace = true aws-sdk-secretsmanager.workspace = true camino.workspace = true clap.workspace = true +fail.workspace = true futures.workspace = true git-version.workspace = true +hex.workspace = true hyper.workspace = true humantime.workspace = true once_cell.workspace = true diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 515c287ea9..d26652cc94 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -10,7 +10,9 @@ use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use std::sync::Arc; use std::time::{Duration, Instant}; +use tokio_util::sync::CancellationToken; use utils::auth::{Scope, SwappableJwtAuth}; +use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::{auth_middleware, check_permission_with, request_span}; use utils::http::request::{must_get_query_param, parse_request_param}; use utils::id::{TenantId, TimelineId}; @@ -554,6 +556,9 @@ pub fn make_router( .post("/debug/v1/consistency_check", |r| { request_span(r, handle_consistency_check) }) + .put("/debug/v1/failpoints", |r| { + request_span(r, |r| failpoints_handler(r, CancellationToken::new())) + }) .get("/control/v1/tenant/:tenant_id/locate", |r| { tenant_service_handler(r, handle_tenant_locate) }) diff --git a/control_plane/attachment_service/src/id_lock_map.rs b/control_plane/attachment_service/src/id_lock_map.rs new file mode 100644 index 0000000000..b03700b50c --- /dev/null +++ b/control_plane/attachment_service/src/id_lock_map.rs @@ -0,0 +1,54 @@ +use std::{collections::HashMap, sync::Arc}; + +/// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't +/// want to embed a lock in each one, or if your locking granularity is different to your object granularity. +/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking +/// is needed at a tenant-wide granularity. +pub(crate) struct IdLockMap +where + T: Eq + PartialEq + std::hash::Hash, +{ + /// A synchronous lock for getting/setting the async locks that our callers will wait on. + entities: std::sync::Mutex>>>, +} + +impl IdLockMap +where + T: Eq + PartialEq + std::hash::Hash, +{ + pub(crate) fn shared( + &self, + key: T, + ) -> impl std::future::Future> { + let mut locked = self.entities.lock().unwrap(); + let entry = locked.entry(key).or_default(); + entry.clone().read_owned() + } + + pub(crate) fn exclusive( + &self, + key: T, + ) -> impl std::future::Future> { + let mut locked = self.entities.lock().unwrap(); + let entry = locked.entry(key).or_default(); + entry.clone().write_owned() + } + + /// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do + /// periodic housekeeping to avoid the map growing indefinitely + pub(crate) fn housekeeping(&self) { + let mut locked = self.entities.lock().unwrap(); + locked.retain(|_k, lock| lock.try_write().is_err()) + } +} + +impl Default for IdLockMap +where + T: Eq + PartialEq + std::hash::Hash, +{ + fn default() -> Self { + Self { + entities: std::sync::Mutex::new(HashMap::new()), + } + } +} diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index 796b465c10..a017bc1ecc 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -4,6 +4,7 @@ use utils::seqwait::MonotonicCounter; mod auth; mod compute_hook; pub mod http; +mod id_lock_map; pub mod metrics; mod node; pub mod persistence; diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index 333c3911e3..fb7b363c39 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -206,6 +206,12 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> { } fn main() -> anyhow::Result<()> { + let default_panic = std::panic::take_hook(); + std::panic::set_hook(Box::new(move |info| { + default_panic(info); + std::process::exit(1); + })); + tokio::runtime::Builder::new_current_thread() // We use spawn_blocking for database operations, so require approximately // as many blocking threads as we will open database connections. diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs index 27b03608fa..dda8a155c6 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/control_plane/attachment_service/src/node.rs @@ -83,29 +83,38 @@ impl Node { } } - pub(crate) fn set_availability( - &mut self, - availability: NodeAvailability, - ) -> AvailabilityTransition { - use NodeAvailability::*; - let transition = match (self.availability, availability) { - (Offline, Active) => { + pub(crate) fn set_availability(&mut self, availability: NodeAvailability) { + match self.get_availability_transition(availability) { + AvailabilityTransition::ToActive => { // Give the node a new cancellation token, effectively resetting it to un-cancelled. Any // users of previously-cloned copies of the node will still see the old cancellation // state. For example, Reconcilers in flight will have to complete and be spawned // again to realize that the node has become available. self.cancel = CancellationToken::new(); - AvailabilityTransition::ToActive } - (Active, Offline) => { + AvailabilityTransition::ToOffline => { // Fire the node's cancellation token to cancel any in-flight API requests to it self.cancel.cancel(); - AvailabilityTransition::ToOffline } - _ => AvailabilityTransition::Unchanged, - }; + AvailabilityTransition::Unchanged => {} + } self.availability = availability; - transition + } + + /// Without modifying the availability of the node, convert the intended availability + /// into a description of the transition. + pub(crate) fn get_availability_transition( + &self, + availability: NodeAvailability, + ) -> AvailabilityTransition { + use AvailabilityTransition::*; + use NodeAvailability::*; + + match (self.availability, availability) { + (Offline, Active) => ToActive, + (Active, Offline) => ToOffline, + _ => Unchanged, + } } /// Whether we may send API requests to this node. diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index aa08945834..3602cf8b1f 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -11,6 +11,9 @@ use diesel::prelude::*; use diesel::Connection; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; use pageserver_api::models::TenantConfig; +use pageserver_api::shard::ShardConfigError; +use pageserver_api::shard::ShardIdentity; +use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; use serde::{Deserialize, Serialize}; use utils::generation::Generation; @@ -72,6 +75,14 @@ pub(crate) enum DatabaseError { Logical(String), } +#[must_use] +pub(crate) enum AbortShardSplitStatus { + /// We aborted the split in the database by reverting to the parent shards + Aborted, + /// The split had already been persisted. + Complete, +} + pub(crate) type DatabaseResult = Result; impl Persistence { @@ -570,6 +581,51 @@ impl Persistence { }) .await } + + /// Used when the remote part of a shard split failed: we will revert the database state to have only + /// the parent shards, with SplitState::Idle. + pub(crate) async fn abort_shard_split( + &self, + split_tenant_id: TenantId, + new_shard_count: ShardCount, + ) -> DatabaseResult { + use crate::schema::tenant_shards::dsl::*; + self.with_conn(move |conn| -> DatabaseResult { + let aborted = conn.transaction(|conn| -> DatabaseResult { + // Clear the splitting state on parent shards + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.ne(new_shard_count.literal() as i32)) + .set((splitting.eq(0),)) + .execute(conn)?; + + // Parent shards are already gone: we cannot abort. + if updated == 0 { + return Ok(AbortShardSplitStatus::Complete); + } + + // Sanity check: if parent shards were present, their cardinality should + // be less than the number of child shards. + if updated >= new_shard_count.count() as usize { + return Err(DatabaseError::Logical(format!( + "Unexpected parent shard count {updated} while aborting split to \ + count {new_shard_count:?} on tenant {split_tenant_id}" + ))); + } + + // Erase child shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(new_shard_count.literal() as i32)) + .execute(conn)?; + + Ok(AbortShardSplitStatus::Aborted) + })?; + + Ok(aborted) + }) + .await + } } /// Parts of [`crate::tenant_state::TenantState`] that are stored durably @@ -604,6 +660,28 @@ pub(crate) struct TenantShardPersistence { pub(crate) config: String, } +impl TenantShardPersistence { + pub(crate) fn get_shard_identity(&self) -> Result { + if self.shard_count == 0 { + Ok(ShardIdentity::unsharded()) + } else { + Ok(ShardIdentity::new( + ShardNumber(self.shard_number as u8), + ShardCount::new(self.shard_count as u8), + ShardStripeSize(self.shard_stripe_size as u32), + )?) + } + } + + pub(crate) fn get_tenant_shard_id(&self) -> Result { + Ok(TenantShardId { + tenant_id: TenantId::from_str(self.tenant_id.as_str())?, + shard_number: ShardNumber(self.shard_number as u8), + shard_count: ShardCount::new(self.shard_count as u8), + }) + } +} + /// Parts of [`crate::node::Node`] that are stored durably #[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)] #[diesel(table_name = crate::schema::nodes)] diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index 603da9bf02..7f68a65c15 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -1,5 +1,6 @@ use crate::persistence::Persistence; use crate::service; +use hyper::StatusCode; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; @@ -18,6 +19,8 @@ use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation}; +const DEFAULT_HEATMAP_PERIOD: &str = "60s"; + /// Object with the lifetime of the background reconcile task that is created /// for tenants which have a difference between their intent and observed states. pub(super) struct Reconciler { @@ -485,17 +488,29 @@ impl Reconciler { ) .await { - Some(Ok(observed)) => observed, + Some(Ok(observed)) => Some(observed), + Some(Err(mgmt_api::Error::ApiError(status, _msg))) + if status == StatusCode::NOT_FOUND => + { + None + } Some(Err(e)) => return Err(e.into()), None => return Err(ReconcileError::Cancel), }; tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}"); - self.observed.locations.insert( - attached_node.get_id(), - ObservedStateLocation { - conf: observed_conf, - }, - ); + match observed_conf { + Some(conf) => { + // Pageserver returned a state: update it in observed. This may still be an indeterminate (None) state, + // if internally the pageserver's TenantSlot was being mutated (e.g. some long running API call is still running) + self.observed + .locations + .insert(attached_node.get_id(), ObservedStateLocation { conf }); + } + None => { + // Pageserver returned 404: we have confirmation that there is no state for this shard on that pageserver. + self.observed.locations.remove(&attached_node.get_id()); + } + } } Ok(()) @@ -525,7 +540,12 @@ impl Reconciler { ))); }; - let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config); + let mut wanted_conf = attached_location_conf( + generation, + &self.shard, + &self.config, + !self.intent.secondary.is_empty(), + ); match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { // Nothing to do @@ -662,10 +682,26 @@ impl Reconciler { } } +/// We tweak the externally-set TenantConfig while configuring +/// locations, using our awareness of whether secondary locations +/// are in use to automatically enable/disable heatmap uploads. +fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig { + let mut config = config.clone(); + if has_secondaries { + if config.heatmap_period.is_none() { + config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string()); + } + } else { + config.heatmap_period = None; + } + config +} + pub(crate) fn attached_location_conf( generation: Generation, shard: &ShardIdentity, config: &TenantConfig, + has_secondaries: bool, ) -> LocationConfig { LocationConfig { mode: LocationConfigMode::AttachedSingle, @@ -674,7 +710,7 @@ pub(crate) fn attached_location_conf( shard_number: shard.number.0, shard_count: shard.count.literal(), shard_stripe_size: shard.stripe_size.0, - tenant_conf: config.clone(), + tenant_conf: ha_aware_config(config, has_secondaries), } } @@ -689,6 +725,6 @@ pub(crate) fn secondary_location_conf( shard_number: shard.number.0, shard_count: shard.count.literal(), shard_stripe_size: shard.stripe_size.0, - tenant_conf: config.clone(), + tenant_conf: ha_aware_config(config, true), } } diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index 1c4ede3d9d..1b85081666 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -7,6 +7,7 @@ use std::{ time::{Duration, Instant}, }; +use crate::{id_lock_map::IdLockMap, persistence::AbortShardSplitStatus}; use anyhow::Context; use control_plane::storage_controller::{ AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, @@ -36,6 +37,7 @@ use pageserver_api::{ }, }; use pageserver_client::mgmt_api; +use tokio::sync::OwnedRwLockWriteGuard; use tokio_util::sync::CancellationToken; use tracing::instrument; use utils::{ @@ -147,6 +149,18 @@ pub struct Service { compute_hook: Arc, result_tx: tokio::sync::mpsc::UnboundedSender, + // Channel for background cleanup from failed operations that require cleanup, such as shard split + abort_tx: tokio::sync::mpsc::UnboundedSender, + + // Locking on a tenant granularity (covers all shards in the tenant): + // - Take exclusively for rare operations that mutate the tenant's persistent state (e.g. create/delete/split) + // - Take in shared mode for operations that need the set of shards to stay the same to complete reliably (e.g. timeline CRUD) + tenant_op_locks: IdLockMap, + + // Locking for node-mutating operations: take exclusively for operations that modify the node's persistent state, or + // that transition it to/from Active. + node_op_locks: IdLockMap, + // Process shutdown will fire this token cancel: CancellationToken, @@ -174,6 +188,27 @@ enum TenantCreateOrUpdate { Update(Vec), } +/// When we tenant shard split operation fails, we may not be able to clean up immediately, because nodes +/// might not be available. We therefore use a queue of abort operations processed in the background. +struct TenantShardSplitAbort { + tenant_id: TenantId, + /// The target values from the request that failed + new_shard_count: ShardCount, + new_stripe_size: Option, + /// Until this abort op is complete, no other operations may be done on the tenant + _tenant_lock: tokio::sync::OwnedRwLockWriteGuard<()>, +} + +#[derive(thiserror::Error, Debug)] +enum TenantShardSplitAbortError { + #[error(transparent)] + Database(#[from] DatabaseError), + #[error(transparent)] + Remote(#[from] mgmt_api::Error), + #[error("Unavailable")] + Unavailable, +} + struct ShardUpdate { tenant_shard_id: TenantShardId, placement_policy: PlacementPolicy, @@ -627,8 +662,52 @@ impl Service { } } + async fn process_aborts( + &self, + mut abort_rx: tokio::sync::mpsc::UnboundedReceiver, + ) { + loop { + // Wait for the next result, or for cancellation + let op = tokio::select! { + r = abort_rx.recv() => { + match r { + Some(op) => {op}, + None => {break;} + } + } + _ = self.cancel.cancelled() => { + break; + } + }; + + // Retry until shutdown: we must keep this request object alive until it is properly + // processed, as it holds a lock guard that prevents other operations trying to do things + // to the tenant while it is in a weird part-split state. + while !self.cancel.is_cancelled() { + match self.abort_tenant_shard_split(&op).await { + Ok(_) => break, + Err(e) => { + tracing::warn!( + "Failed to abort shard split on {}, will retry: {e}", + op.tenant_id + ); + + // If a node is unavailable, we hope that it has been properly marked Offline + // when we retry, so that the abort op will succeed. If the abort op is failing + // for some other reason, we will keep retrying forever, or until a human notices + // and does something about it (either fixing a pageserver or restarting the controller). + tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled()) + .await + .ok(); + } + } + } + } + } + pub async fn spawn(config: Config, persistence: Arc) -> anyhow::Result> { let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel(); + let (abort_tx, abort_rx) = tokio::sync::mpsc::unbounded_channel(); tracing::info!("Loading nodes from database..."); let nodes = persistence @@ -641,12 +720,62 @@ impl Service { tracing::info!("Loaded {} nodes from database.", nodes.len()); tracing::info!("Loading shards from database..."); - let tenant_shard_persistence = persistence.list_tenant_shards().await?; + let mut tenant_shard_persistence = persistence.list_tenant_shards().await?; tracing::info!( "Loaded {} shards from database.", tenant_shard_persistence.len() ); + // If any shard splits were in progress, reset the database state to abort them + let mut tenant_shard_count_min_max: HashMap = + HashMap::new(); + for tsp in &mut tenant_shard_persistence { + let shard = tsp.get_shard_identity()?; + let tenant_shard_id = tsp.get_tenant_shard_id()?; + let entry = tenant_shard_count_min_max + .entry(tenant_shard_id.tenant_id) + .or_insert_with(|| (shard.count, shard.count)); + entry.0 = std::cmp::min(entry.0, shard.count); + entry.1 = std::cmp::max(entry.1, shard.count); + } + + for (tenant_id, (count_min, count_max)) in tenant_shard_count_min_max { + if count_min != count_max { + // Aborting the split in the database and dropping the child shards is sufficient: the reconciliation in + // [`Self::startup_reconcile`] will implicitly drop the child shards on remote pageservers, or they'll + // be dropped later in [`Self::node_activate_reconcile`] if it isn't available right now. + tracing::info!("Aborting shard split {tenant_id} {count_min:?} -> {count_max:?}"); + let abort_status = persistence.abort_shard_split(tenant_id, count_max).await?; + + // We may never see the Complete status here: if the split was complete, we wouldn't have + // identified this tenant has having mismatching min/max counts. + assert!(matches!(abort_status, AbortShardSplitStatus::Aborted)); + + // Clear the splitting status in-memory, to reflect that we just aborted in the database + tenant_shard_persistence.iter_mut().for_each(|tsp| { + // Set idle split state on those shards that we will retain. + let tsp_tenant_id = TenantId::from_str(tsp.tenant_id.as_str()).unwrap(); + if tsp_tenant_id == tenant_id + && tsp.get_shard_identity().unwrap().count == count_min + { + tsp.splitting = SplitState::Idle; + } else if tsp_tenant_id == tenant_id { + // Leave the splitting state on the child shards: this will be used next to + // drop them. + tracing::info!( + "Shard {tsp_tenant_id} will be dropped after shard split abort", + ); + } + }); + + // Drop shards for this tenant which we didn't just mark idle (i.e. child shards of the aborted split) + tenant_shard_persistence.retain(|tsp| { + TenantId::from_str(tsp.tenant_id.as_str()).unwrap() != tenant_id + || tsp.splitting == SplitState::Idle + }); + } + } + let mut tenants = BTreeMap::new(); let mut scheduler = Scheduler::new(nodes.values()); @@ -676,21 +805,8 @@ impl Service { } } for tsp in tenant_shard_persistence { - let tenant_shard_id = TenantShardId { - tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?, - shard_number: ShardNumber(tsp.shard_number as u8), - shard_count: ShardCount::new(tsp.shard_count as u8), - }; - let shard_identity = if tsp.shard_count == 0 { - ShardIdentity::unsharded() - } else { - ShardIdentity::new( - ShardNumber(tsp.shard_number as u8), - ShardCount::new(tsp.shard_count as u8), - ShardStripeSize(tsp.shard_stripe_size as u32), - )? - }; - + let tenant_shard_id = tsp.get_tenant_shard_id()?; + let shard_identity = tsp.get_shard_identity()?; // We will populate intent properly later in [`Self::startup_reconcile`], initially populate // it with what we can infer: the node for which a generation was most recently issued. let mut intent = IntentState::new(); @@ -728,9 +844,12 @@ impl Service { persistence, compute_hook: Arc::new(ComputeHook::new(config)), result_tx, + abort_tx, startup_complete: startup_complete.clone(), cancel: CancellationToken::new(), gate: Gate::default(), + tenant_op_locks: Default::default(), + node_op_locks: Default::default(), }); let result_task_this = this.clone(); @@ -741,6 +860,33 @@ impl Service { } }); + tokio::task::spawn({ + let this = this.clone(); + async move { + // Block shutdown until we're done (we must respect self.cancel) + if let Ok(_gate) = this.gate.enter() { + this.process_aborts(abort_rx).await + } + } + }); + + tokio::task::spawn({ + let this = this.clone(); + async move { + if let Ok(_gate) = this.gate.enter() { + loop { + tokio::select! { + _ = this.cancel.cancelled() => { + break; + }, + _ = tokio::time::sleep(Duration::from_secs(60)) => {} + }; + this.tenant_op_locks.housekeeping(); + } + } + } + }); + tokio::task::spawn({ let this = this.clone(); // We will block the [`Service::startup_complete`] barrier until [`Self::startup_reconcile`] @@ -889,6 +1035,7 @@ impl Service { tenant_state.generation.unwrap(), &tenant_state.shard, &tenant_state.config, + false, )), }, )]); @@ -918,6 +1065,118 @@ impl Service { } } + // When the availability state of a node transitions to active, we must do a full reconciliation + // of LocationConfigs on that node. This is because while a node was offline: + // - we might have proceeded through startup_reconcile without checking for extraneous LocationConfigs on this node + // - aborting a tenant shard split might have left rogue child shards behind on this node. + // + // This function must complete _before_ setting a `Node` to Active: once it is set to Active, other + // Reconcilers might communicate with the node, and these must not overlap with the work we do in + // this function. + // + // The reconciliation logic in here is very similar to what [`Self::startup_reconcile`] does, but + // for written for a single node rather than as a batch job for all nodes. + #[tracing::instrument(skip_all, fields(node_id=%node.get_id()))] + async fn node_activate_reconcile( + &self, + mut node: Node, + _lock: &OwnedRwLockWriteGuard<()>, + ) -> Result<(), ApiError> { + // This Node is a mutable local copy: we will set it active so that we can use its + // API client to reconcile with the node. The Node in [`Self::nodes`] will get updated + // later. + node.set_availability(NodeAvailability::Active); + + let configs = match node + .with_client_retries( + |client| async move { client.list_location_config().await }, + &self.config.jwt_token, + 1, + 5, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + None => { + // We're shutting down (the Node's cancellation token can't have fired, because + // we're the only scope that has a reference to it, and we didn't fire it). + return Err(ApiError::ShuttingDown); + } + Some(Err(e)) => { + // This node didn't succeed listing its locations: it may not proceed to active state + // as it is apparently unavailable. + return Err(ApiError::PreconditionFailed( + format!("Failed to query node location configs, cannot activate ({e})").into(), + )); + } + Some(Ok(configs)) => configs, + }; + tracing::info!("Loaded {} LocationConfigs", configs.tenant_shards.len()); + + let mut cleanup = Vec::new(); + { + let mut locked = self.inner.write().unwrap(); + + for (tenant_shard_id, observed_loc) in configs.tenant_shards { + let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else { + cleanup.push(tenant_shard_id); + continue; + }; + tenant_state + .observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: observed_loc }); + } + } + + for tenant_shard_id in cleanup { + tracing::info!("Detaching {tenant_shard_id}"); + match node + .with_client_retries( + |client| async move { + let config = LocationConfig { + mode: LocationConfigMode::Detached, + generation: None, + secondary_conf: None, + shard_number: tenant_shard_id.shard_number.0, + shard_count: tenant_shard_id.shard_count.literal(), + shard_stripe_size: 0, + tenant_conf: models::TenantConfig::default(), + }; + client + .location_config(tenant_shard_id, config, None, false) + .await + }, + &self.config.jwt_token, + 1, + 5, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + None => { + // We're shutting down (the Node's cancellation token can't have fired, because + // we're the only scope that has a reference to it, and we didn't fire it). + return Err(ApiError::ShuttingDown); + } + Some(Err(e)) => { + // Do not let the node proceed to Active state if it is not responsive to requests + // to detach. This could happen if e.g. a shutdown bug in the pageserver is preventing + // detach completing: we should not let this node back into the set of nodes considered + // okay for scheduling. + return Err(ApiError::Conflict(format!( + "Node {node} failed to detach {tenant_shard_id}: {e}" + ))); + } + Some(Ok(_)) => {} + }; + } + + Ok(()) + } + pub(crate) async fn re_attach( &self, reattach_req: ReAttachRequest, @@ -926,15 +1185,6 @@ impl Service { self.node_register(register_req).await?; } - // Take a re-attach as indication that the node is available: this is a precursor to proper - // heartbeating in https://github.com/neondatabase/neon/issues/6844 - self.node_configure(NodeConfigureRequest { - node_id: reattach_req.node_id, - availability: Some(NodeAvailability::Active), - scheduling: None, - }) - .await?; - // Ordering: we must persist generation number updates before making them visible in the in-memory state let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?; @@ -946,6 +1196,7 @@ impl Service { // Apply the updated generation to our in-memory state let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); let mut response = ReAttachResponse { tenants: Vec::new(), @@ -957,7 +1208,7 @@ impl Service { gen: new_gen.into().unwrap(), }); // Apply the new generation number to our in-memory state - let shard_state = locked.tenants.get_mut(&tenant_shard_id); + let shard_state = tenants.get_mut(&tenant_shard_id); let Some(shard_state) = shard_state else { // Not fatal. This edge case requires a re-attach to happen // between inserting a new tenant shard in to the database, and updating our in-memory @@ -1008,6 +1259,25 @@ impl Service { // request in flight over the network: TODO handle that by making location_conf API refuse // to go backward in generations. } + + // We consider a node Active once we have composed a re-attach response, but we + // do not call [`Self::node_activate_reconcile`]: the handling of the re-attach response + // implicitly synchronizes the LocationConfigs on the node. + // + // Setting a node active unblocks any Reconcilers that might write to the location config API, + // but those requests will not be accepted by the node until it has finished processing + // the re-attach response. + if let Some(node) = nodes.get(&reattach_req.node_id) { + if !node.is_available() { + let mut new_nodes = (**nodes).clone(); + if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) { + node.set_availability(NodeAvailability::Active); + } + let new_nodes = Arc::new(new_nodes); + *nodes = new_nodes; + } + } + Ok(response) } @@ -1048,6 +1318,12 @@ impl Service { &self, create_req: TenantCreateRequest, ) -> Result { + // Exclude any concurrent attempts to create/access the same tenant ID + let _tenant_lock = self + .tenant_op_locks + .exclusive(create_req.new_tenant_id.tenant_id) + .await; + let (response, waiters) = self.do_tenant_create(create_req).await?; self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?; @@ -1362,16 +1638,20 @@ impl Service { tenant_shard_id: TenantShardId, req: TenantLocationConfigRequest, ) -> Result { + // We require an exclusive lock, because we are updating both persistent and in-memory state + let _tenant_lock = self + .tenant_op_locks + .exclusive(tenant_shard_id.tenant_id) + .await; + if !tenant_shard_id.is_unsharded() { return Err(ApiError::BadRequest(anyhow::anyhow!( "This API is for importing single-sharded or unsharded tenants" ))); } - let tenant_id = tenant_shard_id.tenant_id; - // First check if this is a creation or an update - let create_or_update = self.tenant_location_config_prepare(tenant_id, req); + let create_or_update = self.tenant_location_config_prepare(tenant_shard_id.tenant_id, req); let mut result = TenantLocationConfigResponse { shards: Vec::new(), @@ -1477,6 +1757,9 @@ impl Service { } pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> { + // We require an exclusive lock, because we are updating persistent and in-memory state + let _tenant_lock = self.tenant_op_locks.exclusive(req.tenant_id).await; + let tenant_id = req.tenant_id; let config = req.config; @@ -1558,6 +1841,8 @@ impl Service { timestamp: Cow<'_, str>, done_if_after: Cow<'_, str>, ) -> Result<(), ApiError> { + let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + let node = { let locked = self.inner.read().unwrap(); // Just a sanity check to prevent misuse: the API expects that the tenant is fully @@ -1643,6 +1928,8 @@ impl Service { &self, tenant_id: TenantId, ) -> Result<(), ApiError> { + let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to let targets = { let locked = self.inner.read().unwrap(); @@ -1692,6 +1979,8 @@ impl Service { } pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result { + let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + self.ensure_attached_wait(tenant_id).await?; // TODO: refactor into helper @@ -1788,10 +2077,10 @@ impl Service { create_req.new_timeline_id, ); + let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; + self.ensure_attached_wait(tenant_id).await?; - // TODO: refuse to do this if shard splitting is in progress - // (https://github.com/neondatabase/neon/issues/6676) let mut targets = { let locked = self.inner.read().unwrap(); let mut targets = Vec::new(); @@ -1913,11 +2202,10 @@ impl Service { timeline_id: TimelineId, ) -> Result { tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,); + let _tenant_lock = self.tenant_op_locks.shared(tenant_id).await; self.ensure_attached_wait(tenant_id).await?; - // TODO: refuse to do this if shard splitting is in progress - // (https://github.com/neondatabase/neon/issues/6676) let mut targets = { let locked = self.inner.read().unwrap(); let mut targets = Vec::new(); @@ -2106,10 +2394,306 @@ impl Service { }) } + #[instrument(skip_all, fields(tenant_id=%op.tenant_id))] + async fn abort_tenant_shard_split( + &self, + op: &TenantShardSplitAbort, + ) -> Result<(), TenantShardSplitAbortError> { + // Cleaning up a split: + // - Parent shards are not destroyed during a split, just detached. + // - Failed pageserver split API calls can leave the remote node with just the parent attached, + // just the children attached, or both. + // + // Therefore our work to do is to: + // 1. Clean up storage controller's internal state to just refer to parents, no children + // 2. Call out to pageservers to ensure that children are detached + // 3. Call out to pageservers to ensure that parents are attached. + // + // Crash safety: + // - If the storage controller stops running during this cleanup *after* clearing the splitting state + // from our database, then [`Self::startup_reconcile`] will regard child attachments as garbage + // and detach them. + // - TODO: If the storage controller stops running during this cleanup *before* clearing the splitting state + // from our database, then we will re-enter this cleanup routine on startup. + + let TenantShardSplitAbort { + tenant_id, + new_shard_count, + new_stripe_size, + .. + } = op; + + // First abort persistent state, if any exists. + match self + .persistence + .abort_shard_split(*tenant_id, *new_shard_count) + .await? + { + AbortShardSplitStatus::Aborted => { + // Proceed to roll back any child shards created on pageservers + } + AbortShardSplitStatus::Complete => { + // The split completed (we might hit that path if e.g. our database transaction + // to write the completion landed in the database, but we dropped connection + // before seeing the result). + // + // We must update in-memory state to reflect the successful split. + self.tenant_shard_split_commit_inmem( + *tenant_id, + *new_shard_count, + *new_stripe_size, + ); + return Ok(()); + } + } + + // Clean up in-memory state, and accumulate the list of child locations that need detaching + let detach_locations: Vec<(Node, TenantShardId)> = { + let mut detach_locations = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); + + for (tenant_shard_id, shard) in + tenants.range_mut(TenantShardId::tenant_range(op.tenant_id)) + { + if shard.shard.count == op.new_shard_count { + // Surprising: the phase of [`Self::do_tenant_shard_split`] which inserts child shards in-memory + // is infallible, so if we got an error we shouldn't have got that far. + tracing::warn!( + "During split abort, child shard {tenant_shard_id} found in-memory" + ); + continue; + } + + // Add the children of this shard to this list of things to detach + if let Some(node_id) = shard.intent.get_attached() { + for child_id in tenant_shard_id.split(*new_shard_count) { + detach_locations.push(( + nodes + .get(node_id) + .expect("Intent references nonexistent node") + .clone(), + child_id, + )); + } + } else { + tracing::warn!( + "During split abort, shard {tenant_shard_id} has no attached location" + ); + } + + tracing::info!("Restoring parent shard {tenant_shard_id}"); + shard.splitting = SplitState::Idle; + self.maybe_reconcile_shard(shard, nodes); + } + + // We don't expect any new_shard_count shards to exist here, but drop them just in case + tenants.retain(|_id, s| s.shard.count != *new_shard_count); + + detach_locations + }; + + for (node, child_id) in detach_locations { + if !node.is_available() { + // An unavailable node cannot be cleaned up now: to avoid blocking forever, we will permit this, and + // rely on the reconciliation that happens when a node transitions to Active to clean up. Since we have + // removed child shards from our in-memory state and database, the reconciliation will implicitly remove + // them from the node. + tracing::warn!("Node {node} unavailable, can't clean up during split abort. It will be cleaned up when it is reactivated."); + continue; + } + + // Detach the remote child. If the pageserver split API call is still in progress, this call will get + // a 503 and retry, up to our limit. + tracing::info!("Detaching {child_id} on {node}..."); + match node + .with_client_retries( + |client| async move { + let config = LocationConfig { + mode: LocationConfigMode::Detached, + generation: None, + secondary_conf: None, + shard_number: child_id.shard_number.0, + shard_count: child_id.shard_count.literal(), + // Stripe size and tenant config don't matter when detaching + shard_stripe_size: 0, + tenant_conf: TenantConfig::default(), + }; + + client.location_config(child_id, config, None, false).await + }, + &self.config.jwt_token, + 1, + 10, + Duration::from_secs(5), + &self.cancel, + ) + .await + { + Some(Ok(_)) => {} + Some(Err(e)) => { + // We failed to communicate with the remote node. This is problematic: we may be + // leaving it with a rogue child shard. + tracing::warn!( + "Failed to detach child {child_id} from node {node} during abort" + ); + return Err(e.into()); + } + None => { + // Cancellation: we were shutdown or the node went offline. Shutdown is fine, we'll + // clean up on restart. The node going offline requires a retry. + return Err(TenantShardSplitAbortError::Unavailable); + } + }; + } + + tracing::info!("Successfully aborted split"); + Ok(()) + } + + /// Infallible final stage of [`Self::tenant_shard_split`]: update the contents + /// of the tenant map to reflect the child shards that exist after the split. + fn tenant_shard_split_commit_inmem( + &self, + tenant_id: TenantId, + new_shard_count: ShardCount, + new_stripe_size: Option, + ) -> ( + TenantShardSplitResponse, + Vec<(TenantShardId, NodeId, ShardStripeSize)>, + ) { + let mut response = TenantShardSplitResponse { + new_shards: Vec::new(), + }; + let mut child_locations = Vec::new(); + { + let mut locked = self.inner.write().unwrap(); + + let parent_ids = locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .map(|(shard_id, _)| *shard_id) + .collect::>(); + + let (_nodes, tenants, scheduler) = locked.parts_mut(); + for parent_id in parent_ids { + let child_ids = parent_id.split(new_shard_count); + + let (pageserver, generation, policy, parent_ident, config) = { + let mut old_state = tenants + .remove(&parent_id) + .expect("It was present, we just split it"); + + // A non-splitting state is impossible, because [`Self::tenant_shard_split`] holds + // a TenantId lock and passes it through to [`TenantShardSplitAbort`] in case of cleanup: + // nothing else can clear this. + assert!(matches!(old_state.splitting, SplitState::Splitting)); + + let old_attached = old_state.intent.get_attached().unwrap(); + old_state.intent.clear(scheduler); + let generation = old_state.generation.expect("Shard must have been attached"); + ( + old_attached, + generation, + old_state.policy, + old_state.shard, + old_state.config, + ) + }; + + for child in child_ids { + let mut child_shard = parent_ident; + child_shard.number = child.shard_number; + child_shard.count = child.shard_count; + if let Some(stripe_size) = new_stripe_size { + child_shard.stripe_size = stripe_size; + } + + let mut child_observed: HashMap = HashMap::new(); + child_observed.insert( + pageserver, + ObservedStateLocation { + conf: Some(attached_location_conf( + generation, + &child_shard, + &config, + matches!(policy, PlacementPolicy::Double(n) if n > 0), + )), + }, + ); + + let mut child_state = TenantState::new(child, child_shard, policy.clone()); + child_state.intent = IntentState::single(scheduler, Some(pageserver)); + child_state.observed = ObservedState { + locations: child_observed, + }; + child_state.generation = Some(generation); + child_state.config = config.clone(); + + // The child's TenantState::splitting is intentionally left at the default value of Idle, + // as at this point in the split process we have succeeded and this part is infallible: + // we will never need to do any special recovery from this state. + + child_locations.push((child, pageserver, child_shard.stripe_size)); + + if let Err(e) = child_state.schedule(scheduler) { + // This is not fatal, because we've implicitly already got an attached + // location for the child shard. Failure here just means we couldn't + // find a secondary (e.g. because cluster is overloaded). + tracing::warn!("Failed to schedule child shard {child}: {e}"); + } + + tenants.insert(child, child_state); + response.new_shards.push(child); + } + } + + (response, child_locations) + } + } + pub(crate) async fn tenant_shard_split( &self, tenant_id: TenantId, split_req: TenantShardSplitRequest, + ) -> Result { + // TODO: return 503 if we get stuck waiting for this lock + // (issue https://github.com/neondatabase/neon/issues/7108) + let _tenant_lock = self.tenant_op_locks.exclusive(tenant_id).await; + + let new_shard_count = ShardCount::new(split_req.new_shard_count); + let new_stripe_size = split_req.new_stripe_size; + + let r = self.do_tenant_shard_split(tenant_id, split_req).await; + + match r { + Ok(r) => Ok(r), + Err(ApiError::BadRequest(_)) => { + // A request validation error does not require rollback: we rejected it before we started making any changes: just + // return the error + r + } + Err(e) => { + // General case error handling: split might be part-done, we must do work to abort it. + tracing::warn!("Enqueuing background abort of split on {tenant_id}"); + self.abort_tx + .send(TenantShardSplitAbort { + tenant_id, + new_shard_count, + new_stripe_size, + _tenant_lock, + }) + // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it. + .ok(); + Err(e) + } + } + } + + pub(crate) async fn do_tenant_shard_split( + &self, + tenant_id: TenantId, + split_req: TenantShardSplitRequest, ) -> Result { let mut policy = None; let mut shard_ident = None; @@ -2121,6 +2705,10 @@ impl Service { child_ids: Vec, } + fail::fail_point!("shard-split-validation", |_| Err(ApiError::BadRequest( + anyhow::anyhow!("failpoint") + ))); + // Validate input, and calculate which shards we will create let (old_shard_count, targets) = { @@ -2230,7 +2818,9 @@ impl Service { if shard_ident.count.count() > 1 && shard_ident.stripe_size != new_stripe_size { return Err(ApiError::BadRequest(anyhow::anyhow!("Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", shard_ident.stripe_size))); } + shard_ident.stripe_size = new_stripe_size; + tracing::info!("applied stripe size {}", shard_ident.stripe_size.0); shard_ident } else { shard_ident.unwrap() @@ -2255,6 +2845,11 @@ impl Service { child_shard.number = child.shard_number; child_shard.count = child.shard_count; + tracing::info!( + "Create child shard persistence with stripe size {}", + shard_ident.stripe_size.0 + ); + this_child_tsps.push(TenantShardPersistence { tenant_id: child.tenant_id.to_string(), shard_number: child.shard_number.0 as i32, @@ -2293,6 +2888,9 @@ impl Service { _ => return Err(ApiError::InternalServerError(e.into())), } } + fail::fail_point!("shard-split-post-begin", |_| Err( + ApiError::InternalServerError(anyhow::anyhow!("failpoint")) + )); // Now that I have persisted the splitting state, apply it in-memory. This is infallible, so // callers may assume that if splitting is set in memory, then it was persisted, and if splitting @@ -2302,15 +2900,16 @@ impl Service { for target in &targets { if let Some(parent_shard) = locked.tenants.get_mut(&target.parent_id) { parent_shard.splitting = SplitState::Splitting; + // Put the observed state to None, to reflect that it is indeterminate once we start the + // split operation. + parent_shard + .observed + .locations + .insert(target.node.get_id(), ObservedStateLocation { conf: None }); } } } - // FIXME: we have now committed the shard split state to the database, so any subsequent - // failure needs to roll it back. We will later wrap this function in logic to roll back - // the split if it fails. - // (https://github.com/neondatabase/neon/issues/6676) - // TODO: issue split calls concurrently (this only matters once we're splitting // N>1 shards into M shards -- initially we're usually splitting 1 shard into N). @@ -2332,6 +2931,10 @@ impl Service { .await .map_err(|e| ApiError::Conflict(format!("Failed to split {}: {}", parent_id, e)))?; + fail::fail_point!("shard-split-post-remote", |_| Err(ApiError::Conflict( + "failpoint".to_string() + ))); + tracing::info!( "Split {} into {}", parent_id, @@ -2366,62 +2969,16 @@ impl Service { .complete_shard_split(tenant_id, old_shard_count) .await?; + fail::fail_point!("shard-split-post-complete", |_| Err( + ApiError::InternalServerError(anyhow::anyhow!("failpoint")) + )); + // Replace all the shards we just split with their children: this phase is infallible. - let mut response = TenantShardSplitResponse { - new_shards: Vec::new(), - }; - let mut child_locations = Vec::new(); - { - let mut locked = self.inner.write().unwrap(); - let (_nodes, tenants, scheduler) = locked.parts_mut(); - for target in targets { - let SplitTarget { - parent_id, - node: _node, - child_ids, - } = target; - let (pageserver, generation, config) = { - let mut old_state = tenants - .remove(&parent_id) - .expect("It was present, we just split it"); - let old_attached = old_state.intent.get_attached().unwrap(); - old_state.intent.clear(scheduler); - let generation = old_state.generation.expect("Shard must have been attached"); - (old_attached, generation, old_state.config.clone()) - }; - - for child in child_ids { - let mut child_shard = shard_ident; - child_shard.number = child.shard_number; - child_shard.count = child.shard_count; - - let mut child_observed: HashMap = HashMap::new(); - child_observed.insert( - pageserver, - ObservedStateLocation { - conf: Some(attached_location_conf(generation, &child_shard, &config)), - }, - ); - - let mut child_state = TenantState::new(child, child_shard, policy.clone()); - child_state.intent = IntentState::single(scheduler, Some(pageserver)); - child_state.observed = ObservedState { - locations: child_observed, - }; - child_state.generation = Some(generation); - child_state.config = config.clone(); - - // The child's TenantState::splitting is intentionally left at the default value of Idle, - // as at this point in the split process we have succeeded and this part is infallible: - // we will never need to do any special recovery from this state. - - child_locations.push((child, pageserver, child_shard.stripe_size)); - - tenants.insert(child, child_state); - response.new_shards.push(child); - } - } - } + let (response, child_locations) = self.tenant_shard_split_commit_inmem( + tenant_id, + ShardCount::new(split_req.new_shard_count), + split_req.new_stripe_size, + ); // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); @@ -2710,6 +3267,8 @@ impl Service { &self, register_req: NodeRegisterRequest, ) -> Result<(), ApiError> { + let _node_lock = self.node_op_locks.exclusive(register_req.node_id).await; + // Pre-check for an already-existing node { let locked = self.inner.read().unwrap(); @@ -2771,6 +3330,8 @@ impl Service { &self, config_req: NodeConfigureRequest, ) -> Result<(), ApiError> { + let _node_lock = self.node_op_locks.exclusive(config_req.node_id).await; + if let Some(scheduling) = config_req.scheduling { // Scheduling is a persistent part of Node: we must write updates to the database before // applying them in memory @@ -2779,6 +3340,37 @@ impl Service { .await?; } + // If we're activating a node, then before setting it active we must reconcile any shard locations + // on that node, in case it is out of sync, e.g. due to being unavailable during controller startup, + // by calling [`Self::node_activate_reconcile`] + // + // The transition we calculate here remains valid later in the function because we hold the op lock on the node: + // nothing else can mutate its availability while we run. + let availability_transition = if let Some(input_availability) = config_req.availability { + let (activate_node, availability_transition) = { + let locked = self.inner.read().unwrap(); + let Some(node) = locked.nodes.get(&config_req.node_id) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Node {} not registered", config_req.node_id).into(), + )); + }; + + ( + node.clone(), + node.get_availability_transition(input_availability), + ) + }; + + if matches!(availability_transition, AvailabilityTransition::ToActive) { + self.node_activate_reconcile(activate_node, &_node_lock) + .await?; + } + availability_transition + } else { + AvailabilityTransition::Unchanged + }; + + // Apply changes from the request to our in-memory state for the Node let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); @@ -2790,11 +3382,9 @@ impl Service { )); }; - let availability_transition = if let Some(availability) = &config_req.availability { - node.set_availability(*availability) - } else { - AvailabilityTransition::Unchanged - }; + if let Some(availability) = &config_req.availability { + node.set_availability(*availability); + } if let Some(scheduling) = config_req.scheduling { node.set_scheduling(scheduling); @@ -2808,6 +3398,7 @@ impl Service { let new_nodes = Arc::new(new_nodes); + // Modify scheduling state for any Tenants that are affected by a change in the node's availability state. match availability_transition { AvailabilityTransition::ToOffline => { tracing::info!("Node {} transition to offline", config_req.node_id); diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 3c91e09ac3..39e557616d 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -577,7 +577,12 @@ impl TenantState { .generation .expect("Attempted to enter attached state without a generation"); - let wanted_conf = attached_location_conf(generation, &self.shard, &self.config); + let wanted_conf = attached_location_conf( + generation, + &self.shard, + &self.config, + !self.intent.secondary.is_empty(), + ); match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index bb8b1bb7e5..fc67f4cf8f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2108,6 +2108,16 @@ where R: std::future::Future, ApiError>> + Send + 'static, H: FnOnce(Request, CancellationToken) -> R + Send + Sync + 'static, { + if request.uri() != &"/v1/failpoints".parse::().unwrap() { + fail::fail_point!("api-503", |_| Err(ApiError::ResourceUnavailable( + "failpoint".into() + ))); + + fail::fail_point!("api-500", |_| Err(ApiError::InternalServerError( + anyhow::anyhow!("failpoint") + ))); + } + // Spawn a new task to handle the request, to protect the handler from unexpected // async cancellations. Most pageserver functions are not async cancellation safe. // We arm a drop-guard, so that if Hyper drops the Future, we signal the task diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 26fcce1f38..7cf03d8fd6 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1443,6 +1443,35 @@ impl TenantManager { new_shard_count: ShardCount, new_stripe_size: Option, ctx: &RequestContext, + ) -> anyhow::Result> { + let r = self + .do_shard_split(tenant_shard_id, new_shard_count, new_stripe_size, ctx) + .await; + if r.is_err() { + // Shard splitting might have left the original shard in a partially shut down state (it + // stops the shard's remote timeline client). Reset it to ensure we leave things in + // a working state. + if self.get(tenant_shard_id).is_some() { + tracing::warn!("Resetting {tenant_shard_id} after shard split failure"); + if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await { + // Log this error because our return value will still be the original error, not this one. This is + // a severe error: if this happens, we might be leaving behind a tenant that is not fully functional + // (e.g. has uploads disabled). We can't do anything else: if reset fails then shutting the tenant down or + // setting it broken probably won't help either. + tracing::error!("Failed to reset {tenant_shard_id}: {e}"); + } + } + } + + r + } + + pub(crate) async fn do_shard_split( + &self, + tenant_shard_id: TenantShardId, + new_shard_count: ShardCount, + new_stripe_size: Option, + ctx: &RequestContext, ) -> anyhow::Result> { let tenant = get_tenant(tenant_shard_id, true)?; @@ -1477,6 +1506,10 @@ impl TenantManager { .join(",") ); + fail::fail_point!("shard-split-pre-prepare", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + let parent_shard_identity = tenant.shard_identity; let parent_tenant_conf = tenant.get_tenant_conf(); let parent_generation = tenant.generation; @@ -1490,6 +1523,10 @@ impl TenantManager { return Err(e); } + fail::fail_point!("shard-split-post-prepare", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + self.resources.deletion_queue_client.flush_advisory(); // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant @@ -1511,11 +1548,16 @@ impl TenantManager { anyhow::bail!("Detached parent shard in the middle of split!") } }; - + fail::fail_point!("shard-split-pre-hardlink", |_| Err(anyhow::anyhow!( + "failpoint" + ))); // Optimization: hardlink layers from the parent into the children, so that they don't have to // re-download & duplicate the data referenced in their initial IndexPart self.shard_split_hardlink(parent, child_shards.clone()) .await?; + fail::fail_point!("shard-split-post-hardlink", |_| Err(anyhow::anyhow!( + "failpoint" + ))); // Take a snapshot of where the parent's WAL ingest had got to: we will wait for // child shards to reach this point. @@ -1555,6 +1597,10 @@ impl TenantManager { .await?; } + fail::fail_point!("shard-split-post-child-conf", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + // Phase 4: wait for child chards WAL ingest to catch up to target LSN for child_shard_id in &child_shards { let child_shard_id = *child_shard_id; @@ -1587,6 +1633,10 @@ impl TenantManager { timeline.timeline_id, target_lsn ); + + fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!( + "failpoint" + ))); if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await { // Failure here might mean shutdown, in any case this part is an optimization // and we shouldn't hold up the split operation. @@ -1632,6 +1682,10 @@ impl TenantManager { }, ); + fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + parent_slot_guard.drop_old_value()?; // Phase 6: Release the InProgress on the parent shard diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 200c9c3740..4b0c9ac71d 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -2,6 +2,7 @@ pytest_plugins = ( "fixtures.pg_version", "fixtures.parametrize", "fixtures.httpserver", + "fixtures.compute_reconfigure", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py new file mode 100644 index 0000000000..9dd66fe636 --- /dev/null +++ b/test_runner/fixtures/compute_reconfigure.py @@ -0,0 +1,62 @@ +import concurrent.futures +from typing import Any + +import pytest +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + +from fixtures.log_helper import log +from fixtures.types import TenantId + + +class ComputeReconfigure: + def __init__(self, server): + self.server = server + self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach" + self.workloads = {} + + def register_workload(self, workload): + self.workloads[workload.tenant_id] = workload + + +@pytest.fixture(scope="function") +def compute_reconfigure_listener(make_httpserver): + """ + This fixture exposes an HTTP listener for the storage controller to submit + compute notifications to us, instead of updating neon_local endpoints itself. + + Although storage controller can use neon_local directly, this causes problems when + the test is also concurrently modifying endpoints. Instead, configure storage controller + to send notifications up to this test code, which will route all endpoint updates + through Workload, which has a mutex to make concurrent updates safe. + """ + server = make_httpserver + + self = ComputeReconfigure(server) + + # Do neon_local endpoint reconfiguration in the background so that we can + # accept a healthy rate of calls into notify-attach. + reconfigure_threads = concurrent.futures.ThreadPoolExecutor(max_workers=1) + + def handler(request: Request): + assert request.json is not None + body: dict[str, Any] = request.json + log.info(f"notify-attach request: {body}") + + try: + workload = self.workloads[TenantId(body["tenant_id"])] + except KeyError: + pass + else: + # This causes the endpoint to query storage controller for its location, which + # is redundant since we already have it here, but this avoids extending the + # neon_local CLI to take full lists of locations + reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[no-any-return] + + return Response(status=200) + + self.server.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) + + yield self + reconfigure_threads.shutdown() + server.clear() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5b76e808d5..16ebc19698 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2177,6 +2177,23 @@ class NeonStorageController(MetricsGetter): ) log.info("storage controller passed consistency check") + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.request( + "PUT", + f"{self.env.storage_controller_api}/debug/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + headers=self.headers(TokenScope.ADMIN), + ) + log.info(f"Got failpoints request response code {res.status_code}") + res.raise_for_status() + def __enter__(self) -> "NeonStorageController": return self diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index 1d5394dc1d..e852281fcf 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -1,3 +1,4 @@ +import threading from typing import Optional from fixtures.log_helper import log @@ -11,6 +12,10 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.types import TenantId, TimelineId +# neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex +# to ensure we don't do that: this enables running lots of Workloads in parallel safely. +ENDPOINT_LOCK = threading.Lock() + class Workload: """ @@ -41,17 +46,30 @@ class Workload: self._endpoint: Optional[Endpoint] = None + def reconfigure(self): + """ + Request the endpoint to reconfigure based on location reported by storage controller + """ + if self._endpoint is not None: + with ENDPOINT_LOCK: + self._endpoint.reconfigure() + def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint: - if self._endpoint is None: - self._endpoint = self.env.endpoints.create( - self.branch_name, - tenant_id=self.tenant_id, - pageserver_id=pageserver_id, - endpoint_id="ep-workload", - ) - self._endpoint.start(pageserver_id=pageserver_id) - else: - self._endpoint.reconfigure(pageserver_id=pageserver_id) + # We may be running alongside other Workloads for different tenants. Full TTID is + # obnoxiously long for use here, but a cut-down version is still unique enough for tests. + endpoint_id = f"ep-workload-{str(self.tenant_id)[0:4]}-{str(self.timeline_id)[0:4]}" + + with ENDPOINT_LOCK: + if self._endpoint is None: + self._endpoint = self.env.endpoints.create( + self.branch_name, + tenant_id=self.tenant_id, + pageserver_id=pageserver_id, + endpoint_id=endpoint_id, + ) + self._endpoint.start(pageserver_id=pageserver_id) + else: + self._endpoint.reconfigure(pageserver_id=pageserver_id) connstring = self._endpoint.safe_psql( "SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'" @@ -94,7 +112,7 @@ class Workload: else: return False - def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True): + def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True, ingest=True): assert self.expect_rows >= n max_iters = 10 @@ -132,22 +150,28 @@ class Workload: ] ) - for tenant_shard_id, pageserver in tenant_get_shards( - self.env, self.tenant_id, pageserver_id - ): - last_flush_lsn = wait_for_last_flush_lsn( - self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id - ) - ps_http = pageserver.http_client() - wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) + if ingest: + # Wait for written data to be ingested by the pageserver + for tenant_shard_id, pageserver in tenant_get_shards( + self.env, self.tenant_id, pageserver_id + ): + last_flush_lsn = wait_for_last_flush_lsn( + self.env, + endpoint, + self.tenant_id, + self.timeline_id, + pageserver_id=pageserver_id, + ) + ps_http = pageserver.http_client() + wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) - if upload: - # force a checkpoint to trigger upload - ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id) - wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) - log.info(f"Churn: waiting for remote LSN {last_flush_lsn}") - else: - log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") + if upload: + # Wait for written data to be uploaded to S3 (force a checkpoint to trigger upload) + ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id) + wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) + log.info(f"Churn: waiting for remote LSN {last_flush_lsn}") + else: + log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") def validate(self, pageserver_id: Optional[int] = None): endpoint = self.endpoint(pageserver_id) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 9309af066b..bdb9990a51 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1,10 +1,14 @@ import os -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union import pytest +import requests +from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( + NeonEnv, NeonEnvBuilder, + StorageControllerApiException, tenant_get_shards, ) from fixtures.remote_storage import s3_storage @@ -495,3 +499,337 @@ def test_sharding_ingest( # Each shard may emit up to one huge layer, because initdb ingest doesn't respect checkpoint_distance. assert huge_layer_count <= shard_count + + +class Failure: + pageserver_id: Optional[int] + + def apply(self, env: NeonEnv): + raise NotImplementedError() + + def clear(self, env: NeonEnv): + """ + Clear the failure, in a way that should enable the system to proceed + to a totally clean state (all nodes online and reconciled) + """ + raise NotImplementedError() + + def expect_available(self): + raise NotImplementedError() + + def can_mitigate(self): + """Whether Self.mitigate is available for use""" + return False + + def mitigate(self, env: NeonEnv): + """ + Mitigate the failure in a way that should allow shard split to + complete and service to resume, but does not guarantee to leave + the whole world in a clean state (e.g. an Offline node might have + junk LocationConfigs on it) + """ + raise NotImplementedError() + + def fails_forward(self, env: NeonEnv): + """ + If true, this failure results in a state that eventualy completes the split. + """ + return False + + def expect_exception(self): + """ + How do we expect a call to the split API to fail? + """ + return StorageControllerApiException + + +class PageserverFailpoint(Failure): + def __init__(self, failpoint, pageserver_id, mitigate): + self.failpoint = failpoint + self.pageserver_id = pageserver_id + self._mitigate = mitigate + + def apply(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.allowed_errors.extend( + [".*failpoint.*", ".*Resetting.*after shard split failure.*"] + ) + pageserver.http_client().configure_failpoints((self.failpoint, "return(1)")) + + def clear(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.http_client().configure_failpoints((self.failpoint, "off")) + if self._mitigate: + env.storage_controller.node_configure(self.pageserver_id, {"availability": "Active"}) + + def expect_available(self): + return True + + def can_mitigate(self): + return self._mitigate + + def mitigate(self, env): + env.storage_controller.node_configure(self.pageserver_id, {"availability": "Offline"}) + + +class StorageControllerFailpoint(Failure): + def __init__(self, failpoint, action): + self.failpoint = failpoint + self.pageserver_id = None + self.action = action + + def apply(self, env: NeonEnv): + env.storage_controller.configure_failpoints((self.failpoint, self.action)) + + def clear(self, env: NeonEnv): + if "panic" in self.action: + log.info("Restarting storage controller after panic") + env.storage_controller.stop() + env.storage_controller.start() + else: + env.storage_controller.configure_failpoints((self.failpoint, "off")) + + def expect_available(self): + # Controller panics _do_ leave pageservers available, but our test code relies + # on using the locate API to update configurations in Workload, so we must skip + # these actions when the controller has been panicked. + return "panic" not in self.action + + def can_mitigate(self): + return False + + def fails_forward(self, env): + # Edge case: the very last failpoint that simulates a DB connection error, where + # the abort path will fail-forward and result in a complete split. + fail_forward = self.failpoint == "shard-split-post-complete" + + # If the failure was a panic, then if we expect split to eventually (after restart) + # complete, we must restart before checking that. + if fail_forward and "panic" in self.action: + log.info("Restarting storage controller after panic") + env.storage_controller.stop() + env.storage_controller.start() + + return fail_forward + + def expect_exception(self): + if "panic" in self.action: + return requests.exceptions.ConnectionError + else: + return StorageControllerApiException + + +class NodeKill(Failure): + def __init__(self, pageserver_id, mitigate): + self.pageserver_id = pageserver_id + self._mitigate = mitigate + + def apply(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.stop(immediate=True) + + def clear(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.start() + + def expect_available(self): + return False + + def mitigate(self, env): + env.storage_controller.node_configure(self.pageserver_id, {"availability": "Offline"}) + + +class CompositeFailure(Failure): + """ + Wrapper for failures in multiple components (e.g. a failpoint in the storage controller, *and* + stop a pageserver to interfere with rollback) + """ + + def __init__(self, failures: list[Failure]): + self.failures = failures + + self.pageserver_id = None + for f in failures: + if f.pageserver_id is not None: + self.pageserver_id = f.pageserver_id + break + + def apply(self, env: NeonEnv): + for f in self.failures: + f.apply(env) + + def clear(self, env): + for f in self.failures: + f.clear(env) + + def expect_available(self): + return all(f.expect_available() for f in self.failures) + + def mitigate(self, env): + for f in self.failures: + f.mitigate(env) + + def expect_exception(self): + expect = set(f.expect_exception() for f in self.failures) + + # We can't give a sensible response if our failures have different expectations + assert len(expect) == 1 + + return list(expect)[0] + + +@pytest.mark.parametrize( + "failure", + [ + PageserverFailpoint("api-500", 1, False), + NodeKill(1, False), + PageserverFailpoint("api-500", 1, True), + NodeKill(1, True), + PageserverFailpoint("shard-split-pre-prepare", 1, False), + PageserverFailpoint("shard-split-post-prepare", 1, False), + PageserverFailpoint("shard-split-pre-hardlink", 1, False), + PageserverFailpoint("shard-split-post-hardlink", 1, False), + PageserverFailpoint("shard-split-post-child-conf", 1, False), + PageserverFailpoint("shard-split-lsn-wait", 1, False), + PageserverFailpoint("shard-split-pre-finish", 1, False), + StorageControllerFailpoint("shard-split-validation", "return(1)"), + StorageControllerFailpoint("shard-split-post-begin", "return(1)"), + StorageControllerFailpoint("shard-split-post-remote", "return(1)"), + StorageControllerFailpoint("shard-split-post-complete", "return(1)"), + StorageControllerFailpoint("shard-split-validation", "panic(failpoint)"), + StorageControllerFailpoint("shard-split-post-begin", "panic(failpoint)"), + StorageControllerFailpoint("shard-split-post-remote", "panic(failpoint)"), + StorageControllerFailpoint("shard-split-post-complete", "panic(failpoint)"), + CompositeFailure( + [NodeKill(1, True), StorageControllerFailpoint("shard-split-post-begin", "return(1)")] + ), + CompositeFailure( + [NodeKill(1, False), StorageControllerFailpoint("shard-split-post-begin", "return(1)")] + ), + ], +) +def test_sharding_split_failures( + neon_env_builder: NeonEnvBuilder, + compute_reconfigure_listener: ComputeReconfigure, + failure: Failure, +): + neon_env_builder.num_pageservers = 4 + neon_env_builder.control_plane_compute_hook_api = ( + compute_reconfigure_listener.control_plane_compute_hook_api + ) + initial_shard_count = 2 + split_shard_count = 4 + + env = neon_env_builder.init_start(initial_tenant_shard_count=initial_shard_count) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + for ps in env.pageservers: + # When we do node failures and abandon a shard, it will de-facto have old generation and + # thereby be unable to publish remote consistent LSN updates + ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") + + # Make sure the node we're failing has a shard on it, otherwise the test isn't testing anything + assert ( + failure.pageserver_id is None + or len( + env.get_pageserver(failure.pageserver_id) + .http_client() + .tenant_list_locations()["tenant_shards"] + ) + > 0 + ) + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100) + + # Put the environment into a failing state (exact meaning depends on `failure`) + failure.apply(env) + + with pytest.raises(failure.expect_exception()): + env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + + # We expect that the overall operation will fail, but some split requests + # will have succeeded: the net result should be to return to a clean state, including + # detaching any child shards. + def assert_rolled_back(exclude_ps_id=None) -> None: + count = 0 + for ps in env.pageservers: + if exclude_ps_id is not None and ps.id == exclude_ps_id: + continue + + locations = ps.http_client().tenant_list_locations()["tenant_shards"] + for loc in locations: + tenant_shard_id = TenantShardId.parse(loc[0]) + log.info(f"Shard {tenant_shard_id} seen on node {ps.id}") + assert tenant_shard_id.shard_count == initial_shard_count + count += 1 + assert count == initial_shard_count + + def assert_split_done(exclude_ps_id=None) -> None: + count = 0 + for ps in env.pageservers: + if exclude_ps_id is not None and ps.id == exclude_ps_id: + continue + + locations = ps.http_client().tenant_list_locations()["tenant_shards"] + for loc in locations: + tenant_shard_id = TenantShardId.parse(loc[0]) + log.info(f"Shard {tenant_shard_id} seen on node {ps.id}") + assert tenant_shard_id.shard_count == split_shard_count + count += 1 + assert count == split_shard_count + + def finish_split(): + # Having failed+rolled back, we should be able to split again + # No failures this time; it will succeed + env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count) + + workload.churn_rows(10) + workload.validate() + + if failure.expect_available(): + # Even though the split failed partway through, this should not have interrupted + # clients. Disable waiting for pageservers in the workload helper, because our + # failpoints may prevent API access. + # This only applies for failure modes that leave pageserver page_service API available. + workload.churn_rows(10, upload=False, ingest=False) + workload.validate() + + if failure.fails_forward(env): + log.info("Fail-forward failure, checking split eventually completes...") + # A failure type which results in eventual completion of the split + wait_until(30, 1, assert_split_done) + elif failure.can_mitigate(): + log.info("Mitigating failure...") + # Mitigation phase: we expect to be able to proceed with a successful shard split + failure.mitigate(env) + + # The split should appear to be rolled back from the point of view of all pageservers + # apart from the one that is offline + wait_until(30, 1, lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id)) + + finish_split() + wait_until(30, 1, lambda: assert_split_done(exclude_ps_id=failure.pageserver_id)) + + # Having cleared the failure, everything should converge to a pristine state + failure.clear(env) + wait_until(30, 1, assert_split_done) + else: + # Once we restore the faulty pageserver's API to good health, rollback should + # eventually complete. + log.info("Clearing failure...") + failure.clear(env) + + wait_until(30, 1, assert_rolled_back) + + # Having rolled back, the tenant should be working + workload.churn_rows(10) + workload.validate() + + # Splitting again should work, since we cleared the failure + finish_split() + assert_split_done() + + env.storage_controller.consistency_check()