diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 1460929e3a..fe62a7299a 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -211,6 +211,10 @@ pub struct PageServerConf {
     /// JWT token for use with the control plane API.
     pub control_plane_api_token: Option,
+
+    /// If true, pageserver will make best-effort to operate without a control plane: only
+    /// for use in major incidents.
+    pub control_plane_emergency_mode: bool,
 }
 
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -288,6 +292,7 @@ struct PageServerConfigBuilder {
     control_plane_api: BuilderValue>,
     control_plane_api_token: BuilderValue>,
+    control_plane_emergency_mode: BuilderValue,
 }
 
 impl Default for PageServerConfigBuilder {
@@ -355,6 +360,7 @@ impl Default for PageServerConfigBuilder {
             control_plane_api: Set(None),
             control_plane_api_token: Set(None),
+            control_plane_emergency_mode: Set(false),
         }
     }
 }
@@ -491,6 +497,10 @@ impl PageServerConfigBuilder {
         self.control_plane_api_token = BuilderValue::Set(token)
     }
 
+    pub fn control_plane_emergency_mode(&mut self, enabled: bool) {
+        self.control_plane_emergency_mode = BuilderValue::Set(enabled)
+    }
+
     pub fn build(self) -> anyhow::Result {
         let concurrent_tenant_size_logical_size_queries = self
             .concurrent_tenant_size_logical_size_queries
@@ -582,6 +592,9 @@ impl PageServerConfigBuilder {
             control_plane_api_token: self
                 .control_plane_api_token
                 .ok_or(anyhow!("missing control_plane_api_token"))?,
+            control_plane_emergency_mode: self
+                .control_plane_emergency_mode
+                .ok_or(anyhow!("missing control_plane_emergency_mode"))?,
         })
     }
 }
@@ -807,6 +820,10 @@ impl PageServerConf {
                     builder.control_plane_api_token(Some(parsed.into()))
                 }
             },
+            "control_plane_emergency_mode" => {
+                builder.control_plane_emergency_mode(parse_toml_bool(key, item)?)
+            },
             _ => bail!("unrecognized pageserver option '{key}'"),
         }
     }
@@ -976,6 +993,7 @@ impl PageServerConf {
             background_task_maximum_delay: Duration::ZERO,
             control_plane_api: None,
             control_plane_api_token: None,
+            control_plane_emergency_mode: false,
         }
     }
 }
@@ -1199,7 +1217,8 @@ background_task_maximum_delay = '334 s'
                     defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
                 )?,
                 control_plane_api: None,
-                control_plane_api_token: None
+                control_plane_api_token: None,
+                control_plane_emergency_mode: false
             },
             "Correct defaults should be used when no config values are provided"
         );
@@ -1255,7 +1274,8 @@ background_task_maximum_delay = '334 s'
                 ondemand_download_behavior_treat_error_as_warn: false,
                 background_task_maximum_delay: Duration::from_secs(334),
                 control_plane_api: None,
-                control_plane_api_token: None
+                control_plane_api_token: None,
+                control_plane_emergency_mode: false
             },
             "Should be able to parse all basic config values correctly"
        );
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index e49ed34215..ae66236499 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -40,7 +40,6 @@ use validator::ValidatorQueueMessage;
 
 use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};
 
-// TODO: adminstrative "panic button" config property to disable all deletions
 // TODO: configurable for how long to wait before executing deletions
 
 /// We aggregate object deletions from many tenants in one place, for several reasons:
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 7fbcb5041a..a92fbccdea 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -151,61 +151,86 @@ async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result> = Lazy::new(|| RwLock::new(TenantsMap::Initializing));
 
-/// Initialize repositories with locally available timelines.
-/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
-/// are scheduled for download and added to the tenant once download is completed.
-#[instrument(skip_all)]
-pub async fn init_tenant_mgr(
+fn emergency_generations(
+    tenant_confs: &HashMap>,
+) -> HashMap {
+    tenant_confs
+        .iter()
+        .filter_map(|(tid, lc)| {
+            let lc = match lc {
+                Ok(lc) => lc,
+                Err(_) => return None,
+            };
+            let gen = match &lc.mode {
+                LocationMode::Attached(alc) => Some(alc.generation),
+                LocationMode::Secondary(_) => None,
+            };
+
+            gen.map(|g| (*tid, g))
+        })
+        .collect()
+}
+
+async fn init_load_generations(
     conf: &'static PageServerConf,
-    resources: TenantSharedResources,
-    init_order: InitializationOrder,
-    cancel: CancellationToken,
-) -> anyhow::Result<()> {
-    // Scan local filesystem for attached tenants
-    let tenants_dir = conf.tenants_path();
-
-    let mut tenants = HashMap::new();
-
-    // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
-    let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) {
-        let result = match client.re_attach().await {
+    tenant_confs: &HashMap>,
+    resources: &TenantSharedResources,
+    cancel: &CancellationToken,
+) -> anyhow::Result>> {
+    let generations = if conf.control_plane_emergency_mode {
+        error!(
+            "Emergency mode! Tenants will be attached unsafely using their last known generation"
+        );
+        emergency_generations(tenant_confs)
+    } else if let Some(client) = ControlPlaneClient::new(conf, cancel) {
+        info!("Calling control plane API to re-attach tenants");
+        // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
+        match client.re_attach().await {
             Ok(tenants) => tenants,
             Err(RetryForeverError::ShuttingDown) => {
                 anyhow::bail!("Shut down while waiting for control plane re-attach response")
             }
-        };
-
-        // The deletion queue needs to know about the startup attachment state to decide which (if any) stored
-        // deletion list entries may still be valid. We provide that by pushing a recovery operation into
-        // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
-        // are processed, even though we don't block on recovery completing here.
-        //
-        // Must only do this if remote storage is enabled, otherwise deletion queue
-        // is not running and channel push will fail.
-        if resources.remote_storage.is_some() {
-            resources
-                .deletion_queue_client
-                .recover(result.clone())
-                .await?;
         }
-
-        Some(result)
     } else {
         info!("Control plane API not configured, tenant generations are disabled");
-        None
+        return Ok(None);
     };
 
+    // The deletion queue needs to know about the startup attachment state to decide which (if any) stored
+    // deletion list entries may still be valid. We provide that by pushing a recovery operation into
+    // the queue. Sequential processing of the queue ensures that recovery is done before any new tenant deletions
+    // are processed, even though we don't block on recovery completing here.
+    //
+    // Must only do this if remote storage is enabled, otherwise deletion queue
+    // is not running and channel push will fail.
+    if resources.remote_storage.is_some() {
+        resources
+            .deletion_queue_client
+            .recover(generations.clone())
+            .await?;
+    }
+
+    Ok(Some(generations))
+}
+
+/// Initial stage of load: walk the local tenants directory, clean up any temp files,
+/// and load configurations for the tenants we found.
+async fn init_load_tenant_configs(
+    conf: &'static PageServerConf,
+) -> anyhow::Result>> {
+    let tenants_dir = conf.tenants_path();
+
     let mut dir_entries = tenants_dir
         .read_dir_utf8()
         .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
 
-    let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);
+    let mut configs = HashMap::new();
 
     loop {
         match dir_entries.next() {
             None => break,
-            Some(Ok(dir_entry)) => {
-                let tenant_dir_path = dir_entry.path().to_path_buf();
+            Some(Ok(dentry)) => {
+                let tenant_dir_path = dentry.path().to_path_buf();
                 if crate::is_temporary(&tenant_dir_path) {
                     info!("Found temporary tenant directory, removing: {tenant_dir_path}");
                     // No need to use safe_remove_tenant_dir_all because this is already
@@ -216,141 +241,158 @@ pub async fn init_tenant_mgr(
                             tenant_dir_path, e
                         );
                     }
-                } else {
-                    // This case happens if we:
-                    // * crash during attach before creating the attach marker file
-                    // * crash during tenant delete before removing tenant directory
-                    let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
-                        format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
-                    })?;
-                    if is_empty {
-                        info!("removing empty tenant directory {tenant_dir_path:?}");
-                        if let Err(e) = fs::remove_dir(&tenant_dir_path).await {
-                            error!(
-                                "Failed to remove empty tenant directory '{}': {e:#}",
-                                tenant_dir_path
-                            )
-                        }
-                        continue;
-                    }
-
-                    let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
-                    if tenant_ignore_mark_file.exists() {
-                        info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
-                        continue;
-                    }
-
-                    let tenant_id = match tenant_dir_path
-                        .file_name()
-                        .unwrap_or_default()
-                        .parse::()
-                    {
-                        Ok(id) => id,
-                        Err(_) => {
-                            warn!(
-                                "Invalid tenant path (garbage in our repo directory?): {}",
-                                tenant_dir_path
-                            );
-                            continue;
-                        }
-                    };
-
-                    // Try loading the location configuration
-                    let mut location_conf = match Tenant::load_tenant_config(conf, &tenant_id)
-                        .context("load tenant config")
-                    {
-                        Ok(c) => c,
-                        Err(e) => {
-                            warn!("Marking tenant broken, failed to {e:#}");
-
-                            tenants.insert(
-                                tenant_id,
-                                TenantSlot::Attached(Tenant::create_broken_tenant(
-                                    conf,
-                                    tenant_id,
-                                    "error loading tenant location configuration".to_string(),
-                                )),
-                            );
-
-                            continue;
-                        }
-                    };
-
-                    let generation = if let Some(generations) = &tenant_generations {
-                        // We have a generation map: treat it as the authority for whether
-                        // this tenant is really attached.
-                        if let Some(gen) = generations.get(&tenant_id) {
-                            *gen
-                        } else {
-                            match &location_conf.mode {
-                                LocationMode::Secondary(_) => {
-                                    // We do not require the control plane's permission for secondary mode
-                                    // tenants, because they do no remote writes and hence require no
-                                    // generation number
-                                    info!("Loaded tenant {tenant_id} in secondary mode");
-                                    tenants.insert(tenant_id, TenantSlot::Secondary);
-                                }
-                                LocationMode::Attached(_) => {
-                                    // TODO: augment re-attach API to enable the control plane to
-                                    // instruct us about secondary attachments. That way, instead of throwing
-                                    // away local state, we can gracefully fall back to secondary here, if the control
-                                    // plane tells us so.
-                                    // (https://github.com/neondatabase/neon/issues/5377)
-                                    info!("Detaching tenant {tenant_id}, control plane omitted it in re-attach response");
-                                    if let Err(e) =
-                                        safe_remove_tenant_dir_all(&tenant_dir_path).await
-                                    {
-                                        error!(
-                                            "Failed to remove detached tenant directory '{}': {:?}",
-                                            tenant_dir_path, e
-                                        );
-                                    }
-                                }
-                            };
-
-                            continue;
-                        }
-                    } else {
-                        // Legacy mode: no generation information, any tenant present
-                        // on local disk may activate
-                        info!(
-                            "Starting tenant {} in legacy mode, no generation",
-                            tenant_dir_path
-                        );
-                        Generation::none()
-                    };
-
-                    // Presence of a generation number implies attachment: attach the tenant
-                    // if it wasn't already, and apply the generation number.
-                    location_conf.attach_in_generation(generation);
-                    Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
-
-                    match schedule_local_tenant_processing(
-                        conf,
-                        tenant_id,
-                        &tenant_dir_path,
-                        AttachedTenantConf::try_from(location_conf)?,
-                        resources.clone(),
-                        Some(init_order.clone()),
-                        &TENANTS,
-                        &ctx,
-                    ) {
-                        Ok(tenant) => {
-                            tenants.insert(tenant.tenant_id(), TenantSlot::Attached(tenant));
-                        }
-                        Err(e) => {
-                            error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}");
-                        }
-                    }
+                    continue;
                 }
+
+                // This case happens if we:
+                // * crash during attach before creating the attach marker file
+                // * crash during tenant delete before removing tenant directory
+                let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
+                    format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
+                })?;
+                if is_empty {
+                    info!("removing empty tenant directory {tenant_dir_path:?}");
+                    if let Err(e) = fs::remove_dir(&tenant_dir_path).await {
+                        error!(
+                            "Failed to remove empty tenant directory '{}': {e:#}",
+                            tenant_dir_path
+                        )
+                    }
+                    continue;
+                }
+
+                let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
+                if tenant_ignore_mark_file.exists() {
+                    info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
+                    continue;
+                }
+
+                let tenant_id = match tenant_dir_path
+                    .file_name()
+                    .unwrap_or_default()
+                    .parse::()
+                {
+                    Ok(id) => id,
+                    Err(_) => {
+                        warn!(
+                            "Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",
+                        );
+                        continue;
+                    }
+                };
+
+                configs.insert(tenant_id, Tenant::load_tenant_config(conf, &tenant_id));
             }
             Some(Err(e)) => {
-                // On error, print it, but continue with the other tenants. If we error out
-                // here, the pageserver startup fails altogether, causing outage for *all*
-                // tenants. That seems worse.
-                error!(
-                    "Failed to list tenants dir entry in directory {tenants_dir:?}, reason: {e:?}"
+                // An error listing the top level directory indicates serious problem
+                // with local filesystem: we will fail to load, and fail to start.
+                anyhow::bail!(e);
+            }
+        }
+    }
+    Ok(configs)
+}
+
+/// Initialize repositories with locally available timelines.
+/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
+/// are scheduled for download and added to the tenant once download is completed.
+#[instrument(skip_all)]
+pub async fn init_tenant_mgr(
+    conf: &'static PageServerConf,
+    resources: TenantSharedResources,
+    init_order: InitializationOrder,
+    cancel: CancellationToken,
+) -> anyhow::Result<()> {
+    let mut tenants = HashMap::new();
+
+    let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);
+
+    // Scan local filesystem for attached tenants
+    let tenant_configs = init_load_tenant_configs(conf).await?;
+
+    // Determine which tenants are to be attached
+    let tenant_generations =
+        init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
+
+    // Construct `Tenant` objects and start them running
+    for (tenant_id, location_conf) in tenant_configs {
+        let tenant_dir_path = conf.tenant_path(&tenant_id);
+
+        let mut location_conf = match location_conf {
+            Ok(l) => l,
+            Err(e) => {
+                warn!(%tenant_id, "Marking tenant broken, failed to {e:#}");
+
+                tenants.insert(
+                    tenant_id,
+                    TenantSlot::Attached(Tenant::create_broken_tenant(
+                        conf,
+                        tenant_id,
+                        format!("{}", e),
+                    )),
                 );
+                continue;
+            }
+        };
+
+        let generation = if let Some(generations) = &tenant_generations {
+            // We have a generation map: treat it as the authority for whether
+            // this tenant is really attached.
+            if let Some(gen) = generations.get(&tenant_id) {
+                *gen
+            } else {
+                match &location_conf.mode {
+                    LocationMode::Secondary(_) => {
+                        // We do not require the control plane's permission for secondary mode
+                        // tenants, because they do no remote writes and hence require no
+                        // generation number
+                        info!(%tenant_id, "Loaded tenant in secondary mode");
+                        tenants.insert(tenant_id, TenantSlot::Secondary);
+                    }
+                    LocationMode::Attached(_) => {
+                        // TODO: augment re-attach API to enable the control plane to
+                        // instruct us about secondary attachments. That way, instead of throwing
+                        // away local state, we can gracefully fall back to secondary here, if the control
+                        // plane tells us so.
+                        // (https://github.com/neondatabase/neon/issues/5377)
+                        info!(%tenant_id, "Detaching tenant, control plane omitted it in re-attach response");
+                        if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
+                            error!(%tenant_id,
+                                "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
+                            );
+                        }
+                    }
+                };
+
+                continue;
+            }
+        } else {
+            // Legacy mode: no generation information, any tenant present
+            // on local disk may activate
+            info!(%tenant_id, "Starting tenant in legacy mode, no generation",);
+            Generation::none()
+        };
+
+        // Presence of a generation number implies attachment: attach the tenant
+        // if it wasn't already, and apply the generation number.
+        location_conf.attach_in_generation(generation);
+        Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
+
+        match schedule_local_tenant_processing(
+            conf,
+            tenant_id,
+            &tenant_dir_path,
+            AttachedTenantConf::try_from(location_conf)?,
+            resources.clone(),
+            Some(init_order.clone()),
+            &TENANTS,
+            &ctx,
+        ) {
+            Ok(tenant) => {
+                tenants.insert(tenant.tenant_id(), TenantSlot::Attached(tenant));
+            }
+            Err(e) => {
+                error!(%tenant_id, "Failed to start tenant: {e:#}");
             }
         }
     }
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index abc2c79ac9..64c471fb92 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -81,7 +81,7 @@ def generate_uploads_and_deletions(
             f"""
             INSERT INTO foo (id, val)
             SELECT g, '{data}'
-            FROM generate_series(1, 20000) g
+            FROM generate_series(1, 200) g
             ON CONFLICT (id) DO UPDATE
             SET val = EXCLUDED.val
             """,
@@ -378,3 +378,73 @@ def test_deletion_queue_recovery(
 
     assert get_deletion_queue_unexpected_errors(ps_http) == 0
     assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
+
+
+def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    neon_env_builder.enable_generations = True
+    neon_env_builder.enable_pageserver_remote_storage(
+        RemoteStorageKind.MOCK_S3,
+    )
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+
+    ps_http = env.pageserver.http_client()
+
+    generate_uploads_and_deletions(env)
+
+    env.pageserver.allowed_errors.extend(
+        [
+            # When the pageserver can't reach the control plane, it will complain
+            ".*calling control plane generation validation API failed.*",
+            # Emergency mode is a big deal, we log errors whenever it is used.
+            ".*Emergency mode!.*",
+        ]
+    )
+
+    # Simulate a major incident: the control plane goes offline
+    assert env.attachment_service is not None
+    env.attachment_service.stop()
+
+    # Remember how many validations had happened before the control plane went offline
+    validated = get_deletion_queue_validated(ps_http)
+
+    generate_uploads_and_deletions(env, init=False)
+
+    # The running pageserver should stop progressing deletions
+    time.sleep(10)
+    assert get_deletion_queue_validated(ps_http) == validated
+
+    # Restart the pageserver: ordinarily we would _avoid_ doing this during such an
+    # incident, but it might be unavoidable: if so, we want to be able to start up
+    # and serve clients.
+    env.pageserver.stop()  # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
+    env.pageserver.start(
+        overrides=("--pageserver-config-override=control_plane_emergency_mode=true",)
+    )
+
+    # The pageserver should provide service to clients
+    generate_uploads_and_deletions(env, init=False)
+
+    # The pageserver should neither validate nor execute any deletions, it should have
+    # loaded the DeletionLists from before though
+    time.sleep(10)
+    assert get_deletion_queue_depth(ps_http) > 0
+    assert get_deletion_queue_validated(ps_http) == 0
+    assert get_deletion_queue_executed(ps_http) == 0
+
+    # When the control plane comes back up, normal service should resume
+    env.attachment_service.start()
+
+    ps_http.deletion_queue_flush(execute=True)
+    assert get_deletion_queue_depth(ps_http) == 0
+    assert get_deletion_queue_validated(ps_http) > 0
+    assert get_deletion_queue_executed(ps_http) > 0
+
+    # The pageserver should work fine when subsequently restarted in non-emergency mode
+    env.pageserver.stop()  # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
+    env.pageserver.start()
+
+    generate_uploads_and_deletions(env, init=False)
+    ps_http.deletion_queue_flush(execute=True)
+    assert get_deletion_queue_depth(ps_http) == 0
+    assert get_deletion_queue_validated(ps_http) > 0
+    assert get_deletion_queue_executed(ps_http) > 0
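
The heart of the change is the startup decision between asking the control plane for re-attach generations and, in emergency mode, reusing whatever generation each attached tenant last recorded in its local location configuration. The sketch below distils that fallback into a self-contained program; `TenantId`, `Generation`, and `LocationMode` here are simplified, hypothetical stand-ins for the pageserver's real types, and unlike the patch's `emergency_generations` it does not model per-tenant config load failures (the `anyhow::Result` values in the diff).

```rust
use std::collections::HashMap;

// Simplified, hypothetical stand-ins for the pageserver's real types.
type TenantId = u128;
type Generation = u32;

enum LocationMode {
    Attached { generation: Generation },
    Secondary,
}

/// Emergency path: trust the last generation each attached tenant recorded locally,
/// instead of asking the control plane. Secondary locations carry no generation.
fn emergency_generations(
    local_confs: &HashMap<TenantId, LocationMode>,
) -> HashMap<TenantId, Generation> {
    local_confs
        .iter()
        .filter_map(|(tenant, mode)| match mode {
            LocationMode::Attached { generation } => Some((*tenant, *generation)),
            LocationMode::Secondary => None,
        })
        .collect()
}

fn main() {
    let mut local = HashMap::new();
    local.insert(1, LocationMode::Attached { generation: 7 });
    local.insert(2, LocationMode::Secondary);

    // In emergency mode tenant 1 is attached with generation 7 without any
    // control plane round trip; tenant 2 stays secondary and is skipped.
    let gens = emergency_generations(&local);
    assert_eq!(gens.get(&1), Some(&7));
    assert_eq!(gens.get(&2), None);
}
```

As in the patch, secondary locations yield no generation and are simply skipped, which is why emergency mode leaves them untouched while attached tenants come back up with their last known generation.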