pageserver: implement emergency mode for operating without control plane (#5469)

## Problem

Pageservers with `control_plane_api` configured require a control plane
to start up: in an incident this might be a problem.

## Summary of changes

Note to reviewers: most of the code churn in mgr.rs is the refactor
commit that enables the later emergency mode commit: you may want to
review commits separately.

- Add `control_plane_emergency_mode` configuration property
- Refactor init_tenant_mgr to separate loading configurations from the
main loop where we construct Tenant, so that the generations fetch can
peek at the configs in emergency mode.
- During startup, in emergency mode, attach any tenants that were
attached on their last run, using the same generation number.

Closes: #5381 
Closes: https://github.com/neondatabase/neon/issues/5492
This commit is contained in:
John Spray
2023-10-06 17:25:21 +01:00
committed by GitHub
parent 547914fe19
commit ea5a97e7b4
4 changed files with 303 additions and 172 deletions

View File

@@ -211,6 +211,10 @@ pub struct PageServerConf {
/// JWT token for use with the control plane API.
pub control_plane_api_token: Option<SecretString>,
/// If true, pageserver will make best-effort to operate without a control plane: only
/// for use in major incidents.
pub control_plane_emergency_mode: bool,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -288,6 +292,7 @@ struct PageServerConfigBuilder {
control_plane_api: BuilderValue<Option<Url>>,
control_plane_api_token: BuilderValue<Option<SecretString>>,
control_plane_emergency_mode: BuilderValue<bool>,
}
impl Default for PageServerConfigBuilder {
@@ -355,6 +360,7 @@ impl Default for PageServerConfigBuilder {
control_plane_api: Set(None),
control_plane_api_token: Set(None),
control_plane_emergency_mode: Set(false),
}
}
}
@@ -491,6 +497,10 @@ impl PageServerConfigBuilder {
self.control_plane_api_token = BuilderValue::Set(token)
}
pub fn control_plane_emergency_mode(&mut self, enabled: bool) {
self.control_plane_emergency_mode = BuilderValue::Set(enabled)
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let concurrent_tenant_size_logical_size_queries = self
.concurrent_tenant_size_logical_size_queries
@@ -582,6 +592,9 @@ impl PageServerConfigBuilder {
control_plane_api_token: self
.control_plane_api_token
.ok_or(anyhow!("missing control_plane_api_token"))?,
control_plane_emergency_mode: self
.control_plane_emergency_mode
.ok_or(anyhow!("missing control_plane_emergency_mode"))?,
})
}
}
@@ -807,6 +820,10 @@ impl PageServerConf {
builder.control_plane_api_token(Some(parsed.into()))
}
},
"control_plane_emergency_mode" => {
builder.control_plane_emergency_mode(parse_toml_bool(key, item)?)
},
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -976,6 +993,7 @@ impl PageServerConf {
background_task_maximum_delay: Duration::ZERO,
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false,
}
}
}
@@ -1199,7 +1217,8 @@ background_task_maximum_delay = '334 s'
defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
)?,
control_plane_api: None,
control_plane_api_token: None
control_plane_api_token: None,
control_plane_emergency_mode: false
},
"Correct defaults should be used when no config values are provided"
);
@@ -1255,7 +1274,8 @@ background_task_maximum_delay = '334 s'
ondemand_download_behavior_treat_error_as_warn: false,
background_task_maximum_delay: Duration::from_secs(334),
control_plane_api: None,
control_plane_api_token: None
control_plane_api_token: None,
control_plane_emergency_mode: false
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -40,7 +40,6 @@ use validator::ValidatorQueueMessage;
use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};
// TODO: adminstrative "panic button" config property to disable all deletions
// TODO: configurable for how long to wait before executing deletions
/// We aggregate object deletions from many tenants in one place, for several reasons:

View File

@@ -151,61 +151,86 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::Initializing));
/// Initialize repositories with locally available timelines.
/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
/// are scheduled for download and added to the tenant once download is completed.
#[instrument(skip_all)]
pub async fn init_tenant_mgr(
fn emergency_generations(
tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
) -> HashMap<TenantId, Generation> {
tenant_confs
.iter()
.filter_map(|(tid, lc)| {
let lc = match lc {
Ok(lc) => lc,
Err(_) => return None,
};
let gen = match &lc.mode {
LocationMode::Attached(alc) => Some(alc.generation),
LocationMode::Secondary(_) => None,
};
gen.map(|g| (*tid, g))
})
.collect()
}
async fn init_load_generations(
conf: &'static PageServerConf,
resources: TenantSharedResources,
init_order: InitializationOrder,
cancel: CancellationToken,
) -> anyhow::Result<()> {
// Scan local filesystem for attached tenants
let tenants_dir = conf.tenants_path();
let mut tenants = HashMap::new();
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
let tenant_generations = if let Some(client) = ControlPlaneClient::new(conf, &cancel) {
let result = match client.re_attach().await {
tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
resources: &TenantSharedResources,
cancel: &CancellationToken,
) -> anyhow::Result<Option<HashMap<TenantId, Generation>>> {
let generations = if conf.control_plane_emergency_mode {
error!(
"Emergency mode! Tenants will be attached unsafely using their last known generation"
);
emergency_generations(tenant_confs)
} else if let Some(client) = ControlPlaneClient::new(conf, cancel) {
info!("Calling control plane API to re-attach tenants");
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
match client.re_attach().await {
Ok(tenants) => tenants,
Err(RetryForeverError::ShuttingDown) => {
anyhow::bail!("Shut down while waiting for control plane re-attach response")
}
};
// The deletion queue needs to know about the startup attachment state to decide which (if any) stored
// deletion list entries may still be valid. We provide that by pushing a recovery operation into
// the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
// are processed, even though we don't block on recovery completing here.
//
// Must only do this if remote storage is enabled, otherwise deletion queue
// is not running and channel push will fail.
if resources.remote_storage.is_some() {
resources
.deletion_queue_client
.recover(result.clone())
.await?;
}
Some(result)
} else {
info!("Control plane API not configured, tenant generations are disabled");
None
return Ok(None);
};
// The deletion queue needs to know about the startup attachment state to decide which (if any) stored
// deletion list entries may still be valid. We provide that by pushing a recovery operation into
// the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
// are processed, even though we don't block on recovery completing here.
//
// Must only do this if remote storage is enabled, otherwise deletion queue
// is not running and channel push will fail.
if resources.remote_storage.is_some() {
resources
.deletion_queue_client
.recover(generations.clone())
.await?;
}
Ok(Some(generations))
}
/// Initial stage of load: walk the local tenants directory, clean up any temp files,
/// and load configurations for the tenants we found.
async fn init_load_tenant_configs(
conf: &'static PageServerConf,
) -> anyhow::Result<HashMap<TenantId, anyhow::Result<LocationConf>>> {
let tenants_dir = conf.tenants_path();
let mut dir_entries = tenants_dir
.read_dir_utf8()
.with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);
let mut configs = HashMap::new();
loop {
match dir_entries.next() {
None => break,
Some(Ok(dir_entry)) => {
let tenant_dir_path = dir_entry.path().to_path_buf();
Some(Ok(dentry)) => {
let tenant_dir_path = dentry.path().to_path_buf();
if crate::is_temporary(&tenant_dir_path) {
info!("Found temporary tenant directory, removing: {tenant_dir_path}");
// No need to use safe_remove_tenant_dir_all because this is already
@@ -216,141 +241,158 @@ pub async fn init_tenant_mgr(
tenant_dir_path, e
);
}
} else {
// This case happens if we:
// * crash during attach before creating the attach marker file
// * crash during tenant delete before removing tenant directory
let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
})?;
if is_empty {
info!("removing empty tenant directory {tenant_dir_path:?}");
if let Err(e) = fs::remove_dir(&tenant_dir_path).await {
error!(
"Failed to remove empty tenant directory '{}': {e:#}",
tenant_dir_path
)
}
continue;
}
let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
if tenant_ignore_mark_file.exists() {
info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
continue;
}
let tenant_id = match tenant_dir_path
.file_name()
.unwrap_or_default()
.parse::<TenantId>()
{
Ok(id) => id,
Err(_) => {
warn!(
"Invalid tenant path (garbage in our repo directory?): {}",
tenant_dir_path
);
continue;
}
};
// Try loading the location configuration
let mut location_conf = match Tenant::load_tenant_config(conf, &tenant_id)
.context("load tenant config")
{
Ok(c) => c,
Err(e) => {
warn!("Marking tenant broken, failed to {e:#}");
tenants.insert(
tenant_id,
TenantSlot::Attached(Tenant::create_broken_tenant(
conf,
tenant_id,
"error loading tenant location configuration".to_string(),
)),
);
continue;
}
};
let generation = if let Some(generations) = &tenant_generations {
// We have a generation map: treat it as the authority for whether
// this tenant is really attached.
if let Some(gen) = generations.get(&tenant_id) {
*gen
} else {
match &location_conf.mode {
LocationMode::Secondary(_) => {
// We do not require the control plane's permission for secondary mode
// tenants, because they do no remote writes and hence require no
// generation number
info!("Loaded tenant {tenant_id} in secondary mode");
tenants.insert(tenant_id, TenantSlot::Secondary);
}
LocationMode::Attached(_) => {
// TODO: augment re-attach API to enable the control plane to
// instruct us about secondary attachments. That way, instead of throwing
// away local state, we can gracefully fall back to secondary here, if the control
// plane tells us so.
// (https://github.com/neondatabase/neon/issues/5377)
info!("Detaching tenant {tenant_id}, control plane omitted it in re-attach response");
if let Err(e) =
safe_remove_tenant_dir_all(&tenant_dir_path).await
{
error!(
"Failed to remove detached tenant directory '{}': {:?}",
tenant_dir_path, e
);
}
}
};
continue;
}
} else {
// Legacy mode: no generation information, any tenant present
// on local disk may activate
info!(
"Starting tenant {} in legacy mode, no generation",
tenant_dir_path
);
Generation::none()
};
// Presence of a generation number implies attachment: attach the tenant
// if it wasn't already, and apply the generation number.
location_conf.attach_in_generation(generation);
Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
match schedule_local_tenant_processing(
conf,
tenant_id,
&tenant_dir_path,
AttachedTenantConf::try_from(location_conf)?,
resources.clone(),
Some(init_order.clone()),
&TENANTS,
&ctx,
) {
Ok(tenant) => {
tenants.insert(tenant.tenant_id(), TenantSlot::Attached(tenant));
}
Err(e) => {
error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}");
}
}
continue;
}
// This case happens if we:
// * crash during attach before creating the attach marker file
// * crash during tenant delete before removing tenant directory
let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
})?;
if is_empty {
info!("removing empty tenant directory {tenant_dir_path:?}");
if let Err(e) = fs::remove_dir(&tenant_dir_path).await {
error!(
"Failed to remove empty tenant directory '{}': {e:#}",
tenant_dir_path
)
}
continue;
}
let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
if tenant_ignore_mark_file.exists() {
info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
continue;
}
let tenant_id = match tenant_dir_path
.file_name()
.unwrap_or_default()
.parse::<TenantId>()
{
Ok(id) => id,
Err(_) => {
warn!(
"Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",
);
continue;
}
};
configs.insert(tenant_id, Tenant::load_tenant_config(conf, &tenant_id));
}
Some(Err(e)) => {
// On error, print it, but continue with the other tenants. If we error out
// here, the pageserver startup fails altogether, causing outage for *all*
// tenants. That seems worse.
error!(
"Failed to list tenants dir entry in directory {tenants_dir:?}, reason: {e:?}"
// An error listing the top level directory indicates serious problem
// with local filesystem: we will fail to load, and fail to start.
anyhow::bail!(e);
}
}
}
Ok(configs)
}
/// Initialize repositories with locally available timelines.
/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
/// are scheduled for download and added to the tenant once download is completed.
#[instrument(skip_all)]
pub async fn init_tenant_mgr(
conf: &'static PageServerConf,
resources: TenantSharedResources,
init_order: InitializationOrder,
cancel: CancellationToken,
) -> anyhow::Result<()> {
let mut tenants = HashMap::new();
let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);
// Scan local filesystem for attached tenants
let tenant_configs = init_load_tenant_configs(conf).await?;
// Determine which tenants are to be attached
let tenant_generations =
init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
// Construct `Tenant` objects and start them running
for (tenant_id, location_conf) in tenant_configs {
let tenant_dir_path = conf.tenant_path(&tenant_id);
let mut location_conf = match location_conf {
Ok(l) => l,
Err(e) => {
warn!(%tenant_id, "Marking tenant broken, failed to {e:#}");
tenants.insert(
tenant_id,
TenantSlot::Attached(Tenant::create_broken_tenant(
conf,
tenant_id,
format!("{}", e),
)),
);
continue;
}
};
let generation = if let Some(generations) = &tenant_generations {
// We have a generation map: treat it as the authority for whether
// this tenant is really attached.
if let Some(gen) = generations.get(&tenant_id) {
*gen
} else {
match &location_conf.mode {
LocationMode::Secondary(_) => {
// We do not require the control plane's permission for secondary mode
// tenants, because they do no remote writes and hence require no
// generation number
info!(%tenant_id, "Loaded tenant in secondary mode");
tenants.insert(tenant_id, TenantSlot::Secondary);
}
LocationMode::Attached(_) => {
// TODO: augment re-attach API to enable the control plane to
// instruct us about secondary attachments. That way, instead of throwing
// away local state, we can gracefully fall back to secondary here, if the control
// plane tells us so.
// (https://github.com/neondatabase/neon/issues/5377)
info!(%tenant_id, "Detaching tenant, control plane omitted it in re-attach response");
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
error!(%tenant_id,
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
);
}
}
};
continue;
}
} else {
// Legacy mode: no generation information, any tenant present
// on local disk may activate
info!(%tenant_id, "Starting tenant in legacy mode, no generation",);
Generation::none()
};
// Presence of a generation number implies attachment: attach the tenant
// if it wasn't already, and apply the generation number.
location_conf.attach_in_generation(generation);
Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
match schedule_local_tenant_processing(
conf,
tenant_id,
&tenant_dir_path,
AttachedTenantConf::try_from(location_conf)?,
resources.clone(),
Some(init_order.clone()),
&TENANTS,
&ctx,
) {
Ok(tenant) => {
tenants.insert(tenant.tenant_id(), TenantSlot::Attached(tenant));
}
Err(e) => {
error!(%tenant_id, "Failed to start tenant: {e:#}");
}
}
}

View File

@@ -81,7 +81,7 @@ def generate_uploads_and_deletions(
f"""
INSERT INTO foo (id, val)
SELECT g, '{data}'
FROM generate_series(1, 20000) g
FROM generate_series(1, 200) g
ON CONFLICT (id) DO UPDATE
SET val = EXCLUDED.val
""",
@@ -378,3 +378,73 @@ def test_deletion_queue_recovery(
assert get_deletion_queue_unexpected_errors(ps_http) == 0
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
neon_env_builder.enable_generations = True
neon_env_builder.enable_pageserver_remote_storage(
RemoteStorageKind.MOCK_S3,
)
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
ps_http = env.pageserver.http_client()
generate_uploads_and_deletions(env)
env.pageserver.allowed_errors.extend(
[
# When the pageserver can't reach the control plane, it will complain
".*calling control plane generation validation API failed.*",
# Emergency mode is a big deal, we log errors whenever it is used.
".*Emergency mode!.*",
]
)
# Simulate a major incident: the control plane goes offline
assert env.attachment_service is not None
env.attachment_service.stop()
# Remember how many validations had happened before the control plane went offline
validated = get_deletion_queue_validated(ps_http)
generate_uploads_and_deletions(env, init=False)
# The running pageserver should stop progressing deletions
time.sleep(10)
assert get_deletion_queue_validated(ps_http) == validated
# Restart the pageserver: ordinarily we would _avoid_ doing this during such an
# incident, but it might be unavoidable: if so, we want to be able to start up
# and serve clients.
env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
env.pageserver.start(
overrides=("--pageserver-config-override=control_plane_emergency_mode=true",)
)
# The pageserver should provide service to clients
generate_uploads_and_deletions(env, init=False)
# The pageserver should neither validate nor execute any deletions, it should have
# loaded the DeletionLists from before though
time.sleep(10)
assert get_deletion_queue_depth(ps_http) > 0
assert get_deletion_queue_validated(ps_http) == 0
assert get_deletion_queue_executed(ps_http) == 0
# When the control plane comes back up, normal service should resume
env.attachment_service.start()
ps_http.deletion_queue_flush(execute=True)
assert get_deletion_queue_depth(ps_http) == 0
assert get_deletion_queue_validated(ps_http) > 0
assert get_deletion_queue_executed(ps_http) > 0
# The pageserver should work fine when subsequently restarted in non-emergency mode
env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
env.pageserver.start()
generate_uploads_and_deletions(env, init=False)
ps_http.deletion_queue_flush(execute=True)
assert get_deletion_queue_depth(ps_http) == 0
assert get_deletion_queue_validated(ps_http) > 0
assert get_deletion_queue_executed(ps_http) > 0