mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-17 13:10:38 +00:00
pageserver: extend /re-attach response to include tenant mode (#6941)
This change improves the resilience of the system to unclean restarts. Previously, re-attach responses only included attached tenants - If the pageserver had local state for a secondary location, it would remain, but with no guarantee that it was still _meant_ to be there. After this change, the pageserver will only retain secondary locations if the /re-attach response indicates that they should still be there. - If the pageserver had local state for an attached location that was omitted from a re-attach response, it would be entirely detached. This is wasteful in a typical HA setup, where an offline node's tenants might have been re-attached elsewhere before it restarts, but the offline node's location should revert to a secondary location rather than being wiped. Including secondary tenants in the re-attach response enables the pageserver to avoid throwing away local state unnecessarily. In this PR: - The re-attach items are extended with a 'mode' field. - Storage controller populates 'mode' - Pageserver interprets it (default is attached if missing) to construct either a SecondaryTenant or a Tenant. - A new test exercises both cases.
This commit is contained in:
@@ -1394,7 +1394,8 @@ impl Service {
|
||||
incremented_generations.len()
|
||||
);
|
||||
|
||||
// Apply the updated generation to our in-memory state
|
||||
// Apply the updated generation to our in-memory state, and
|
||||
// gather discover secondary locations.
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, tenants, scheduler) = locked.parts_mut();
|
||||
|
||||
@@ -1402,62 +1403,65 @@ impl Service {
|
||||
tenants: Vec::new(),
|
||||
};
|
||||
|
||||
for (tenant_shard_id, new_gen) in incremented_generations {
|
||||
response.tenants.push(ReAttachResponseTenant {
|
||||
id: tenant_shard_id,
|
||||
gen: new_gen.into().unwrap(),
|
||||
});
|
||||
// Apply the new generation number to our in-memory state
|
||||
let shard_state = tenants.get_mut(&tenant_shard_id);
|
||||
let Some(shard_state) = shard_state else {
|
||||
// Not fatal. This edge case requires a re-attach to happen
|
||||
// between inserting a new tenant shard in to the database, and updating our in-memory
|
||||
// state to know about the shard, _and_ that the state inserted to the database referenced
|
||||
// a pageserver. Should never happen, but handle it rather than panicking, since it should
|
||||
// be harmless.
|
||||
tracing::error!(
|
||||
"Shard {} is in database for node {} but not in-memory state",
|
||||
tenant_shard_id,
|
||||
reattach_req.node_id
|
||||
);
|
||||
continue;
|
||||
};
|
||||
// TODO: cancel/restart any running reconciliation for this tenant, it might be trying
|
||||
// to call location_conf API with an old generation. Wait for cancellation to complete
|
||||
// before responding to this request. Requires well implemented CancellationToken logic
|
||||
// all the way to where we call location_conf. Even then, there can still be a location_conf
|
||||
// request in flight over the network: TODO handle that by making location_conf API refuse
|
||||
// to go backward in generations.
|
||||
|
||||
// If [`Persistence::re_attach`] selected this shard, it must have alread
|
||||
// had a generation set.
|
||||
debug_assert!(shard_state.generation.is_some());
|
||||
let Some(old_gen) = shard_state.generation else {
|
||||
// Should never happen: would only return incremented generation
|
||||
// for a tenant that already had a non-null generation.
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Generation must be set while re-attaching"
|
||||
)));
|
||||
};
|
||||
shard_state.generation = Some(std::cmp::max(old_gen, new_gen));
|
||||
if let Some(observed) = shard_state
|
||||
.observed
|
||||
.locations
|
||||
.get_mut(&reattach_req.node_id)
|
||||
{
|
||||
if let Some(conf) = observed.conf.as_mut() {
|
||||
conf.generation = new_gen.into();
|
||||
// Scan through all shards, applying updates for ones where we updated generation
|
||||
// and identifying shards that intend to have a secondary location on this node.
|
||||
for (tenant_shard_id, shard) in tenants {
|
||||
if let Some(new_gen) = incremented_generations.get(tenant_shard_id) {
|
||||
let new_gen = *new_gen;
|
||||
response.tenants.push(ReAttachResponseTenant {
|
||||
id: *tenant_shard_id,
|
||||
gen: Some(new_gen.into().unwrap()),
|
||||
// A tenant is only put into multi or stale modes in the middle of a [`Reconciler::live_migrate`]
|
||||
// execution. If a pageserver is restarted during that process, then the reconcile pass will
|
||||
// fail, and start from scratch, so it doesn't make sense for us to try and preserve
|
||||
// the stale/multi states at this point.
|
||||
mode: LocationConfigMode::AttachedSingle,
|
||||
});
|
||||
|
||||
shard.generation = std::cmp::max(shard.generation, Some(new_gen));
|
||||
if let Some(observed) = shard.observed.locations.get_mut(&reattach_req.node_id) {
|
||||
// Why can we update `observed` even though we're not sure our response will be received
|
||||
// by the pageserver? Because the pageserver will not proceed with startup until
|
||||
// it has processed response: if it loses it, we'll see another request and increment
|
||||
// generation again, avoiding any uncertainty about dirtiness of tenant's state.
|
||||
if let Some(conf) = observed.conf.as_mut() {
|
||||
conf.generation = new_gen.into();
|
||||
}
|
||||
} else {
|
||||
// This node has no observed state for the shard: perhaps it was offline
|
||||
// when the pageserver restarted. Insert a None, so that the Reconciler
|
||||
// will be prompted to learn the location's state before it makes changes.
|
||||
shard
|
||||
.observed
|
||||
.locations
|
||||
.insert(reattach_req.node_id, ObservedStateLocation { conf: None });
|
||||
}
|
||||
} else {
|
||||
// This node has no observed state for the shard: perhaps it was offline
|
||||
// when the pageserver restarted. Insert a None, so that the Reconciler
|
||||
// will be prompted to learn the location's state before it makes changes.
|
||||
shard_state
|
||||
.observed
|
||||
.locations
|
||||
.insert(reattach_req.node_id, ObservedStateLocation { conf: None });
|
||||
}
|
||||
} else if shard.intent.get_secondary().contains(&reattach_req.node_id) {
|
||||
// Ordering: pageserver will not accept /location_config requests until it has
|
||||
// finished processing the response from re-attach. So we can update our in-memory state
|
||||
// now, and be confident that we are not stamping on the result of some later location config.
|
||||
// TODO: however, we are not strictly ordered wrt ReconcileResults queue,
|
||||
// so we might update observed state here, and then get over-written by some racing
|
||||
// ReconcileResult. The impact is low however, since we have set state on pageserver something
|
||||
// that matches intent, so worst case if we race then we end up doing a spurious reconcile.
|
||||
|
||||
// TODO: cancel/restart any running reconciliation for this tenant, it might be trying
|
||||
// to call location_conf API with an old generation. Wait for cancellation to complete
|
||||
// before responding to this request. Requires well implemented CancellationToken logic
|
||||
// all the way to where we call location_conf. Even then, there can still be a location_conf
|
||||
// request in flight over the network: TODO handle that by making location_conf API refuse
|
||||
// to go backward in generations.
|
||||
response.tenants.push(ReAttachResponseTenant {
|
||||
id: *tenant_shard_id,
|
||||
gen: None,
|
||||
mode: LocationConfigMode::Secondary,
|
||||
});
|
||||
|
||||
// We must not update observed, because we have no guarantee that our
|
||||
// response will be received by the pageserver. This could leave it
|
||||
// falsely dirty, but the resulting reconcile should be idempotent.
|
||||
}
|
||||
}
|
||||
|
||||
// We consider a node Active once we have composed a re-attach response, but we
|
||||
@@ -3446,7 +3450,7 @@ impl Service {
|
||||
if let Some(waiter) = waiter {
|
||||
waiter.wait_timeout(RECONCILE_TIMEOUT).await?;
|
||||
} else {
|
||||
tracing::warn!("Migration is a no-op");
|
||||
tracing::info!("Migration is a no-op");
|
||||
}
|
||||
|
||||
Ok(TenantShardMigrateResponse {})
|
||||
|
||||
@@ -6,7 +6,9 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId};
|
||||
use crate::{
|
||||
controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId,
|
||||
};
|
||||
|
||||
/// Upcall message sent by the pageserver to the configured `control_plane_api` on
|
||||
/// startup.
|
||||
@@ -20,12 +22,20 @@ pub struct ReAttachRequest {
|
||||
pub register: Option<NodeRegisterRequest>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachResponseTenant {
|
||||
pub id: TenantShardId,
|
||||
pub gen: u32,
|
||||
fn default_mode() -> LocationConfigMode {
|
||||
LocationConfigMode::AttachedSingle
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct ReAttachResponseTenant {
|
||||
pub id: TenantShardId,
|
||||
/// Mandatory if LocationConfigMode is None or set to an Attached* mode
|
||||
pub gen: Option<u32>,
|
||||
|
||||
/// Default value only for backward compat: this field should be set
|
||||
#[serde(default = "default_mode")]
|
||||
pub mode: LocationConfigMode,
|
||||
}
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachResponse {
|
||||
pub tenants: Vec<ReAttachResponseTenant>,
|
||||
|
||||
@@ -5,7 +5,8 @@ use pageserver_api::{
|
||||
controller_api::NodeRegisterRequest,
|
||||
shard::TenantShardId,
|
||||
upcall_api::{
|
||||
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
|
||||
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
|
||||
ValidateRequestTenant, ValidateResponse,
|
||||
},
|
||||
};
|
||||
use serde::{de::DeserializeOwned, Serialize};
|
||||
@@ -37,7 +38,9 @@ pub trait ControlPlaneGenerationsApi {
|
||||
fn re_attach(
|
||||
&self,
|
||||
conf: &PageServerConf,
|
||||
) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
|
||||
) -> impl Future<
|
||||
Output = Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError>,
|
||||
> + Send;
|
||||
fn validate(
|
||||
&self,
|
||||
tenants: Vec<(TenantShardId, Generation)>,
|
||||
@@ -118,7 +121,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
async fn re_attach(
|
||||
&self,
|
||||
conf: &PageServerConf,
|
||||
) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
|
||||
let re_attach_path = self
|
||||
.base_url
|
||||
.join("re-attach")
|
||||
@@ -181,7 +184,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
Ok(response
|
||||
.tenants
|
||||
.into_iter()
|
||||
.map(|t| (t.id, Generation::new(t.gen)))
|
||||
.map(|rart| (rart.id, rart))
|
||||
.collect::<HashMap<_, _>>())
|
||||
}
|
||||
|
||||
|
||||
@@ -724,8 +724,8 @@ impl DeletionQueue {
|
||||
mod test {
|
||||
use camino::Utf8Path;
|
||||
use hex_literal::hex;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use std::io::ErrorKind;
|
||||
use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant};
|
||||
use std::{io::ErrorKind, time::Duration};
|
||||
use tracing::info;
|
||||
|
||||
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
|
||||
@@ -834,9 +834,10 @@ mod test {
|
||||
async fn re_attach(
|
||||
&self,
|
||||
_conf: &PageServerConf,
|
||||
) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
async fn validate(
|
||||
&self,
|
||||
tenants: Vec<(TenantShardId, Generation)>,
|
||||
|
||||
@@ -202,6 +202,13 @@ pub(super) struct AttachedTenantConf {
|
||||
}
|
||||
|
||||
impl AttachedTenantConf {
|
||||
fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self {
|
||||
Self {
|
||||
tenant_conf,
|
||||
location,
|
||||
}
|
||||
}
|
||||
|
||||
fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
|
||||
match &location_conf.mode {
|
||||
LocationMode::Attached(attach_conf) => Ok(Self {
|
||||
|
||||
@@ -196,16 +196,17 @@ impl LocationConf {
|
||||
/// For use when attaching/re-attaching: update the generation stored in this
|
||||
/// structure. If we were in a secondary state, promote to attached (posession
|
||||
/// of a fresh generation implies this).
|
||||
pub(crate) fn attach_in_generation(&mut self, generation: Generation) {
|
||||
pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) {
|
||||
match &mut self.mode {
|
||||
LocationMode::Attached(attach_conf) => {
|
||||
attach_conf.generation = generation;
|
||||
attach_conf.attach_mode = mode;
|
||||
}
|
||||
LocationMode::Secondary(_) => {
|
||||
// We are promoted to attached by the control plane's re-attach response
|
||||
self.mode = LocationMode::Attached(AttachedLocationConfig {
|
||||
generation,
|
||||
attach_mode: AttachmentMode::Single,
|
||||
attach_mode: mode,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,10 +4,11 @@
|
||||
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::models::{LocationConfigMode, ShardParameters};
|
||||
use pageserver_api::shard::{
|
||||
ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
|
||||
};
|
||||
use pageserver_api::upcall_api::ReAttachResponseTenant;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::Ordering;
|
||||
@@ -124,6 +125,46 @@ pub(crate) enum ShardSelector {
|
||||
Page(Key),
|
||||
}
|
||||
|
||||
/// A convenience for use with the re_attach ControlPlaneClient function: rather
|
||||
/// than the serializable struct, we build this enum that encapsulates
|
||||
/// the invariant that attached tenants always have generations.
|
||||
///
|
||||
/// This represents the subset of a LocationConfig that we receive during re-attach.
|
||||
pub(crate) enum TenantStartupMode {
|
||||
Attached((AttachmentMode, Generation)),
|
||||
Secondary,
|
||||
}
|
||||
|
||||
impl TenantStartupMode {
|
||||
/// Return the generation & mode that should be used when starting
|
||||
/// this tenant.
|
||||
///
|
||||
/// If this returns None, the re-attach struct is in an invalid state and
|
||||
/// should be ignored in the response.
|
||||
fn from_reattach_tenant(rart: ReAttachResponseTenant) -> Option<Self> {
|
||||
match (rart.mode, rart.gen) {
|
||||
(LocationConfigMode::Detached, _) => None,
|
||||
(LocationConfigMode::Secondary, _) => Some(Self::Secondary),
|
||||
(LocationConfigMode::AttachedMulti, Some(g)) => {
|
||||
Some(Self::Attached((AttachmentMode::Multi, Generation::new(g))))
|
||||
}
|
||||
(LocationConfigMode::AttachedSingle, Some(g)) => {
|
||||
Some(Self::Attached((AttachmentMode::Single, Generation::new(g))))
|
||||
}
|
||||
(LocationConfigMode::AttachedStale, Some(g)) => {
|
||||
Some(Self::Attached((AttachmentMode::Stale, Generation::new(g))))
|
||||
}
|
||||
_ => {
|
||||
tracing::warn!(
|
||||
"Received invalid re-attach state for tenant {}: {rart:?}",
|
||||
rart.id
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TenantsMap {
|
||||
/// Convenience function for typical usage, where we want to get a `Tenant` object, for
|
||||
/// working with attached tenants. If the TenantId is in the map but in Secondary state,
|
||||
@@ -270,7 +311,7 @@ pub struct TenantManager {
|
||||
|
||||
fn emergency_generations(
|
||||
tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
|
||||
) -> HashMap<TenantShardId, Generation> {
|
||||
) -> HashMap<TenantShardId, TenantStartupMode> {
|
||||
tenant_confs
|
||||
.iter()
|
||||
.filter_map(|(tid, lc)| {
|
||||
@@ -278,12 +319,15 @@ fn emergency_generations(
|
||||
Ok(lc) => lc,
|
||||
Err(_) => return None,
|
||||
};
|
||||
let gen = match &lc.mode {
|
||||
LocationMode::Attached(alc) => Some(alc.generation),
|
||||
LocationMode::Secondary(_) => None,
|
||||
};
|
||||
|
||||
gen.map(|g| (*tid, g))
|
||||
Some((
|
||||
*tid,
|
||||
match &lc.mode {
|
||||
LocationMode::Attached(alc) => {
|
||||
TenantStartupMode::Attached((alc.attach_mode, alc.generation))
|
||||
}
|
||||
LocationMode::Secondary(_) => TenantStartupMode::Secondary,
|
||||
},
|
||||
))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
@@ -293,7 +337,7 @@ async fn init_load_generations(
|
||||
tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
|
||||
resources: &TenantSharedResources,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<Option<HashMap<TenantShardId, Generation>>> {
|
||||
) -> anyhow::Result<Option<HashMap<TenantShardId, TenantStartupMode>>> {
|
||||
let generations = if conf.control_plane_emergency_mode {
|
||||
error!(
|
||||
"Emergency mode! Tenants will be attached unsafely using their last known generation"
|
||||
@@ -303,7 +347,12 @@ async fn init_load_generations(
|
||||
info!("Calling control plane API to re-attach tenants");
|
||||
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
|
||||
match client.re_attach(conf).await {
|
||||
Ok(tenants) => tenants,
|
||||
Ok(tenants) => tenants
|
||||
.into_iter()
|
||||
.flat_map(|(id, rart)| {
|
||||
TenantStartupMode::from_reattach_tenant(rart).map(|tsm| (id, tsm))
|
||||
})
|
||||
.collect(),
|
||||
Err(RetryForeverError::ShuttingDown) => {
|
||||
anyhow::bail!("Shut down while waiting for control plane re-attach response")
|
||||
}
|
||||
@@ -321,9 +370,17 @@ async fn init_load_generations(
|
||||
// Must only do this if remote storage is enabled, otherwise deletion queue
|
||||
// is not running and channel push will fail.
|
||||
if resources.remote_storage.is_some() {
|
||||
resources
|
||||
.deletion_queue_client
|
||||
.recover(generations.clone())?;
|
||||
let attached_tenants = generations
|
||||
.iter()
|
||||
.flat_map(|(id, start_mode)| {
|
||||
match start_mode {
|
||||
TenantStartupMode::Attached((_mode, generation)) => Some(generation),
|
||||
TenantStartupMode::Secondary => None,
|
||||
}
|
||||
.map(|gen| (*id, *gen))
|
||||
})
|
||||
.collect();
|
||||
resources.deletion_queue_client.recover(attached_tenants)?;
|
||||
}
|
||||
|
||||
Ok(Some(generations))
|
||||
@@ -489,9 +546,8 @@ pub async fn init_tenant_mgr(
|
||||
// Scan local filesystem for attached tenants
|
||||
let tenant_configs = init_load_tenant_configs(conf).await?;
|
||||
|
||||
// Determine which tenants are to be attached
|
||||
let tenant_generations =
|
||||
init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
|
||||
// Determine which tenants are to be secondary or attached, and in which generation
|
||||
let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
|
||||
|
||||
tracing::info!(
|
||||
"Attaching {} tenants at startup, warming up {} at a time",
|
||||
@@ -521,97 +577,102 @@ pub async fn init_tenant_mgr(
|
||||
}
|
||||
};
|
||||
|
||||
let generation = if let Some(generations) = &tenant_generations {
|
||||
// FIXME: if we were attached, and get demoted to secondary on re-attach, we
|
||||
// don't have a place to get a config.
|
||||
// (https://github.com/neondatabase/neon/issues/5377)
|
||||
const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig =
|
||||
SecondaryLocationConfig { warm: true };
|
||||
|
||||
// Update the location config according to the re-attach response
|
||||
if let Some(tenant_modes) = &tenant_modes {
|
||||
// We have a generation map: treat it as the authority for whether
|
||||
// this tenant is really attached.
|
||||
if let Some(gen) = generations.get(&tenant_shard_id) {
|
||||
if let LocationMode::Attached(attached) = &location_conf.mode {
|
||||
if attached.generation > *gen {
|
||||
match tenant_modes.get(&tenant_shard_id) {
|
||||
None => {
|
||||
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
|
||||
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
|
||||
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
|
||||
);
|
||||
}
|
||||
|
||||
// We deleted local content: move on to next tenant, don't try and spawn this one.
|
||||
continue;
|
||||
}
|
||||
Some(TenantStartupMode::Secondary) => {
|
||||
if !matches!(location_conf.mode, LocationMode::Secondary(_)) {
|
||||
location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
|
||||
}
|
||||
}
|
||||
Some(TenantStartupMode::Attached((attach_mode, generation))) => {
|
||||
let old_gen_higher = match &location_conf.mode {
|
||||
LocationMode::Attached(AttachedLocationConfig {
|
||||
generation: old_generation,
|
||||
attach_mode: _attach_mode,
|
||||
}) => {
|
||||
if old_generation > generation {
|
||||
Some(old_generation)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
_ => None,
|
||||
};
|
||||
if let Some(old_generation) = old_gen_higher {
|
||||
tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
|
||||
attached.generation
|
||||
"Control plane gave decreasing generation ({generation:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
|
||||
old_generation
|
||||
);
|
||||
|
||||
// We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away
|
||||
// local disk content: demote to secondary rather than detaching.
|
||||
tenants.insert(
|
||||
tenant_shard_id,
|
||||
TenantSlot::Secondary(SecondaryTenant::new(
|
||||
tenant_shard_id,
|
||||
location_conf.shard,
|
||||
location_conf.tenant_conf.clone(),
|
||||
&SecondaryLocationConfig { warm: false },
|
||||
)),
|
||||
);
|
||||
location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
|
||||
} else {
|
||||
location_conf.attach_in_generation(*attach_mode, *generation);
|
||||
}
|
||||
}
|
||||
*gen
|
||||
} else {
|
||||
match &location_conf.mode {
|
||||
LocationMode::Secondary(secondary_config) => {
|
||||
// We do not require the control plane's permission for secondary mode
|
||||
// tenants, because they do no remote writes and hence require no
|
||||
// generation number
|
||||
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
|
||||
tenants.insert(
|
||||
tenant_shard_id,
|
||||
TenantSlot::Secondary(SecondaryTenant::new(
|
||||
tenant_shard_id,
|
||||
location_conf.shard,
|
||||
location_conf.tenant_conf,
|
||||
secondary_config,
|
||||
)),
|
||||
);
|
||||
}
|
||||
LocationMode::Attached(_) => {
|
||||
// TODO: augment re-attach API to enable the control plane to
|
||||
// instruct us about secondary attachments. That way, instead of throwing
|
||||
// away local state, we can gracefully fall back to secondary here, if the control
|
||||
// plane tells us so.
|
||||
// (https://github.com/neondatabase/neon/issues/5377)
|
||||
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
|
||||
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
|
||||
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
|
||||
);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
// Legacy mode: no generation information, any tenant present
|
||||
// on local disk may activate
|
||||
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",);
|
||||
Generation::none()
|
||||
};
|
||||
|
||||
// Presence of a generation number implies attachment: attach the tenant
|
||||
// if it wasn't already, and apply the generation number.
|
||||
location_conf.attach_in_generation(generation);
|
||||
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
|
||||
|
||||
let shard_identity = location_conf.shard;
|
||||
match tenant_spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&tenant_dir_path,
|
||||
resources.clone(),
|
||||
AttachedTenantConf::try_from(location_conf)?,
|
||||
shard_identity,
|
||||
Some(init_order.clone()),
|
||||
&TENANTS,
|
||||
SpawnMode::Lazy,
|
||||
&ctx,
|
||||
) {
|
||||
Ok(tenant) => {
|
||||
tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant));
|
||||
let slot = match location_conf.mode {
|
||||
LocationMode::Attached(attached_conf) => {
|
||||
match tenant_spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&tenant_dir_path,
|
||||
resources.clone(),
|
||||
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
|
||||
shard_identity,
|
||||
Some(init_order.clone()),
|
||||
&TENANTS,
|
||||
SpawnMode::Lazy,
|
||||
&ctx,
|
||||
) {
|
||||
Ok(tenant) => TenantSlot::Attached(tenant),
|
||||
Err(e) => {
|
||||
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
|
||||
}
|
||||
}
|
||||
LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new(
|
||||
tenant_shard_id,
|
||||
shard_identity,
|
||||
location_conf.tenant_conf,
|
||||
&secondary_conf,
|
||||
)),
|
||||
};
|
||||
|
||||
tenants.insert(tenant_shard_id, slot);
|
||||
}
|
||||
|
||||
info!("Processed {} local tenants at startup", tenants.len());
|
||||
@@ -2142,7 +2203,7 @@ pub(crate) async fn load_tenant(
|
||||
|
||||
let mut location_conf =
|
||||
Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?;
|
||||
location_conf.attach_in_generation(generation);
|
||||
location_conf.attach_in_generation(AttachmentMode::Single, generation);
|
||||
|
||||
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ from fixtures.pageserver.utils import (
|
||||
)
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.remote_storage import RemoteStorageKind, s3_storage
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
from fixtures.types import TenantId, TenantShardId, TimelineId
|
||||
from fixtures.utils import run_pg_bench_small, wait_until
|
||||
from mypy_boto3_s3.type_defs import (
|
||||
ObjectTypeDef,
|
||||
@@ -948,3 +948,65 @@ def test_sharding_service_heartbeats(
|
||||
env.storage_controller.consistency_check()
|
||||
|
||||
wait_until(10, 1, storage_controller_consistent)
|
||||
|
||||
|
||||
def test_sharding_service_re_attach(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Exercise the behavior of the /re-attach endpoint on pageserver startup when
|
||||
pageservers have a mixture of attached and secondary locations
|
||||
"""
|
||||
|
||||
neon_env_builder.num_pageservers = 2
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
|
||||
# We'll have two tenants.
|
||||
tenant_a = TenantId.generate()
|
||||
env.neon_cli.create_tenant(tenant_a, placement_policy='{"Attached":1}')
|
||||
tenant_b = TenantId.generate()
|
||||
env.neon_cli.create_tenant(tenant_b, placement_policy='{"Attached":1}')
|
||||
|
||||
# Each pageserver will have one attached and one secondary location
|
||||
env.storage_controller.tenant_shard_migrate(
|
||||
TenantShardId(tenant_a, 0, 0), env.pageservers[0].id
|
||||
)
|
||||
env.storage_controller.tenant_shard_migrate(
|
||||
TenantShardId(tenant_b, 0, 0), env.pageservers[1].id
|
||||
)
|
||||
|
||||
# Hard-fail a pageserver
|
||||
victim_ps = env.pageservers[1]
|
||||
survivor_ps = env.pageservers[0]
|
||||
victim_ps.stop(immediate=True)
|
||||
|
||||
# Heatbeater will notice it's offline, and consequently attachments move to the other pageserver
|
||||
def failed_over():
|
||||
locations = survivor_ps.http_client().tenant_list_locations()["tenant_shards"]
|
||||
log.info(f"locations: {locations}")
|
||||
assert len(locations) == 2
|
||||
assert all(loc[1]["mode"] == "AttachedSingle" for loc in locations)
|
||||
|
||||
# We could pre-empty this by configuring the node to Offline, but it's preferable to test
|
||||
# the realistic path we would take when a node restarts uncleanly.
|
||||
# The delay here will be ~NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL in neon_local
|
||||
wait_until(30, 1, failed_over)
|
||||
|
||||
reconciles_before_restart = env.storage_controller.get_metric_value(
|
||||
"storage_controller_reconcile_complete_total", filter={"status": "ok"}
|
||||
)
|
||||
|
||||
# Restart the failed pageserver
|
||||
victim_ps.start()
|
||||
|
||||
# We expect that the re-attach call correctly tipped off the pageserver that its locations
|
||||
# are all secondaries now.
|
||||
locations = victim_ps.http_client().tenant_list_locations()["tenant_shards"]
|
||||
assert len(locations) == 2
|
||||
assert all(loc[1]["mode"] == "Secondary" for loc in locations)
|
||||
|
||||
# We expect that this situation resulted from the re_attach call, and not any explicit
|
||||
# Reconciler runs: assert that the reconciliation count has not gone up since we restarted.
|
||||
reconciles_after_restart = env.storage_controller.get_metric_value(
|
||||
"storage_controller_reconcile_complete_total", filter={"status": "ok"}
|
||||
)
|
||||
assert reconciles_after_restart == reconciles_before_restart
|
||||
|
||||
Reference in New Issue
Block a user