mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-02 04:50:38 +00:00
storcon: make heartbeats restart aware (#8222)
## Problem Re-attach blocks the pageserver http server from starting up. Hence, it can't reply to heartbeats until that's done. This makes the storage controller mark the node off-line (not good). We worked around this by setting the interval after which nodes are marked offline to 5 minutes. This isn't a long term solution. ## Summary of changes * Introduce a new `NodeAvailability` state: `WarmingUp`. This state models the following time interval: * From receiving the re-attach request until the pageserver replies to the first heartbeat post re-attach * The heartbeat delta generator becomes aware of this state and uses a separate longer interval * Flag `max-warming-up-interval` now models the longer timeout and `max-offline-interval` the shorter one to match the names of the states Closes https://github.com/neondatabase/neon/issues/7552
This commit is contained in:
@@ -22,7 +22,8 @@ struct HeartbeaterTask {
|
||||
|
||||
state: HashMap<NodeId, PageserverState>,
|
||||
|
||||
max_unavailable_interval: Duration,
|
||||
max_offline_interval: Duration,
|
||||
max_warming_up_interval: Duration,
|
||||
jwt_token: Option<String>,
|
||||
}
|
||||
|
||||
@@ -31,7 +32,9 @@ pub(crate) enum PageserverState {
|
||||
Available {
|
||||
last_seen_at: Instant,
|
||||
utilization: PageserverUtilization,
|
||||
new: bool,
|
||||
},
|
||||
WarmingUp {
|
||||
started_at: Instant,
|
||||
},
|
||||
Offline,
|
||||
}
|
||||
@@ -57,12 +60,18 @@ pub(crate) struct Heartbeater {
|
||||
impl Heartbeater {
|
||||
pub(crate) fn new(
|
||||
jwt_token: Option<String>,
|
||||
max_unavailable_interval: Duration,
|
||||
max_offline_interval: Duration,
|
||||
max_warming_up_interval: Duration,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::<HeartbeatRequest>();
|
||||
let mut heartbeater =
|
||||
HeartbeaterTask::new(receiver, jwt_token, max_unavailable_interval, cancel);
|
||||
let mut heartbeater = HeartbeaterTask::new(
|
||||
receiver,
|
||||
jwt_token,
|
||||
max_offline_interval,
|
||||
max_warming_up_interval,
|
||||
cancel,
|
||||
);
|
||||
tokio::task::spawn(async move { heartbeater.run().await });
|
||||
|
||||
Self { sender }
|
||||
@@ -88,14 +97,16 @@ impl HeartbeaterTask {
|
||||
fn new(
|
||||
receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
|
||||
jwt_token: Option<String>,
|
||||
max_unavailable_interval: Duration,
|
||||
max_offline_interval: Duration,
|
||||
max_warming_up_interval: Duration,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
Self {
|
||||
receiver,
|
||||
cancel,
|
||||
state: HashMap::new(),
|
||||
max_unavailable_interval,
|
||||
max_offline_interval,
|
||||
max_warming_up_interval,
|
||||
jwt_token,
|
||||
}
|
||||
}
|
||||
@@ -128,16 +139,15 @@ impl HeartbeaterTask {
|
||||
heartbeat_futs.push({
|
||||
let jwt_token = self.jwt_token.clone();
|
||||
let cancel = self.cancel.clone();
|
||||
let new_node = !self.state.contains_key(node_id);
|
||||
|
||||
// Clone the node and mark it as available such that the request
|
||||
// goes through to the pageserver even when the node is marked offline.
|
||||
// This doesn't impact the availability observed by [`crate::service::Service`].
|
||||
let mut node = node.clone();
|
||||
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
|
||||
let mut node_clone = node.clone();
|
||||
node_clone.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
|
||||
|
||||
async move {
|
||||
let response = node
|
||||
let response = node_clone
|
||||
.with_client_retries(
|
||||
|client| async move { client.get_utilization().await },
|
||||
&jwt_token,
|
||||
@@ -161,7 +171,12 @@ impl HeartbeaterTask {
|
||||
PageserverState::Available {
|
||||
last_seen_at: Instant::now(),
|
||||
utilization,
|
||||
new: new_node,
|
||||
}
|
||||
} else if let NodeAvailability::WarmingUp(last_seen_at) =
|
||||
node.get_availability()
|
||||
{
|
||||
PageserverState::WarmingUp {
|
||||
started_at: last_seen_at,
|
||||
}
|
||||
} else {
|
||||
PageserverState::Offline
|
||||
@@ -187,53 +202,67 @@ impl HeartbeaterTask {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut warming_up = 0;
|
||||
let mut offline = 0;
|
||||
for state in new_state.values() {
|
||||
match state {
|
||||
PageserverState::WarmingUp { .. } => {
|
||||
warming_up += 1;
|
||||
}
|
||||
PageserverState::Offline { .. } => offline += 1,
|
||||
PageserverState::Available { .. } => {}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
"Heartbeat round complete for {} nodes, {} offline",
|
||||
"Heartbeat round complete for {} nodes, {} warming-up, {} offline",
|
||||
new_state.len(),
|
||||
new_state
|
||||
.values()
|
||||
.filter(|s| match s {
|
||||
PageserverState::Available { .. } => {
|
||||
false
|
||||
}
|
||||
PageserverState::Offline => true,
|
||||
})
|
||||
.count()
|
||||
warming_up,
|
||||
offline
|
||||
);
|
||||
|
||||
let mut deltas = Vec::new();
|
||||
let now = Instant::now();
|
||||
for (node_id, ps_state) in new_state {
|
||||
for (node_id, ps_state) in new_state.iter_mut() {
|
||||
use std::collections::hash_map::Entry::*;
|
||||
let entry = self.state.entry(node_id);
|
||||
let entry = self.state.entry(*node_id);
|
||||
|
||||
let mut needs_update = false;
|
||||
match entry {
|
||||
Occupied(ref occ) => match (occ.get(), &ps_state) {
|
||||
(PageserverState::Offline, PageserverState::Offline) => {}
|
||||
(PageserverState::Available { last_seen_at, .. }, PageserverState::Offline) => {
|
||||
if now - *last_seen_at >= self.max_unavailable_interval {
|
||||
deltas.push((node_id, ps_state.clone()));
|
||||
if now - *last_seen_at >= self.max_offline_interval {
|
||||
deltas.push((*node_id, ps_state.clone()));
|
||||
needs_update = true;
|
||||
}
|
||||
}
|
||||
(_, PageserverState::WarmingUp { started_at }) => {
|
||||
if now - *started_at >= self.max_warming_up_interval {
|
||||
*ps_state = PageserverState::Offline;
|
||||
}
|
||||
|
||||
deltas.push((*node_id, ps_state.clone()));
|
||||
needs_update = true;
|
||||
}
|
||||
_ => {
|
||||
deltas.push((node_id, ps_state.clone()));
|
||||
deltas.push((*node_id, ps_state.clone()));
|
||||
needs_update = true;
|
||||
}
|
||||
},
|
||||
Vacant(_) => {
|
||||
// This is a new node. Don't generate a delta for it.
|
||||
deltas.push((node_id, ps_state.clone()));
|
||||
deltas.push((*node_id, ps_state.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
match entry {
|
||||
Occupied(mut occ) if needs_update => {
|
||||
(*occ.get_mut()) = ps_state;
|
||||
(*occ.get_mut()) = ps_state.clone();
|
||||
}
|
||||
Vacant(vac) => {
|
||||
vac.insert(ps_state);
|
||||
vac.insert(ps_state.clone());
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
@@ -10,7 +10,8 @@ use storage_controller::http::make_router;
|
||||
use storage_controller::metrics::preinitialize_metrics;
|
||||
use storage_controller::persistence::Persistence;
|
||||
use storage_controller::service::{
|
||||
Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
|
||||
Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT,
|
||||
RECONCILER_CONCURRENCY_DEFAULT,
|
||||
};
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -61,7 +62,12 @@ struct Cli {
|
||||
|
||||
/// Grace period before marking unresponsive pageserver offline
|
||||
#[arg(long)]
|
||||
max_unavailable_interval: Option<humantime::Duration>,
|
||||
max_offline_interval: Option<humantime::Duration>,
|
||||
|
||||
/// More tolerant grace period before marking unresponsive pagserver offline used
|
||||
/// around pageserver restarts
|
||||
#[arg(long)]
|
||||
max_warming_up_interval: Option<humantime::Duration>,
|
||||
|
||||
/// Size threshold for automatically splitting shards (disabled by default)
|
||||
#[arg(long)]
|
||||
@@ -254,10 +260,14 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
jwt_token: secrets.jwt_token,
|
||||
control_plane_jwt_token: secrets.control_plane_jwt_token,
|
||||
compute_hook_url: args.compute_hook_url,
|
||||
max_unavailable_interval: args
|
||||
.max_unavailable_interval
|
||||
max_offline_interval: args
|
||||
.max_offline_interval
|
||||
.map(humantime::Duration::into)
|
||||
.unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT),
|
||||
.unwrap_or(MAX_OFFLINE_INTERVAL_DEFAULT),
|
||||
max_warming_up_interval: args
|
||||
.max_warming_up_interval
|
||||
.map(humantime::Duration::into)
|
||||
.unwrap_or(MAX_WARMING_UP_INTERVAL_DEFAULT),
|
||||
reconciler_concurrency: args
|
||||
.reconciler_concurrency
|
||||
.unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::{str::FromStr, time::Duration};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
|
||||
TenantLocateResponseShard, UtilizationScore,
|
||||
TenantLocateResponseShard,
|
||||
},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
@@ -46,6 +46,8 @@ pub(crate) struct Node {
|
||||
/// whether/how they changed it.
|
||||
pub(crate) enum AvailabilityTransition {
|
||||
ToActive,
|
||||
ToWarmingUpFromActive,
|
||||
ToWarmingUpFromOffline,
|
||||
ToOffline,
|
||||
Unchanged,
|
||||
}
|
||||
@@ -90,22 +92,34 @@ impl Node {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_availability(&self) -> NodeAvailability {
|
||||
self.availability
|
||||
}
|
||||
|
||||
pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
|
||||
use AvailabilityTransition::*;
|
||||
use NodeAvailability::WarmingUp;
|
||||
|
||||
match self.get_availability_transition(availability) {
|
||||
AvailabilityTransition::ToActive => {
|
||||
ToActive => {
|
||||
// Give the node a new cancellation token, effectively resetting it to un-cancelled. Any
|
||||
// users of previously-cloned copies of the node will still see the old cancellation
|
||||
// state. For example, Reconcilers in flight will have to complete and be spawned
|
||||
// again to realize that the node has become available.
|
||||
self.cancel = CancellationToken::new();
|
||||
}
|
||||
AvailabilityTransition::ToOffline => {
|
||||
ToOffline | ToWarmingUpFromActive => {
|
||||
// Fire the node's cancellation token to cancel any in-flight API requests to it
|
||||
self.cancel.cancel();
|
||||
}
|
||||
AvailabilityTransition::Unchanged => {}
|
||||
Unchanged | ToWarmingUpFromOffline => {}
|
||||
}
|
||||
|
||||
if let (WarmingUp(crnt), WarmingUp(proposed)) = (self.availability, availability) {
|
||||
self.availability = WarmingUp(std::cmp::max(crnt, proposed));
|
||||
} else {
|
||||
self.availability = availability;
|
||||
}
|
||||
self.availability = availability;
|
||||
}
|
||||
|
||||
/// Without modifying the availability of the node, convert the intended availability
|
||||
@@ -120,16 +134,10 @@ impl Node {
|
||||
match (self.availability, availability) {
|
||||
(Offline, Active(_)) => ToActive,
|
||||
(Active(_), Offline) => ToOffline,
|
||||
// Consider the case when the storage controller handles the re-attach of a node
|
||||
// before the heartbeats detect that the node is back online. We still need
|
||||
// [`Service::node_configure`] to attempt reconciliations for shards with an
|
||||
// unknown observed location.
|
||||
// The unsavoury match arm below handles this situation.
|
||||
(Active(lhs), Active(rhs))
|
||||
if lhs == UtilizationScore::worst() && rhs < UtilizationScore::worst() =>
|
||||
{
|
||||
ToActive
|
||||
}
|
||||
(Active(_), WarmingUp(_)) => ToWarmingUpFromActive,
|
||||
(WarmingUp(_), Offline) => ToOffline,
|
||||
(WarmingUp(_), Active(_)) => ToActive,
|
||||
(Offline, WarmingUp(_)) => ToWarmingUpFromOffline,
|
||||
_ => Unchanged,
|
||||
}
|
||||
}
|
||||
@@ -147,7 +155,7 @@ impl Node {
|
||||
pub(crate) fn may_schedule(&self) -> MaySchedule {
|
||||
let score = match self.availability {
|
||||
NodeAvailability::Active(score) => score,
|
||||
NodeAvailability::Offline => return MaySchedule::No,
|
||||
NodeAvailability::Offline | NodeAvailability::WarmingUp(_) => return MaySchedule::No,
|
||||
};
|
||||
|
||||
match self.scheduling {
|
||||
|
||||
@@ -100,9 +100,13 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
|
||||
|
||||
/// How long a node may be unresponsive to heartbeats before we declare it offline.
|
||||
/// This must be long enough to cover node restarts as well as normal operations: in future
|
||||
/// it should be separated into distinct timeouts for startup vs. normal operation
|
||||
/// (`<https://github.com/neondatabase/neon/issues/7552>`)
|
||||
pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);
|
||||
pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);
|
||||
|
||||
/// How long a node may be unresponsive to heartbeats during start up before we declare it
|
||||
/// offline. This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's
|
||||
/// handling of the re-attach response may take a long time and blocks heartbeats from
|
||||
/// being handled on the pageserver side.
|
||||
pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);
|
||||
|
||||
#[derive(Clone, strum_macros::Display)]
|
||||
enum TenantOperations {
|
||||
@@ -236,7 +240,12 @@ pub struct Config {
|
||||
/// Grace period within which a pageserver does not respond to heartbeats, but is still
|
||||
/// considered active. Once the grace period elapses, the next heartbeat failure will
|
||||
/// mark the pagseserver offline.
|
||||
pub max_unavailable_interval: Duration,
|
||||
pub max_offline_interval: Duration,
|
||||
|
||||
/// Extended grace period within which pageserver may not respond to heartbeats.
|
||||
/// This extended grace period kicks in after the node has been drained for restart
|
||||
/// and/or upon handling the re-attach request from a node.
|
||||
pub max_warming_up_interval: Duration,
|
||||
|
||||
/// How many Reconcilers may be spawned concurrently
|
||||
pub reconciler_concurrency: usize,
|
||||
@@ -587,6 +596,9 @@ impl Service {
|
||||
online_nodes.insert(node_id, utilization);
|
||||
}
|
||||
PageserverState::Offline => {}
|
||||
PageserverState::WarmingUp { .. } => {
|
||||
unreachable!("Nodes are never marked warming-up during startup reconcile")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -779,63 +791,54 @@ impl Service {
|
||||
let res = self.heartbeater.heartbeat(nodes).await;
|
||||
if let Ok(deltas) = res {
|
||||
for (node_id, state) in deltas.0 {
|
||||
let (new_node, new_availability) = match state {
|
||||
PageserverState::Available {
|
||||
utilization, new, ..
|
||||
} => (
|
||||
new,
|
||||
NodeAvailability::Active(UtilizationScore(
|
||||
utilization.utilization_score,
|
||||
)),
|
||||
let new_availability = match state {
|
||||
PageserverState::Available { utilization, .. } => NodeAvailability::Active(
|
||||
UtilizationScore(utilization.utilization_score),
|
||||
),
|
||||
PageserverState::Offline => (false, NodeAvailability::Offline),
|
||||
PageserverState::WarmingUp { started_at } => {
|
||||
NodeAvailability::WarmingUp(started_at)
|
||||
}
|
||||
PageserverState::Offline => {
|
||||
// The node might have been placed in the WarmingUp state
|
||||
// while the heartbeat round was on-going. Hence, filter out
|
||||
// offline transitions for WarmingUp nodes that are still within
|
||||
// their grace period.
|
||||
if let Ok(NodeAvailability::WarmingUp(started_at)) =
|
||||
self.get_node(node_id).await.map(|n| n.get_availability())
|
||||
{
|
||||
let now = Instant::now();
|
||||
if now - started_at >= self.config.max_warming_up_interval {
|
||||
NodeAvailability::Offline
|
||||
} else {
|
||||
NodeAvailability::WarmingUp(started_at)
|
||||
}
|
||||
} else {
|
||||
NodeAvailability::Offline
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if new_node {
|
||||
// When the heartbeats detect a newly added node, we don't wish
|
||||
// to attempt to reconcile the shards assigned to it. The node
|
||||
// is likely handling it's re-attach response, so reconciling now
|
||||
// would be counterproductive.
|
||||
//
|
||||
// Instead, update the in-memory state with the details learned about the
|
||||
// node.
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let (nodes, _tenants, scheduler) = locked.parts_mut();
|
||||
// This is the code path for geniune availability transitions (i.e node
|
||||
// goes unavailable and/or comes back online).
|
||||
let res = self
|
||||
.node_configure(node_id, Some(new_availability), None)
|
||||
.await;
|
||||
|
||||
let mut new_nodes = (**nodes).clone();
|
||||
|
||||
if let Some(node) = new_nodes.get_mut(&node_id) {
|
||||
node.set_availability(new_availability);
|
||||
scheduler.node_upsert(node);
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(ApiError::NotFound(_)) => {
|
||||
// This should be rare, but legitimate since the heartbeats are done
|
||||
// on a snapshot of the nodes.
|
||||
tracing::info!("Node {} was not found after heartbeat round", node_id);
|
||||
}
|
||||
|
||||
locked.nodes = Arc::new(new_nodes);
|
||||
} else {
|
||||
// This is the code path for geniune availability transitions (i.e node
|
||||
// goes unavailable and/or comes back online).
|
||||
let res = self
|
||||
.node_configure(node_id, Some(new_availability), None)
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(ApiError::NotFound(_)) => {
|
||||
// This should be rare, but legitimate since the heartbeats are done
|
||||
// on a snapshot of the nodes.
|
||||
tracing::info!(
|
||||
"Node {} was not found after heartbeat round",
|
||||
node_id
|
||||
);
|
||||
}
|
||||
Err(err) => {
|
||||
// Transition to active involves reconciling: if a node responds to a heartbeat then
|
||||
// becomes unavailable again, we may get an error here.
|
||||
tracing::error!(
|
||||
"Failed to update node {} after heartbeat round: {}",
|
||||
node_id,
|
||||
err
|
||||
);
|
||||
}
|
||||
Err(err) => {
|
||||
// Transition to active involves reconciling: if a node responds to a heartbeat then
|
||||
// becomes unavailable again, we may get an error here.
|
||||
tracing::error!(
|
||||
"Failed to update node {} after heartbeat round: {}",
|
||||
node_id,
|
||||
err
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1152,7 +1155,8 @@ impl Service {
|
||||
let cancel = CancellationToken::new();
|
||||
let heartbeater = Heartbeater::new(
|
||||
config.jwt_token.clone(),
|
||||
config.max_unavailable_interval,
|
||||
config.max_offline_interval,
|
||||
config.max_warming_up_interval,
|
||||
cancel.clone(),
|
||||
);
|
||||
let this = Arc::new(Self {
|
||||
@@ -1664,21 +1668,23 @@ impl Service {
|
||||
| NodeSchedulingPolicy::Filling
|
||||
);
|
||||
|
||||
if !node.is_available() || reset_scheduling {
|
||||
let mut new_nodes = (**nodes).clone();
|
||||
if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) {
|
||||
if !node.is_available() {
|
||||
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
|
||||
}
|
||||
|
||||
if reset_scheduling {
|
||||
node.set_scheduling(NodeSchedulingPolicy::Active);
|
||||
}
|
||||
|
||||
scheduler.node_upsert(node);
|
||||
let new_nodes = Arc::new(new_nodes);
|
||||
*nodes = new_nodes;
|
||||
let mut new_nodes = (**nodes).clone();
|
||||
if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) {
|
||||
if reset_scheduling {
|
||||
node.set_scheduling(NodeSchedulingPolicy::Active);
|
||||
}
|
||||
|
||||
tracing::info!("Marking {} warming-up on reattach", reattach_req.node_id);
|
||||
node.set_availability(NodeAvailability::WarmingUp(std::time::Instant::now()));
|
||||
|
||||
scheduler.node_upsert(node);
|
||||
let new_nodes = Arc::new(new_nodes);
|
||||
*nodes = new_nodes;
|
||||
} else {
|
||||
tracing::error!(
|
||||
"Reattaching node {} was removed while processing the request",
|
||||
reattach_req.node_id
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4719,6 +4725,15 @@ impl Service {
|
||||
|
||||
// TODO: in the background, we should balance work back onto this pageserver
|
||||
}
|
||||
// No action required for the intermediate unavailable state.
|
||||
// When we transition into active or offline from the unavailable state,
|
||||
// the correct handling above will kick in.
|
||||
AvailabilityTransition::ToWarmingUpFromActive => {
|
||||
tracing::info!("Node {} transition to unavailable from active", node_id);
|
||||
}
|
||||
AvailabilityTransition::ToWarmingUpFromOffline => {
|
||||
tracing::info!("Node {} transition to unavailable from offline", node_id);
|
||||
}
|
||||
AvailabilityTransition::Unchanged => {
|
||||
tracing::debug!("Node {} no availability change during config", node_id);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user