storcon: make heartbeats restart aware (#8222)

## Problem
Re-attach blocks the pageserver HTTP server from starting up. Hence, the
pageserver can't reply to heartbeats until re-attach is done. This makes the
storage controller mark the node offline (not good). We worked around this by
setting the interval after which nodes are marked offline to 5 minutes. This
isn't a long-term solution.

## Summary of changes
* Introduce a new `NodeAvailability` state: `WarmingUp`. This state
  models the following time interval:
  * From receiving the re-attach request until the pageserver replies to
    the first heartbeat post re-attach
* The heartbeat delta generator becomes aware of this state and uses a
  separate, longer interval for it
* Flag `max-warming-up-interval` now models the longer timeout and
  `max-offline-interval` the shorter one, to match the names of the states

Closes https://github.com/neondatabase/neon/issues/7552
This commit is contained in:
Vlad Lazar
2024-07-25 14:09:12 +01:00
committed by GitHub
parent f76a4e0ad2
commit 9c5ad21341
17 changed files with 508 additions and 179 deletions

View File

@@ -22,7 +22,8 @@ struct HeartbeaterTask {
state: HashMap<NodeId, PageserverState>,
max_unavailable_interval: Duration,
max_offline_interval: Duration,
max_warming_up_interval: Duration,
jwt_token: Option<String>,
}
@@ -31,7 +32,9 @@ pub(crate) enum PageserverState {
Available {
last_seen_at: Instant,
utilization: PageserverUtilization,
new: bool,
},
WarmingUp {
started_at: Instant,
},
Offline,
}
@@ -57,12 +60,18 @@ pub(crate) struct Heartbeater {
impl Heartbeater {
pub(crate) fn new(
jwt_token: Option<String>,
max_unavailable_interval: Duration,
max_offline_interval: Duration,
max_warming_up_interval: Duration,
cancel: CancellationToken,
) -> Self {
let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::<HeartbeatRequest>();
let mut heartbeater =
HeartbeaterTask::new(receiver, jwt_token, max_unavailable_interval, cancel);
let mut heartbeater = HeartbeaterTask::new(
receiver,
jwt_token,
max_offline_interval,
max_warming_up_interval,
cancel,
);
tokio::task::spawn(async move { heartbeater.run().await });
Self { sender }
@@ -88,14 +97,16 @@ impl HeartbeaterTask {
fn new(
receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
jwt_token: Option<String>,
max_unavailable_interval: Duration,
max_offline_interval: Duration,
max_warming_up_interval: Duration,
cancel: CancellationToken,
) -> Self {
Self {
receiver,
cancel,
state: HashMap::new(),
max_unavailable_interval,
max_offline_interval,
max_warming_up_interval,
jwt_token,
}
}
@@ -128,16 +139,15 @@ impl HeartbeaterTask {
heartbeat_futs.push({
let jwt_token = self.jwt_token.clone();
let cancel = self.cancel.clone();
let new_node = !self.state.contains_key(node_id);
// Clone the node and mark it as available such that the request
// goes through to the pageserver even when the node is marked offline.
// This doesn't impact the availability observed by [`crate::service::Service`].
let mut node = node.clone();
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
let mut node_clone = node.clone();
node_clone.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
async move {
let response = node
let response = node_clone
.with_client_retries(
|client| async move { client.get_utilization().await },
&jwt_token,
@@ -161,7 +171,12 @@ impl HeartbeaterTask {
PageserverState::Available {
last_seen_at: Instant::now(),
utilization,
new: new_node,
}
} else if let NodeAvailability::WarmingUp(last_seen_at) =
node.get_availability()
{
PageserverState::WarmingUp {
started_at: last_seen_at,
}
} else {
PageserverState::Offline
@@ -187,53 +202,67 @@ impl HeartbeaterTask {
}
}
}
let mut warming_up = 0;
let mut offline = 0;
for state in new_state.values() {
match state {
PageserverState::WarmingUp { .. } => {
warming_up += 1;
}
PageserverState::Offline { .. } => offline += 1,
PageserverState::Available { .. } => {}
}
}
tracing::info!(
"Heartbeat round complete for {} nodes, {} offline",
"Heartbeat round complete for {} nodes, {} warming-up, {} offline",
new_state.len(),
new_state
.values()
.filter(|s| match s {
PageserverState::Available { .. } => {
false
}
PageserverState::Offline => true,
})
.count()
warming_up,
offline
);
let mut deltas = Vec::new();
let now = Instant::now();
for (node_id, ps_state) in new_state {
for (node_id, ps_state) in new_state.iter_mut() {
use std::collections::hash_map::Entry::*;
let entry = self.state.entry(node_id);
let entry = self.state.entry(*node_id);
let mut needs_update = false;
match entry {
Occupied(ref occ) => match (occ.get(), &ps_state) {
(PageserverState::Offline, PageserverState::Offline) => {}
(PageserverState::Available { last_seen_at, .. }, PageserverState::Offline) => {
if now - *last_seen_at >= self.max_unavailable_interval {
deltas.push((node_id, ps_state.clone()));
if now - *last_seen_at >= self.max_offline_interval {
deltas.push((*node_id, ps_state.clone()));
needs_update = true;
}
}
(_, PageserverState::WarmingUp { started_at }) => {
if now - *started_at >= self.max_warming_up_interval {
*ps_state = PageserverState::Offline;
}
deltas.push((*node_id, ps_state.clone()));
needs_update = true;
}
_ => {
deltas.push((node_id, ps_state.clone()));
deltas.push((*node_id, ps_state.clone()));
needs_update = true;
}
},
Vacant(_) => {
// This is a new node (no previous state recorded for it): always generate a delta.
deltas.push((node_id, ps_state.clone()));
deltas.push((*node_id, ps_state.clone()));
}
}
match entry {
Occupied(mut occ) if needs_update => {
(*occ.get_mut()) = ps_state;
(*occ.get_mut()) = ps_state.clone();
}
Vacant(vac) => {
vac.insert(ps_state);
vac.insert(ps_state.clone());
}
_ => {}
}

View File

@@ -10,7 +10,8 @@ use storage_controller::http::make_router;
use storage_controller::metrics::preinitialize_metrics;
use storage_controller::persistence::Persistence;
use storage_controller::service::{
Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT,
RECONCILER_CONCURRENCY_DEFAULT,
};
use tokio::signal::unix::SignalKind;
use tokio_util::sync::CancellationToken;
@@ -61,7 +62,12 @@ struct Cli {
/// Grace period before marking unresponsive pageserver offline
#[arg(long)]
max_unavailable_interval: Option<humantime::Duration>,
max_offline_interval: Option<humantime::Duration>,
/// More tolerant grace period before marking an unresponsive pageserver offline,
/// used around pageserver restarts
#[arg(long)]
max_warming_up_interval: Option<humantime::Duration>,
/// Size threshold for automatically splitting shards (disabled by default)
#[arg(long)]
@@ -254,10 +260,14 @@ async fn async_main() -> anyhow::Result<()> {
jwt_token: secrets.jwt_token,
control_plane_jwt_token: secrets.control_plane_jwt_token,
compute_hook_url: args.compute_hook_url,
max_unavailable_interval: args
.max_unavailable_interval
max_offline_interval: args
.max_offline_interval
.map(humantime::Duration::into)
.unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT),
.unwrap_or(MAX_OFFLINE_INTERVAL_DEFAULT),
max_warming_up_interval: args
.max_warming_up_interval
.map(humantime::Duration::into)
.unwrap_or(MAX_WARMING_UP_INTERVAL_DEFAULT),
reconciler_concurrency: args
.reconciler_concurrency
.unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),

View File

@@ -3,7 +3,7 @@ use std::{str::FromStr, time::Duration};
use pageserver_api::{
controller_api::{
NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
TenantLocateResponseShard, UtilizationScore,
TenantLocateResponseShard,
},
shard::TenantShardId,
};
@@ -46,6 +46,8 @@ pub(crate) struct Node {
/// whether/how they changed it.
pub(crate) enum AvailabilityTransition {
ToActive,
ToWarmingUpFromActive,
ToWarmingUpFromOffline,
ToOffline,
Unchanged,
}
@@ -90,22 +92,34 @@ impl Node {
}
}
pub(crate) fn get_availability(&self) -> NodeAvailability {
self.availability
}
pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
use AvailabilityTransition::*;
use NodeAvailability::WarmingUp;
match self.get_availability_transition(availability) {
AvailabilityTransition::ToActive => {
ToActive => {
// Give the node a new cancellation token, effectively resetting it to un-cancelled. Any
// users of previously-cloned copies of the node will still see the old cancellation
// state. For example, Reconcilers in flight will have to complete and be spawned
// again to realize that the node has become available.
self.cancel = CancellationToken::new();
}
AvailabilityTransition::ToOffline => {
ToOffline | ToWarmingUpFromActive => {
// Fire the node's cancellation token to cancel any in-flight API requests to it
self.cancel.cancel();
}
AvailabilityTransition::Unchanged => {}
Unchanged | ToWarmingUpFromOffline => {}
}
if let (WarmingUp(crnt), WarmingUp(proposed)) = (self.availability, availability) {
self.availability = WarmingUp(std::cmp::max(crnt, proposed));
} else {
self.availability = availability;
}
self.availability = availability;
}
/// Without modifying the availability of the node, convert the intended availability
@@ -120,16 +134,10 @@ impl Node {
match (self.availability, availability) {
(Offline, Active(_)) => ToActive,
(Active(_), Offline) => ToOffline,
// Consider the case when the storage controller handles the re-attach of a node
// before the heartbeats detect that the node is back online. We still need
// [`Service::node_configure`] to attempt reconciliations for shards with an
// unknown observed location.
// The unsavoury match arm below handles this situation.
(Active(lhs), Active(rhs))
if lhs == UtilizationScore::worst() && rhs < UtilizationScore::worst() =>
{
ToActive
}
(Active(_), WarmingUp(_)) => ToWarmingUpFromActive,
(WarmingUp(_), Offline) => ToOffline,
(WarmingUp(_), Active(_)) => ToActive,
(Offline, WarmingUp(_)) => ToWarmingUpFromOffline,
_ => Unchanged,
}
}
@@ -147,7 +155,7 @@ impl Node {
pub(crate) fn may_schedule(&self) -> MaySchedule {
let score = match self.availability {
NodeAvailability::Active(score) => score,
NodeAvailability::Offline => return MaySchedule::No,
NodeAvailability::Offline | NodeAvailability::WarmingUp(_) => return MaySchedule::No,
};
match self.scheduling {

View File

@@ -100,9 +100,13 @@ pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
/// How long a node may be unresponsive to heartbeats before we declare it offline.
/// This only needs to cover normal operation: node restarts are covered by the
/// separate, more lenient [`MAX_WARMING_UP_INTERVAL_DEFAULT`]
/// (`<https://github.com/neondatabase/neon/issues/7552>`)
pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);
pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30);
/// How long a node may be unresponsive to heartbeats during start up before we declare it
/// offline. This is much more lenient than [`MAX_OFFLINE_INTERVAL_DEFAULT`] since the pageserver's
/// handling of the re-attach response may take a long time and blocks heartbeats from
/// being handled on the pageserver side.
pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);
#[derive(Clone, strum_macros::Display)]
enum TenantOperations {
@@ -236,7 +240,12 @@ pub struct Config {
/// Grace period within which a pageserver does not respond to heartbeats, but is still
/// considered active. Once the grace period elapses, the next heartbeat failure will
/// mark the pageserver offline.
pub max_unavailable_interval: Duration,
pub max_offline_interval: Duration,
/// Extended grace period within which pageserver may not respond to heartbeats.
/// This extended grace period kicks in after the node has been drained for restart
/// and/or upon handling the re-attach request from a node.
pub max_warming_up_interval: Duration,
/// How many Reconcilers may be spawned concurrently
pub reconciler_concurrency: usize,
@@ -587,6 +596,9 @@ impl Service {
online_nodes.insert(node_id, utilization);
}
PageserverState::Offline => {}
PageserverState::WarmingUp { .. } => {
unreachable!("Nodes are never marked warming-up during startup reconcile")
}
}
}
}
@@ -779,63 +791,54 @@ impl Service {
let res = self.heartbeater.heartbeat(nodes).await;
if let Ok(deltas) = res {
for (node_id, state) in deltas.0 {
let (new_node, new_availability) = match state {
PageserverState::Available {
utilization, new, ..
} => (
new,
NodeAvailability::Active(UtilizationScore(
utilization.utilization_score,
)),
let new_availability = match state {
PageserverState::Available { utilization, .. } => NodeAvailability::Active(
UtilizationScore(utilization.utilization_score),
),
PageserverState::Offline => (false, NodeAvailability::Offline),
PageserverState::WarmingUp { started_at } => {
NodeAvailability::WarmingUp(started_at)
}
PageserverState::Offline => {
// The node might have been placed in the WarmingUp state
// while the heartbeat round was on-going. Hence, filter out
// offline transitions for WarmingUp nodes that are still within
// their grace period.
if let Ok(NodeAvailability::WarmingUp(started_at)) =
self.get_node(node_id).await.map(|n| n.get_availability())
{
let now = Instant::now();
if now - started_at >= self.config.max_warming_up_interval {
NodeAvailability::Offline
} else {
NodeAvailability::WarmingUp(started_at)
}
} else {
NodeAvailability::Offline
}
}
};
if new_node {
// When the heartbeats detect a newly added node, we don't wish
// to attempt to reconcile the shards assigned to it. The node
// is likely handling its re-attach response, so reconciling now
// would be counterproductive.
//
// Instead, update the in-memory state with the details learned about the
// node.
let mut locked = self.inner.write().unwrap();
let (nodes, _tenants, scheduler) = locked.parts_mut();
// This is the code path for genuine availability transitions (i.e. node
// goes unavailable and/or comes back online).
let res = self
.node_configure(node_id, Some(new_availability), None)
.await;
let mut new_nodes = (**nodes).clone();
if let Some(node) = new_nodes.get_mut(&node_id) {
node.set_availability(new_availability);
scheduler.node_upsert(node);
match res {
Ok(()) => {}
Err(ApiError::NotFound(_)) => {
// This should be rare, but legitimate since the heartbeats are done
// on a snapshot of the nodes.
tracing::info!("Node {} was not found after heartbeat round", node_id);
}
locked.nodes = Arc::new(new_nodes);
} else {
// This is the code path for genuine availability transitions (i.e. node
// goes unavailable and/or comes back online).
let res = self
.node_configure(node_id, Some(new_availability), None)
.await;
match res {
Ok(()) => {}
Err(ApiError::NotFound(_)) => {
// This should be rare, but legitimate since the heartbeats are done
// on a snapshot of the nodes.
tracing::info!(
"Node {} was not found after heartbeat round",
node_id
);
}
Err(err) => {
// Transition to active involves reconciling: if a node responds to a heartbeat then
// becomes unavailable again, we may get an error here.
tracing::error!(
"Failed to update node {} after heartbeat round: {}",
node_id,
err
);
}
Err(err) => {
// Transition to active involves reconciling: if a node responds to a heartbeat then
// becomes unavailable again, we may get an error here.
tracing::error!(
"Failed to update node {} after heartbeat round: {}",
node_id,
err
);
}
}
}
@@ -1152,7 +1155,8 @@ impl Service {
let cancel = CancellationToken::new();
let heartbeater = Heartbeater::new(
config.jwt_token.clone(),
config.max_unavailable_interval,
config.max_offline_interval,
config.max_warming_up_interval,
cancel.clone(),
);
let this = Arc::new(Self {
@@ -1664,21 +1668,23 @@ impl Service {
| NodeSchedulingPolicy::Filling
);
if !node.is_available() || reset_scheduling {
let mut new_nodes = (**nodes).clone();
if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) {
if !node.is_available() {
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
}
if reset_scheduling {
node.set_scheduling(NodeSchedulingPolicy::Active);
}
scheduler.node_upsert(node);
let new_nodes = Arc::new(new_nodes);
*nodes = new_nodes;
let mut new_nodes = (**nodes).clone();
if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) {
if reset_scheduling {
node.set_scheduling(NodeSchedulingPolicy::Active);
}
tracing::info!("Marking {} warming-up on reattach", reattach_req.node_id);
node.set_availability(NodeAvailability::WarmingUp(std::time::Instant::now()));
scheduler.node_upsert(node);
let new_nodes = Arc::new(new_nodes);
*nodes = new_nodes;
} else {
tracing::error!(
"Reattaching node {} was removed while processing the request",
reattach_req.node_id
);
}
}
@@ -4719,6 +4725,15 @@ impl Service {
// TODO: in the background, we should balance work back onto this pageserver
}
// No action required for the intermediate warming-up state.
// When we transition into active or offline from the warming-up state,
// the correct handling above will kick in.
AvailabilityTransition::ToWarmingUpFromActive => {
tracing::info!("Node {} transition to unavailable from active", node_id);
}
AvailabilityTransition::ToWarmingUpFromOffline => {
tracing::info!("Node {} transition to unavailable from offline", node_id);
}
AvailabilityTransition::Unchanged => {
tracing::debug!("Node {} no availability change during config", node_id);
}