use std::str::FromStr;
use std::time::Duration;

use pageserver_api::controller_api::{
    AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeLifecycle, NodeRegisterRequest,
    NodeSchedulingPolicy, TenantLocateResponseShard,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use reqwest::StatusCode;
use serde::Serialize;
use tokio_util::sync::CancellationToken;
use utils::backoff;
use utils::id::NodeId;

use crate::pageserver_client::PageserverClient;
use crate::persistence::NodePersistence;
use crate::scheduler::MaySchedule;

/// Represents the in-memory description of a Node.
///
/// Scheduling statistics are maintained separately in [`crate::scheduler`].
///
/// The persistent subset of the Node is defined in [`crate::persistence::NodePersistence`]: the
/// implementation of serialization on this type is only for debug dumps.
#[derive(Clone, Serialize)]
pub(crate) struct Node {
    id: NodeId,

    availability: NodeAvailability,
    scheduling: NodeSchedulingPolicy,
    lifecycle: NodeLifecycle,

    listen_http_addr: String,
    listen_http_port: u16,
    listen_https_port: Option<u16>,

    listen_pg_addr: String,
    listen_pg_port: u16,
    listen_grpc_addr: Option<String>,
    listen_grpc_port: Option<u16>,

    availability_zone_id: AvailabilityZone,

    // Flag from storcon's config to use https for pageserver admin API.
    // Invariant: if `true`, listen_https_port should contain a value.
    use_https: bool,
    // This cancellation token means "stop any RPCs in flight to this node, and don't start
    // any more". It is not related to process shutdown.
    #[serde(skip)]
    cancel: CancellationToken,
}

#[allow(dead_code)]
const ONE_MILLION: i64 = 1000000;

/// Converts a pool ID to a large number that can be used to assign unique IDs to pods in StatefulSets.
/// For example, if pool_id is 1, then the pods have NodeIds 1000000, 1000001, 1000002, etc.
/// If pool_id is None, then the pods have NodeIds 0, 1, 2, etc.
#[allow(dead_code)]
pub fn transform_pool_id(pool_id: Option<i32>) -> i64 {
    match pool_id {
        Some(id) => (id as i64) * ONE_MILLION,
        None => 0,
    }
}

#[allow(dead_code)]
pub fn get_pool_id_from_node_id(node_id: i64) -> i32 {
    (node_id / ONE_MILLION) as i32
}

/// Example pod name: page-server-0-1, safe-keeper-1-0
#[allow(dead_code)]
pub fn get_node_id_from_pod_name(pod_name: &str) -> anyhow::Result<NodeId> {
    let parts: Vec<&str> = pod_name.split('-').collect();
    if parts.len() != 4 {
        return Err(anyhow::anyhow!("Invalid pod name: {}", pod_name));
    }
    let pool_id = parts[2].parse::<i32>()?;
    let node_offset = parts[3].parse::<i64>()?;
    let node_id = transform_pool_id(Some(pool_id)) + node_offset;

    Ok(NodeId(node_id as u64))
}

/// When updating [`Node::availability`] we use this type to indicate to the caller
/// whether/how they changed it.
pub(crate) enum AvailabilityTransition {
    ToActive,
    ToWarmingUpFromActive,
    ToWarmingUpFromOffline,
    ToOffline,
    Unchanged,
}

impl Node {
    pub(crate) fn base_url(&self) -> String {
        if self.use_https {
            format!(
                "https://{}:{}",
                self.listen_http_addr,
                self.listen_https_port
                    .expect("https port should be specified if use_https is on")
            )
        } else {
            format!("http://{}:{}", self.listen_http_addr, self.listen_http_port)
        }
    }

    pub(crate) fn get_id(&self) -> NodeId {
        self.id
    }

    #[allow(unused)]
    pub(crate) fn get_availability_zone_id(&self) -> &AvailabilityZone {
        &self.availability_zone_id
    }

    pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy {
        self.scheduling
    }

    pub(crate) fn set_scheduling(&mut self, scheduling: NodeSchedulingPolicy) {
        self.scheduling = scheduling
    }

    pub(crate) fn has_https_port(&self) -> bool {
        self.listen_https_port.is_some()
    }

    /// Does this registration request match `self`? This is used when deciding whether a registration
    /// request should be allowed to update an existing record with the same node ID.
    pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool {
        self.id == register_req.node_id
            && self.listen_http_addr == register_req.listen_http_addr
            && self.listen_http_port == register_req.listen_http_port
            // Note: HTTPS and gRPC addresses may change, to allow for migrations. See
            // [`Self::need_update`] for more details.
            && self.listen_pg_addr == register_req.listen_pg_addr
            && self.listen_pg_port == register_req.listen_pg_port
            && self.availability_zone_id == register_req.availability_zone_id
    }

    /// Do we need to update an existing record in DB on this registration request?
    pub(crate) fn need_update(&self, register_req: &NodeRegisterRequest) -> bool {
        // These are checked here, since they may change before we're fully migrated.
        self.listen_https_port != register_req.listen_https_port
            || self.listen_grpc_addr != register_req.listen_grpc_addr
            || self.listen_grpc_port != register_req.listen_grpc_port
    }

    /// For a shard located on this node, populate a response object
    /// with this node's address information.
    pub(crate) fn shard_location(&self, shard_id: TenantShardId) -> TenantLocateResponseShard {
        TenantLocateResponseShard {
            shard_id,
            node_id: self.id,
            listen_http_addr: self.listen_http_addr.clone(),
            listen_http_port: self.listen_http_port,
            listen_https_port: self.listen_https_port,
            listen_pg_addr: self.listen_pg_addr.clone(),
            listen_pg_port: self.listen_pg_port,
            listen_grpc_addr: self.listen_grpc_addr.clone(),
            listen_grpc_port: self.listen_grpc_port,
        }
    }

    pub(crate) fn get_availability(&self) -> &NodeAvailability {
        &self.availability
    }

    pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
        use AvailabilityTransition::*;
        use NodeAvailability::WarmingUp;

        match self.get_availability_transition(&availability) {
            ToActive => {
                // Give the node a new cancellation token, effectively resetting it to un-cancelled. Any
                // users of previously-cloned copies of the node will still see the old cancellation
                // state. For example, Reconcilers in flight will have to complete and be spawned
                // again to realize that the node has become available.
                self.cancel = CancellationToken::new();
            }
            ToOffline | ToWarmingUpFromActive => {
                // Fire the node's cancellation token to cancel any in-flight API requests to it
                self.cancel.cancel();
            }
            Unchanged | ToWarmingUpFromOffline => {}
        }

        if let (WarmingUp(crnt), WarmingUp(proposed)) = (&self.availability, &availability) {
            self.availability = WarmingUp(std::cmp::max(*crnt, *proposed));
        } else {
            self.availability = availability;
        }
    }

    /// Without modifying the availability of the node, convert the intended availability
    /// into a description of the transition.
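    ///
    /// Illustrative sketch (not compiled), assuming `node` is a `Node` that is currently
    /// `Offline` and `utilization` is the payload carried by `NodeAvailability::Active`:
    ///
    /// ```ignore
    /// let next = NodeAvailability::Active(utilization);
    /// assert!(matches!(
    ///     node.get_availability_transition(&next),
    ///     AvailabilityTransition::ToActive
    /// ));
    /// ```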
    pub(crate) fn get_availability_transition(
        &self,
        availability: &NodeAvailability,
    ) -> AvailabilityTransition {
        use AvailabilityTransition::*;
        use NodeAvailability::*;

        match (&self.availability, availability) {
            (Offline, Active(_)) => ToActive,
            (Active(_), Offline) => ToOffline,
            (Active(_), WarmingUp(_)) => ToWarmingUpFromActive,
            (WarmingUp(_), Offline) => ToOffline,
            (WarmingUp(_), Active(_)) => ToActive,
            (Offline, WarmingUp(_)) => ToWarmingUpFromOffline,
            _ => Unchanged,
        }
    }

    /// Whether we may send API requests to this node.
    pub(crate) fn is_available(&self) -> bool {
        // When we clone a node, [`Self::availability`] is a snapshot, but [`Self::cancel`] holds
        // a reference to the original Node's cancellation status. Checking both of these results
        // in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable
        // when we cloned it, or if the original Node instance's cancellation token was fired.
        matches!(self.availability, NodeAvailability::Active(_)) && !self.cancel.is_cancelled()
    }

    /// Is this node eligible to have work scheduled onto it?
    pub(crate) fn may_schedule(&self) -> MaySchedule {
        let utilization = match &self.availability {
            NodeAvailability::Active(u) => u.clone(),
            NodeAvailability::Offline | NodeAvailability::WarmingUp(_) => return MaySchedule::No,
        };

        match self.scheduling {
            NodeSchedulingPolicy::Active => MaySchedule::Yes(utilization),
            NodeSchedulingPolicy::Deleting => MaySchedule::No,
            NodeSchedulingPolicy::Draining => MaySchedule::No,
            NodeSchedulingPolicy::Filling => MaySchedule::Yes(utilization),
            NodeSchedulingPolicy::Pause => MaySchedule::No,
            NodeSchedulingPolicy::PauseForRestart => MaySchedule::No,
        }
    }

    #[allow(clippy::too_many_arguments)]
    pub(crate) fn new(
        id: NodeId,
        listen_http_addr: String,
        listen_http_port: u16,
        listen_https_port: Option<u16>,
        listen_pg_addr: String,
        listen_pg_port: u16,
        listen_grpc_addr: Option<String>,
        listen_grpc_port: Option<u16>,
        availability_zone_id: AvailabilityZone,
        use_https: bool,
    ) -> anyhow::Result<Self> {
        if use_https && listen_https_port.is_none() {
            anyhow::bail!(
                "cannot create node {id}: \
                https is enabled, but https port is not specified"
            );
        }

        if listen_grpc_addr.is_some() != listen_grpc_port.is_some() {
            anyhow::bail!("cannot create node {id}: must specify both gRPC address and port");
        }

        Ok(Self {
            id,
            listen_http_addr,
            listen_http_port,
            listen_https_port,
            listen_pg_addr,
            listen_pg_port,
            listen_grpc_addr,
            listen_grpc_port,
            scheduling: NodeSchedulingPolicy::Active,
            lifecycle: NodeLifecycle::Active,
            availability: NodeAvailability::Offline,
            availability_zone_id,
            use_https,
            cancel: CancellationToken::new(),
        })
    }

    pub(crate) fn to_persistent(&self) -> NodePersistence {
        NodePersistence {
            node_id: self.id.0 as i64,
            scheduling_policy: self.scheduling.into(),
            lifecycle: self.lifecycle.into(),
            listen_http_addr: self.listen_http_addr.clone(),
            listen_http_port: self.listen_http_port as i32,
            listen_https_port: self.listen_https_port.map(|x| x as i32),
            listen_pg_addr: self.listen_pg_addr.clone(),
            listen_pg_port: self.listen_pg_port as i32,
            listen_grpc_addr: self.listen_grpc_addr.clone(),
            listen_grpc_port: self.listen_grpc_port.map(|port| port as i32),
            availability_zone_id: self.availability_zone_id.0.clone(),
        }
    }

    pub(crate) fn from_persistent(np: NodePersistence, use_https: bool) -> anyhow::Result<Self> {
        if use_https && np.listen_https_port.is_none() {
            anyhow::bail!(
                "cannot load node {} from persistent state: \
                https is enabled, but https port is not specified",
                np.node_id,
            );
        }

        if np.listen_grpc_addr.is_some() != np.listen_grpc_port.is_some() {
            anyhow::bail!(
                "cannot load node {}: must specify both gRPC address and port",
                np.node_id
            );
        }

        Ok(Self {
            id: NodeId(np.node_id as u64),
            // At startup we consider a node offline until proven otherwise.
            availability: NodeAvailability::Offline,
            scheduling: NodeSchedulingPolicy::from_str(&np.scheduling_policy)
                .expect("Bad scheduling policy in DB"),
            lifecycle: NodeLifecycle::from_str(&np.lifecycle).expect("Bad lifecycle in DB"),
            listen_http_addr: np.listen_http_addr,
            listen_http_port: np.listen_http_port as u16,
            listen_https_port: np.listen_https_port.map(|x| x as u16),
            listen_pg_addr: np.listen_pg_addr,
            listen_pg_port: np.listen_pg_port as u16,
            listen_grpc_addr: np.listen_grpc_addr,
            listen_grpc_port: np.listen_grpc_port.map(|port| port as u16),
            availability_zone_id: AvailabilityZone(np.availability_zone_id),
            use_https,
            cancel: CancellationToken::new(),
        })
    }

    /// Wrapper for issuing requests to pageserver management API: takes care of generic
    /// retry/backoff for retryable HTTP status codes.
    ///
    /// This will return None to indicate cancellation. Cancellation may happen from
    /// the cancellation token passed in, or from Self's cancellation token (i.e. node
    /// going offline).
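    ///
    /// Illustrative call sketch (not compiled): `some_mgmt_api_call` is a placeholder for any
    /// `PageserverClient` method returning `mgmt_api::Result<T>`, and `http_client`, `jwt` and
    /// `cancel` are assumed to be in scope in the caller:
    ///
    /// ```ignore
    /// let result = node
    ///     .with_client_retries(
    ///         |client| async move { client.some_mgmt_api_call().await },
    ///         &http_client,
    ///         &jwt,
    ///         1,                       // warn_threshold
    ///         3,                       // max_retries
    ///         Duration::from_secs(10), // per-attempt timeout
    ///         &cancel,
    ///     )
    ///     .await;
    /// match result {
    ///     None => { /* node or caller was cancelled */ }
    ///     Some(Ok(response)) => { /* success */ }
    ///     Some(Err(e)) => { /* fatal error, or retries exhausted */ }
    /// }
    /// ```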
    #[allow(clippy::too_many_arguments)]
    pub(crate) async fn with_client_retries<T, O, F>(
        &self,
        mut op: O,
        http_client: &reqwest::Client,
        jwt: &Option<String>,
        warn_threshold: u32,
        max_retries: u32,
        timeout: Duration,
        cancel: &CancellationToken,
    ) -> Option<mgmt_api::Result<T>>
    where
        O: FnMut(PageserverClient) -> F,
        F: std::future::Future<Output = mgmt_api::Result<T>>,
    {
        fn is_fatal(e: &mgmt_api::Error) -> bool {
            use mgmt_api::Error::*;
            match e {
                SendRequest(_) | ReceiveBody(_) | ReceiveErrorBody(_) => false,
                ApiError(StatusCode::SERVICE_UNAVAILABLE, _)
                | ApiError(StatusCode::GATEWAY_TIMEOUT, _)
                | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false,
                ApiError(_, _) => true,
                Cancelled => true,
                Timeout(_) => false,
            }
        }

        backoff::retry(
            || {
                let client = PageserverClient::new(
                    self.get_id(),
                    http_client.clone(),
                    self.base_url(),
                    jwt.as_deref(),
                );

                let node_cancel_fut = self.cancel.cancelled();

                let op_fut = tokio::time::timeout(timeout, op(client));

                async {
                    tokio::select! {
                        r = op_fut => match r {
                            Ok(r) => r,
                            Err(e) => Err(mgmt_api::Error::Timeout(format!("{e}"))),
                        },
                        _ = node_cancel_fut => {
                            Err(mgmt_api::Error::Cancelled)
                        }
                    }
                }
            },
            is_fatal,
            warn_threshold,
            max_retries,
            &format!(
                "Call to node {} ({}) management API",
                self.id,
                self.base_url(),
            ),
            cancel,
        )
        .await
    }

    /// Generate the simplified API-friendly description of a node's state
    pub(crate) fn describe(&self) -> NodeDescribeResponse {
        NodeDescribeResponse {
            id: self.id,
            availability: self.availability.clone().into(),
            scheduling: self.scheduling,
            availability_zone_id: self.availability_zone_id.0.clone(),
            listen_http_addr: self.listen_http_addr.clone(),
            listen_http_port: self.listen_http_port,
            listen_https_port: self.listen_https_port,
            listen_pg_addr: self.listen_pg_addr.clone(),
            listen_pg_port: self.listen_pg_port,
            listen_grpc_addr: self.listen_grpc_addr.clone(),
            listen_grpc_port: self.listen_grpc_port,
        }
    }
}

impl std::fmt::Display for Node {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{} ({})", self.id, self.listen_http_addr)
    }
}

impl std::fmt::Debug for Node {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{} ({})", self.id, self.listen_http_addr)
    }
}

#[cfg(test)]
mod tests {
    use utils::id::NodeId;

    use crate::node::get_node_id_from_pod_name;

    #[test]
    fn test_get_node_id_from_pod_name() {
        let pod_name = "page-server-3-12";
        let node_id = get_node_id_from_pod_name(pod_name).unwrap();
        assert_eq!(node_id, NodeId(3000012));

        let pod_name = "safe-keeper-1-0";
        let node_id = get_node_id_from_pod_name(pod_name).unwrap();
        assert_eq!(node_id, NodeId(1000000));

        let pod_name = "invalid-pod-name";
        let result = get_node_id_from_pod_name(pod_name);
        assert!(result.is_err());
    }
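
    // Illustrative additions (minimal sketches, not derived from existing tests): a round-trip
    // check for the pool-id helpers above, and a check that `Node::base_url` honors `use_https`.

    #[test]
    fn test_pool_id_round_trip() {
        use crate::node::{get_pool_id_from_node_id, transform_pool_id};

        // No pool: pod NodeIds start at 0.
        assert_eq!(transform_pool_id(None), 0);

        // Pool 3, pod ordinal 12 => NodeId 3000012, and the pool id is recoverable from it.
        let node_id = transform_pool_id(Some(3)) + 12;
        assert_eq!(node_id, 3000012);
        assert_eq!(get_pool_id_from_node_id(node_id), 3);
    }

    #[test]
    fn test_base_url_uses_https_when_enabled() {
        use pageserver_api::controller_api::AvailabilityZone;

        // With https enabled (and the https port provided, as `Node::new` requires),
        // `base_url` should point at the https address.
        let node = crate::node::Node::new(
            NodeId(1),
            "localhost".to_string(),
            1234,
            Some(1235),
            "localhost".to_string(),
            5432,
            None,
            None,
            AvailabilityZone("az-test".to_string()),
            true,
        )
        .unwrap();
        assert_eq!(node.base_url(), "https://localhost:1235");
    }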
}