Files
neon/libs/pageserver_api/src/controller_api.rs
HaoyuHuang f67a8a173e A few SK changes (#12577)
# TLDR 
This PR is a no-op. 

## Problem
When a SK loses a disk, it must recover all WALs from the very
beginning. This may take days/weeks to catch up to the latest WALs for
all timelines it owns.

## Summary of changes
When SK starts up,
if it finds that it has 0 timelines,
- it will ask SC for the timeline it owns.
- Then, pulls the timeline from its peer safekeepers to restore the WAL
redundancy right away.

After pulling timeline is complete, it will become active and accepts
new WALs.

The current impl is a prototype. We can optimize the impl further, e.g.,
parallel pull timelines.

---------

Co-authored-by: Haoyu Huang <haoyu.huang@databricks.com>
2025-07-14 16:37:04 +00:00

680 lines
22 KiB
Rust

use std::collections::{HashMap, HashSet};
use std::fmt::Display;
use std::net::IpAddr;
use std::str::FromStr;
use std::time::{Duration, Instant};
/// Request/response types for the storage controller
/// API (`/control/v1` prefix). Implemented by the server
/// in [`storage_controller::http`]
use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
use crate::models::{PageserverUtilization, ShardParameters, TenantConfig, TimelineInfo};
use crate::shard::{ShardStripeSize, TenantShardId};
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantCreateRequest {
pub new_tenant_id: TenantShardId,
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub generation: Option<u32>,
// If omitted, create a single shard with TenantShardId::unsharded()
#[serde(default)]
#[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
pub shard_parameters: ShardParameters,
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub placement_policy: Option<PlacementPolicy>,
#[serde(flatten)]
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub generation: u32,
}
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponse {
pub shards: Vec<TenantCreateResponseShard>,
}
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct NodeRegisterRequest {
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_grpc_addr: Option<String>,
pub listen_grpc_port: Option<u16>,
pub listen_http_addr: String,
pub listen_http_port: u16,
pub listen_https_port: Option<u16>,
pub availability_zone_id: AvailabilityZone,
// Reachable IP address of the PS/SK registering, if known.
// Hadron Cluster Coordiantor will update the DNS record of the registering node
// with this IP address.
pub node_ip_addr: Option<IpAddr>,
}
#[derive(Serialize, Deserialize)]
pub struct NodeConfigureRequest {
pub node_id: NodeId,
pub availability: Option<NodeAvailabilityWrapper>,
pub scheduling: Option<NodeSchedulingPolicy>,
}
#[derive(Serialize, Deserialize)]
pub struct TenantPolicyRequest {
pub placement: Option<PlacementPolicy>,
pub scheduling: Option<ShardSchedulingPolicy>,
}
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug, PartialOrd, Ord)]
pub struct AvailabilityZone(pub String);
impl Display for AvailabilityZone {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
#[derive(Serialize, Deserialize)]
pub struct ShardsPreferredAzsRequest {
#[serde(flatten)]
pub preferred_az_ids: HashMap<TenantShardId, Option<AvailabilityZone>>,
}
#[derive(Serialize, Deserialize)]
pub struct ShardsPreferredAzsResponse {
pub updated: Vec<TenantShardId>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantLocateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_grpc_addr: Option<String>,
pub listen_grpc_port: Option<u16>,
pub listen_http_addr: String,
pub listen_http_port: u16,
pub listen_https_port: Option<u16>,
}
#[derive(Serialize, Deserialize)]
pub struct TenantLocateResponse {
pub shards: Vec<TenantLocateResponseShard>,
pub shard_params: ShardParameters,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantDescribeResponse {
pub tenant_id: TenantId,
pub shards: Vec<TenantDescribeResponseShard>,
pub stripe_size: ShardStripeSize,
pub policy: PlacementPolicy,
pub config: TenantConfig,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantTimelineDescribeResponse {
pub shards: Vec<TimelineInfo>,
#[serde(skip_serializing_if = "Option::is_none")]
pub image_consistent_lsn: Option<Lsn>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct NodeShardResponse {
pub node_id: NodeId,
pub shards: Vec<NodeShard>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct NodeShard {
pub tenant_shard_id: TenantShardId,
/// Whether the shard is observed secondary on a specific node. True = yes, False = no, None = not on this node.
pub is_observed_secondary: Option<bool>,
/// Whether the shard is intended to be a secondary on a specific node. True = yes, False = no, None = not on this node.
pub is_intended_secondary: Option<bool>,
}
#[derive(Serialize, Deserialize)]
pub struct NodeDescribeResponse {
pub id: NodeId,
pub availability: NodeAvailabilityWrapper,
pub scheduling: NodeSchedulingPolicy,
pub availability_zone_id: String,
pub listen_http_addr: String,
pub listen_http_port: u16,
pub listen_https_port: Option<u16>,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
pub listen_grpc_addr: Option<String>,
pub listen_grpc_port: Option<u16>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantDescribeResponseShard {
pub tenant_shard_id: TenantShardId,
pub node_attached: Option<NodeId>,
pub node_secondary: Vec<NodeId>,
pub last_error: String,
/// A task is currently running to reconcile this tenant's intent state with the state on pageservers
pub is_reconciling: bool,
/// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending.
pub is_pending_compute_notification: bool,
/// A shard split is currently underway
pub is_splitting: bool,
/// A timeline is being imported into this tenant
pub is_importing: bool,
pub scheduling_policy: ShardSchedulingPolicy,
pub preferred_az_id: Option<String>,
}
/// Migration request for a given tenant shard to a given node.
///
/// Explicitly migrating a particular shard is a low level operation
/// TODO: higher level "Reschedule tenant" operation where the request
/// specifies some constraints, e.g. asking it to get off particular node(s)
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub node_id: NodeId,
/// Optionally, callers may specify the node they are migrating _from_, and the server will
/// reject the request if the shard is no longer attached there: this enables writing safer
/// clients that don't risk fighting with some other movement of the shard.
#[serde(default)]
pub origin_node_id: Option<NodeId>,
#[serde(default)]
pub migration_config: MigrationConfig,
}
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
pub struct MigrationConfig {
/// If true, the migration will be executed even if it is to a location with a sub-optimal scheduling
/// score: this is usually not what you want, and if you use this then you'll also need to set the
/// tenant's scheduling policy to Essential or Pause to avoid the optimiser reverting your migration.
///
/// Default: false
#[serde(default)]
pub override_scheduler: bool,
/// If true, the migration will be done gracefully by creating a secondary location first and
/// waiting for it to warm up before cutting over. If false, if there is no existing secondary
/// location at the destination, the tenant will be migrated immediately. If the tenant's data
/// can't be downloaded within [`Self::secondary_warmup_timeout`], then the migration will go
/// ahead but run with a cold cache that can severely reduce performance until it warms up.
///
/// When doing a graceful migration, the migration API returns as soon as it is started.
///
/// Default: true
#[serde(default = "default_prewarm")]
pub prewarm: bool,
/// For non-prewarm migrations which will immediately enter a cutover to the new node: how long to wait
/// overall for secondary warmup before cutting over
#[serde(default)]
#[serde(with = "humantime_serde")]
pub secondary_warmup_timeout: Option<Duration>,
/// For non-prewarm migrations which will immediately enter a cutover to the new node: how long to wait
/// within each secondary download poll call to pageserver.
#[serde(default)]
#[serde(with = "humantime_serde")]
pub secondary_download_request_timeout: Option<Duration>,
}
fn default_prewarm() -> bool {
true
}
impl Default for MigrationConfig {
fn default() -> Self {
Self {
override_scheduler: false,
prewarm: default_prewarm(),
secondary_warmup_timeout: None,
secondary_download_request_timeout: None,
}
}
}
#[derive(Serialize, Clone, Debug)]
#[serde(into = "NodeAvailabilityWrapper")]
pub enum NodeAvailability {
// Normal, happy state
Active(PageserverUtilization),
// Node is warming up, but we expect it to become available soon. Covers
// the time span between the re-attach response being composed on the storage controller
// and the first successful heartbeat after the processing of the re-attach response
// finishes on the pageserver.
WarmingUp(Instant),
// Offline: Tenants shouldn't try to attach here, but they may assume that their
// secondary locations on this node still exist. Newly added nodes are in this
// state until we successfully contact them.
Offline,
}
impl PartialEq for NodeAvailability {
fn eq(&self, other: &Self) -> bool {
use NodeAvailability::*;
matches!(
(self, other),
(Active(_), Active(_)) | (Offline, Offline) | (WarmingUp(_), WarmingUp(_))
)
}
}
impl Eq for NodeAvailability {}
// This wrapper provides serde functionality and it should only be used to
// communicate with external callers which don't know or care about the
// utilisation score of the pageserver it is targeting.
#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
pub enum NodeAvailabilityWrapper {
Active,
WarmingUp,
Offline,
}
impl From<NodeAvailabilityWrapper> for NodeAvailability {
fn from(val: NodeAvailabilityWrapper) -> Self {
match val {
// Assume the worst utilisation score to begin with. It will later be updated by
// the heartbeats.
NodeAvailabilityWrapper::Active => {
NodeAvailability::Active(PageserverUtilization::full())
}
NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
}
}
}
impl From<NodeAvailability> for NodeAvailabilityWrapper {
fn from(val: NodeAvailability) -> Self {
match val {
NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active,
NodeAvailability::WarmingUp(_) => NodeAvailabilityWrapper::WarmingUp,
NodeAvailability::Offline => NodeAvailabilityWrapper::Offline,
}
}
}
/// Scheduling policy enables us to selectively disable some automatic actions that the
/// controller performs on a tenant shard. This is only set to a non-default value by
/// human intervention, and it is reset to the default value (Active) when the tenant's
/// placement policy is modified away from Attached.
///
/// The typical use of a non-Active scheduling policy is one of:
/// - Pinnning a shard to a node (i.e. migrating it there & setting a non-Active scheduling policy)
/// - Working around a bug (e.g. if something is flapping and we need to stop it until the bug is fixed)
///
/// If you're not sure which policy to use to pin a shard to its current location, you probably
/// want Pause.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
pub enum ShardSchedulingPolicy {
// Normal mode: the tenant's scheduled locations may be updated at will, including
// for non-essential optimization.
Active,
// Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy.
// For example, this still permits a node's attachment location to change to a secondary in
// response to a node failure, or to assign a new secondary if a node was removed.
Essential,
// No scheduling: leave the shard running wherever it currently is. Even if the shard is
// unavailable, it will not be rescheduled to another node.
Pause,
// No reconciling: we will make no location_conf API calls to pageservers at all. If the
// shard is unavailable, it stays that way. If a node fails, this shard doesn't get failed over.
Stop,
}
impl Default for ShardSchedulingPolicy {
fn default() -> Self {
Self::Active
}
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
pub enum NodeLifecycle {
Active,
Deleted,
}
impl FromStr for NodeLifecycle {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"deleted" => Ok(Self::Deleted),
_ => Err(anyhow::anyhow!("Unknown node lifecycle '{s}'")),
}
}
}
impl From<NodeLifecycle> for String {
fn from(value: NodeLifecycle) -> String {
use NodeLifecycle::*;
match value {
Active => "active",
Deleted => "deleted",
}
.to_string()
}
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
pub enum NodeSchedulingPolicy {
Active,
Filling,
Pause,
PauseForRestart,
Draining,
Deleting,
}
impl FromStr for NodeSchedulingPolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"filling" => Ok(Self::Filling),
"pause" => Ok(Self::Pause),
"pause_for_restart" => Ok(Self::PauseForRestart),
"draining" => Ok(Self::Draining),
"deleting" => Ok(Self::Deleting),
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
}
}
}
impl From<NodeSchedulingPolicy> for String {
fn from(value: NodeSchedulingPolicy) -> String {
use NodeSchedulingPolicy::*;
match value {
Active => "active",
Filling => "filling",
Pause => "pause",
PauseForRestart => "pause_for_restart",
Draining => "draining",
Deleting => "deleting",
}
.to_string()
}
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
pub enum SkSchedulingPolicy {
Active,
Activating,
Pause,
Decomissioned,
}
impl FromStr for SkSchedulingPolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
Ok(match s {
"active" => Self::Active,
"activating" => Self::Activating,
"pause" => Self::Pause,
"decomissioned" => Self::Decomissioned,
_ => {
return Err(anyhow::anyhow!(
"Unknown scheduling policy '{s}', try active,pause,decomissioned"
));
}
})
}
}
impl From<SkSchedulingPolicy> for String {
fn from(value: SkSchedulingPolicy) -> String {
use SkSchedulingPolicy::*;
match value {
Active => "active",
Activating => "activating",
Pause => "pause",
Decomissioned => "decomissioned",
}
.to_string()
}
}
/// Controls how tenant shards are mapped to locations on pageservers, e.g. whether
/// to create secondary locations.
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
pub enum PlacementPolicy {
/// Normal live state: one attached pageserver and zero or more secondaries.
Attached(usize),
/// Create one secondary mode locations. This is useful when onboarding
/// a tenant, or for an idle tenant that we might want to bring online quickly.
Secondary,
/// Do not attach to any pageservers. This is appropriate for tenants that
/// have been idle for a long time, where we do not mind some delay in making
/// them available in future.
Detached,
}
impl PlacementPolicy {
pub fn want_secondaries(&self) -> usize {
match self {
PlacementPolicy::Attached(secondary_count) => *secondary_count,
PlacementPolicy::Secondary => 1,
PlacementPolicy::Detached => 0,
}
}
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateResponse {}
/// Metadata health record posted from scrubber.
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthRecord {
pub tenant_shard_id: TenantShardId,
pub healthy: bool,
pub last_scrubbed_at: chrono::DateTime<chrono::Utc>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthUpdateRequest {
pub healthy_tenant_shards: HashSet<TenantShardId>,
pub unhealthy_tenant_shards: HashSet<TenantShardId>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthUpdateResponse {}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListUnhealthyResponse {
pub unhealthy_tenant_shards: Vec<TenantShardId>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListOutdatedRequest {
#[serde(with = "humantime_serde")]
pub not_scrubbed_for: Duration,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListOutdatedResponse {
pub health_records: Vec<MetadataHealthRecord>,
}
/// Publicly exposed safekeeper description
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeeperDescribeResponse {
pub id: NodeId,
pub region_id: String,
/// 1 is special, it means just created (not currently posted to storcon).
/// Zero or negative is not really expected.
/// Otherwise the number from `release-$(number_of_commits_on_branch)` tag.
pub version: i64,
pub host: String,
pub port: i32,
pub http_port: i32,
pub https_port: Option<i32>,
pub availability_zone_id: String,
pub scheduling_policy: SkSchedulingPolicy,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct TimelineSafekeeperPeer {
pub node_id: NodeId,
pub listen_http_addr: String,
pub http_port: i32,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct SCSafekeeperTimeline {
// SC does not know the tenant id.
pub timeline_id: TimelineId,
pub peers: Vec<NodeId>,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct SCSafekeeperTimelinesResponse {
pub timelines: Vec<SCSafekeeperTimeline>,
pub safekeeper_peers: Vec<TimelineSafekeeperPeer>,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct SafekeeperTimeline {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub peers: Vec<NodeId>,
}
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct SafekeeperTimelinesResponse {
pub timelines: Vec<SafekeeperTimeline>,
pub safekeeper_peers: Vec<TimelineSafekeeperPeer>,
}
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeeperSchedulingPolicyRequest {
pub scheduling_policy: SkSchedulingPolicy,
}
/// Import request for safekeeper timelines.
#[derive(Serialize, Deserialize, Clone)]
pub struct TimelineImportRequest {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub start_lsn: Lsn,
pub sk_set: Vec<NodeId>,
}
#[derive(serde::Serialize, serde::Deserialize, Clone)]
pub struct TimelineSafekeeperMigrateRequest {
pub new_sk_set: Vec<NodeId>,
}
#[cfg(test)]
mod test {
use serde_json;
use super::*;
/// Check stability of PlacementPolicy's serialization
#[test]
fn placement_policy_encoding() -> anyhow::Result<()> {
let v = PlacementPolicy::Attached(1);
let encoded = serde_json::to_string(&v)?;
assert_eq!(encoded, "{\"Attached\":1}");
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
let v = PlacementPolicy::Detached;
let encoded = serde_json::to_string(&v)?;
assert_eq!(encoded, "\"Detached\"");
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
Ok(())
}
#[test]
fn test_reject_unknown_field() {
let id = TenantId::generate();
let create_request = serde_json::json!({
"new_tenant_id": id.to_string(),
"unknown_field": "unknown_value".to_string(),
});
let err = serde_json::from_value::<TenantCreateRequest>(create_request).unwrap_err();
assert!(
err.to_string().contains("unknown field `unknown_field`"),
"expect unknown field `unknown_field` error, got: {err}"
);
}
/// Check that a minimal migrate request with no config results in the expected default settings
#[test]
fn test_migrate_request_decode_defaults() {
let json = r#"{
"node_id": 123
}"#;
let request: TenantShardMigrateRequest = serde_json::from_str(json).unwrap();
assert_eq!(request.node_id, NodeId(123));
assert_eq!(request.origin_node_id, None);
assert!(!request.migration_config.override_scheduler);
assert!(request.migration_config.prewarm);
assert_eq!(request.migration_config.secondary_warmup_timeout, None);
assert_eq!(
request.migration_config.secondary_download_request_timeout,
None
);
}
/// Check that a partially specified migration config results in the expected default settings
#[test]
fn test_migration_config_decode_defaults() {
// Specify just one field of the config
let json = r#"{
}"#;
let config: MigrationConfig = serde_json::from_str(json).unwrap();
// Check each field's expected default value
assert!(!config.override_scheduler);
assert!(config.prewarm);
assert_eq!(config.secondary_warmup_timeout, None);
assert_eq!(config.secondary_download_request_timeout, None);
assert_eq!(config.secondary_warmup_timeout, None);
// Consistency check that the Default impl agrees with our serde defaults
assert_eq!(MigrationConfig::default(), config);
}
}