storcon: implement graceful leader cutover

storcon: add start-up sequence utilities
storcon: refactor building of observed state at start-up
2026-05-28 18:40:38 +00:00 · 2024-07-30 17:58:18 +01:00 · 2024-07-30 17:58:17 +01:00 · 2024-07-30 17:57:09 +01:00 · 2024-07-30 17:57:09 +01:00 · 2024-07-30 17:57:08 +01:00
20 changed files with 601 additions and 145 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1744,6 +1744,18 @@ dependencies = [
 "const-random",
 ]

+[[package]]
+name = "dns-lookup"
+version = "2.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5766087c2235fec47fafa4cfecc81e494ee679d0fd4a59887ea0919bfb0e4fc"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "socket2 0.5.5",
+ "windows-sys 0.48.0",
+]
+
 [[package]]
 name = "dsl_auto_type"
 version = "0.1.1"
@@ -5724,6 +5736,7 @@ dependencies = [
 "control_plane",
 "diesel",
 "diesel_migrations",
+ "dns-lookup",
 "fail",
 "futures",
 "git-version",
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -1,3 +1,4 @@
+use std::collections::HashSet;
 use std::str::FromStr;
 use std::time::{Duration, Instant};

@@ -304,8 +305,8 @@ pub struct MetadataHealthRecord {

 #[derive(Serialize, Deserialize, Debug)]
 pub struct MetadataHealthUpdateRequest {
-    pub healthy_tenant_shards: Vec<TenantShardId>,
-    pub unhealthy_tenant_shards: Vec<TenantShardId>,
+    pub healthy_tenant_shards: HashSet<TenantShardId>,
+    pub unhealthy_tenant_shards: HashSet<TenantShardId>,
 }

 #[derive(Serialize, Deserialize, Debug)]
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -58,7 +58,7 @@ use std::{
    sync::atomic::AtomicU64,
 };
 use std::{
-    cmp::{max, min, Ordering},
+    cmp::{max, min},
    ops::ControlFlow,
 };
 use std::{
@@ -177,25 +177,6 @@ impl std::fmt::Display for ImageLayerCreationMode {
    }
 }

-/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub(crate) struct Hole {
-    key_range: Range<Key>,
-    coverage_size: usize,
-}
-
-impl Ord for Hole {
-    fn cmp(&self, other: &Self) -> Ordering {
-        other.coverage_size.cmp(&self.coverage_size) // inverse order
-    }
-}
-
-impl PartialOrd for Hole {
-    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
 /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
 /// Can be removed after all refactors are done.
 fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -30,8 +30,8 @@ use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPA
 use crate::tenant::remote_timeline_client::WaitCompletionError;
 use crate::tenant::storage_layer::merge_iterator::MergeIterator;
 use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc, ValueReconstructState};
+use crate::tenant::timeline::ImageLayerCreationOutcome;
 use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
-use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome};
 use crate::tenant::timeline::{Layer, ResidentLayer};
 use crate::tenant::DeltaLayer;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
@@ -608,62 +608,93 @@ impl Timeline {
            .read_lock_held_spawn_blocking_startup_micros
            .till_now();

-        // Determine N largest holes where N is number of compacted layers.
-        let max_holes = deltas_to_compact.len();
-        let last_record_lsn = self.get_last_record_lsn();
-        let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
-        let min_hole_coverage_size = 3; // TODO: something more flexible?
-
-        // min-heap (reserve space for one more element added before eviction)
-        let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
-        let mut prev: Option<Key> = None;
-
-        let mut all_keys = Vec::new();
-
-        for l in deltas_to_compact.iter() {
-            all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
-        }
-
-        // FIXME: should spawn_blocking the rest of this function
-
-        // The current stdlib sorting implementation is designed in a way where it is
-        // particularly fast where the slice is made up of sorted sub-ranges.
-        all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
+        // TODO: replace with streaming k-merge
+        let all_keys = {
+            let mut all_keys = Vec::new();
+            for l in deltas_to_compact.iter() {
+                all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
+            }
+            // The current stdlib sorting implementation is designed in a way where it is
+            // particularly fast where the slice is made up of sorted sub-ranges.
+            all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
+            all_keys
+        };

        stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now();

-        for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
-            if let Some(prev_key) = prev {
-                // just first fast filter, do not create hole entries for metadata keys. The last hole in the
-                // compaction is the gap between data key and metadata keys.
-                if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
-                    && !Key::is_metadata_key(&prev_key)
-                {
-                    let key_range = prev_key..next_key;
-                    // Measuring hole by just subtraction of i128 representation of key range boundaries
-                    // has not so much sense, because largest holes will corresponds field1/field2 changes.
-                    // But we are mostly interested to eliminate holes which cause generation of excessive image layers.
-                    // That is why it is better to measure size of hole as number of covering image layers.
-                    let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len();
-                    if coverage_size >= min_hole_coverage_size {
-                        heap.push(Hole {
-                            key_range,
-                            coverage_size,
-                        });
-                        if heap.len() > max_holes {
-                            heap.pop(); // remove smallest hole
+        // Determine N largest holes where N is number of compacted layers. The vec is sorted by key range start.
+        //
+        // A hole is a key range for which this compaction doesn't have any WAL records.
+        // Our goal in this compaction iteration is to avoid creating L1s that, in terms of their key range,
+        // cover the hole, but actually don't contain any WAL records for that key range.
+        // The reason is that the mere stack of L1s (`count_deltas`) triggers image layer creation (`create_image_layers`).
+        // That image layer creation would be useless for a hole range covered by L1s that don't contain any WAL records.
+        //
+        // The algorithm chooses holes as follows.
+        // - Slide a 2-window over the keys in key orde to get the hole range (=distance between two keys).
+        // - Filter: min threshold on range length
+        // - Rank: by coverage size (=number of image layers required to reconstruct each key in the range for which we have any data)
+        //
+        // For more details, intuition, and some ASCII art see https://github.com/neondatabase/neon/pull/3597#discussion_r1112704451
+        #[derive(PartialEq, Eq)]
+        struct Hole {
+            key_range: Range<Key>,
+            coverage_size: usize,
+        }
+        let holes: Vec<Hole> = {
+            use std::cmp::Ordering;
+            impl Ord for Hole {
+                fn cmp(&self, other: &Self) -> Ordering {
+                    self.coverage_size.cmp(&other.coverage_size).reverse()
+                }
+            }
+            impl PartialOrd for Hole {
+                fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+                    Some(self.cmp(other))
+                }
+            }
+            let max_holes = deltas_to_compact.len();
+            let last_record_lsn = self.get_last_record_lsn();
+            let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
+            let min_hole_coverage_size = 3; // TODO: something more flexible?
+                                            // min-heap (reserve space for one more element added before eviction)
+            let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
+            let mut prev: Option<Key> = None;
+
+            for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
+                if let Some(prev_key) = prev {
+                    // just first fast filter, do not create hole entries for metadata keys. The last hole in the
+                    // compaction is the gap between data key and metadata keys.
+                    if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
+                        && !Key::is_metadata_key(&prev_key)
+                    {
+                        let key_range = prev_key..next_key;
+                        // Measuring hole by just subtraction of i128 representation of key range boundaries
+                        // has not so much sense, because largest holes will corresponds field1/field2 changes.
+                        // But we are mostly interested to eliminate holes which cause generation of excessive image layers.
+                        // That is why it is better to measure size of hole as number of covering image layers.
+                        let coverage_size =
+                            layers.image_coverage(&key_range, last_record_lsn).len();
+                        if coverage_size >= min_hole_coverage_size {
+                            heap.push(Hole {
+                                key_range,
+                                coverage_size,
+                            });
+                            if heap.len() > max_holes {
+                                heap.pop(); // remove smallest hole
+                            }
                        }
                    }
                }
+                prev = Some(next_key.next());
            }
-            prev = Some(next_key.next());
-        }
+            let mut holes = heap.into_vec();
+            holes.sort_unstable_by_key(|hole| hole.key_range.start);
+            holes
+        };
        stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
        drop_rlock(guard);
        stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
-        let mut holes = heap.into_vec();
-        holes.sort_unstable_by_key(|hole| hole.key_range.start);
-        let mut next_hole = 0; // index of next hole in holes vector

        // This iterator walks through all key-value pairs from all the layers
        // we're compacting, in key, LSN order.
@@ -738,6 +769,7 @@ impl Timeline {
        let mut key_values_total_size = 0u64;
        let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
        let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
+        let mut next_hole = 0; // index of next hole in holes vector

        for &DeltaEntry {
            key, lsn, ref val, ..
--- a/storage_controller/Cargo.toml
+++ b/storage_controller/Cargo.toml
@@ -53,6 +53,7 @@ diesel = { version = "2.1.4", features = [
 ] }
 diesel_migrations = { version = "2.1.0" }
 r2d2 = { version = "0.8.10" }
+dns-lookup = { version = "2.0.4" }

 utils = { path = "../libs/utils/" }
 metrics = { path = "../libs/metrics/" }
--- a/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql
+++ b/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql
@@ -0,0 +1 @@
+DROP TABLE leader;
--- a/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql
+++ b/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql
@@ -0,0 +1,6 @@
+CREATE TABLE leader (
+  hostname VARCHAR NOT NULL,
+  port INTEGER NOT NULL,
+  started_at TIMESTAMPTZ NOT NULL,
+  PRIMARY KEY(hostname, port, started_at)
+);
--- a/storage_controller/src/lib.rs
+++ b/storage_controller/src/lib.rs
@@ -10,6 +10,7 @@ mod id_lock_map;
 pub mod metrics;
 mod node;
 mod pageserver_client;
+mod peer_client;
 pub mod persistence;
 mod reconciler;
 mod scheduler;
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -81,6 +81,9 @@ struct Cli {
    #[arg(long, default_value = "5s")]
    db_connect_timeout: humantime::Duration,

+    #[arg(long, default_value = "false")]
+    start_as_candidate: bool,
+
    /// `neon_local` sets this to the path of the neon_local repo dir.
    /// Only relevant for testing.
    // TODO: make `cfg(feature = "testing")`
@@ -273,6 +276,8 @@ async fn async_main() -> anyhow::Result<()> {
            .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
        split_threshold: args.split_threshold,
        neon_local_repo_dir: args.neon_local_repo_dir,
+        start_as_candidate: args.start_as_candidate,
+        http_service_port: args.listen.port() as i32,
    };

    // After loading secrets & config, but before starting anything else, apply database migrations
--- a/storage_controller/src/peer_client.rs
+++ b/storage_controller/src/peer_client.rs
@@ -0,0 +1,104 @@
+use crate::tenant_shard::ObservedState;
+use pageserver_api::shard::TenantShardId;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use tokio_util::sync::CancellationToken;
+
+use reqwest::{StatusCode, Url};
+use utils::{backoff, http::error::HttpErrorBody};
+
+#[derive(Debug, Clone)]
+pub(crate) struct PeerClient {
+    hostname: String,
+    port: i32,
+    jwt: Option<String>,
+    client: reqwest::Client,
+}
+
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum StorageControllerPeerError {
+    #[error("failed to deserialize error response with status code {0} at {1}: {2}")]
+    DeserializationError(StatusCode, Url, reqwest::Error),
+    #[error("storage controller peer API error ({0}): {1}")]
+    ApiError(StatusCode, String),
+    #[error("failed to send HTTP request: {0}")]
+    SendError(reqwest::Error),
+    #[error("Cancelled")]
+    Cancelled,
+}
+
+pub(crate) type Result<T> = std::result::Result<T, StorageControllerPeerError>;
+
+pub(crate) trait ResponseErrorMessageExt: Sized {
+    fn error_from_body(self) -> impl std::future::Future<Output = Result<Self>> + Send;
+}
+
+impl ResponseErrorMessageExt for reqwest::Response {
+    async fn error_from_body(self) -> Result<Self> {
+        let status = self.status();
+        if !(status.is_client_error() || status.is_server_error()) {
+            return Ok(self);
+        }
+
+        let url = self.url().to_owned();
+        Err(match self.json::<HttpErrorBody>().await {
+            Ok(HttpErrorBody { msg }) => StorageControllerPeerError::ApiError(status, msg),
+            Err(err) => StorageControllerPeerError::DeserializationError(status, url, err),
+        })
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug, Default)]
+pub(crate) struct GlobalObservedState(pub(crate) HashMap<TenantShardId, ObservedState>);
+
+impl PeerClient {
+    pub(crate) fn new(hostname: String, port: i32, jwt: Option<String>) -> Self {
+        Self {
+            hostname,
+            port,
+            jwt,
+            client: reqwest::Client::new(),
+        }
+    }
+
+    async fn request_step_down(&self) -> Result<GlobalObservedState> {
+        let uri = format!("{}:{}/control/v1/step_down", self.hostname, self.port);
+        let req = self.client.put(uri);
+        let req = if let Some(jwt) = &self.jwt {
+            req.header(reqwest::header::AUTHORIZATION, format!("Bearer {jwt}"))
+        } else {
+            req
+        };
+
+        let res = req
+            .send()
+            .await
+            .map_err(StorageControllerPeerError::SendError)?;
+        let response = res.error_from_body().await?;
+
+        let status = response.status();
+        let url = response.url().to_owned();
+
+        response
+            .json()
+            .await
+            .map_err(|err| StorageControllerPeerError::DeserializationError(status, url, err))
+    }
+
+    pub(crate) async fn step_down(
+        &self,
+        cancel: &CancellationToken,
+    ) -> Result<GlobalObservedState> {
+        backoff::retry(
+            || self.request_step_down(),
+            |_e| false,
+            4,
+            8,
+            "Send step down request",
+            cancel,
+        )
+        .await
+        .ok_or_else(|| StorageControllerPeerError::Cancelled)
+        .and_then(|x| x)
+    }
+}
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -95,6 +95,8 @@ pub(crate) enum DatabaseOperation {
    ListMetadataHealth,
    ListMetadataHealthUnhealthy,
    ListMetadataHealthOutdated,
+    GetLeader,
+    UpdateLeader,
 }

 #[must_use]
@@ -785,6 +787,71 @@ impl Persistence {
        )
        .await
    }
+
+    /// Get the current entry from the `leader` table if one exists.
+    /// It is an error for the table to contain more than one entry.
+    pub(crate) async fn get_leader(&self) -> DatabaseResult<Option<LeaderPersistence>> {
+        let mut leader: Vec<LeaderPersistence> = self
+            .with_measured_conn(
+                DatabaseOperation::GetLeader,
+                move |conn| -> DatabaseResult<_> {
+                    Ok(crate::schema::leader::table.load::<LeaderPersistence>(conn)?)
+                },
+            )
+            .await?;
+
+        if leader.len() > 1 {
+            return Err(DatabaseError::Logical(format!(
+                "More than one entry present in the leader table: {leader:?}"
+            )));
+        }
+
+        Ok(leader.pop())
+    }
+
+    /// Update the new leader with compare-exchange semantics. If `prev` does not
+    /// match the current leader entry, then the update is treated as a failure.
+    /// When `prev` is not specified, the update is forced.
+    pub(crate) async fn update_leader(
+        &self,
+        prev: Option<LeaderPersistence>,
+        new: LeaderPersistence,
+    ) -> DatabaseResult<()> {
+        use crate::schema::leader::dsl::*;
+
+        let updated = self
+            .with_measured_conn(
+                DatabaseOperation::UpdateLeader,
+                move |conn| -> DatabaseResult<usize> {
+                    let updated = match &prev {
+                        Some(prev) => diesel::update(leader)
+                            .filter(hostname.eq(prev.hostname.clone()))
+                            .filter(port.eq(prev.port))
+                            .filter(started_at.eq(prev.started_at))
+                            .set((
+                                hostname.eq(new.hostname.clone()),
+                                port.eq(new.port),
+                                started_at.eq(new.started_at),
+                            ))
+                            .execute(conn)?,
+                        None => diesel::insert_into(leader)
+                            .values(new.clone())
+                            .execute(conn)?,
+                    };
+
+                    Ok(updated)
+                },
+            )
+            .await?;
+
+        if updated == 0 {
+            return Err(DatabaseError::Logical(
+                "Leader table update failed".to_string(),
+            ));
+        }
+
+        Ok(())
+    }
 }

 /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
@@ -910,3 +977,13 @@ impl From<MetadataHealthPersistence> for MetadataHealthRecord {
        }
    }
 }
+
+#[derive(
+    Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq, Debug, Clone,
+)]
+#[diesel(table_name = crate::schema::leader)]
+pub(crate) struct LeaderPersistence {
+    pub(crate) hostname: String,
+    pub(crate) port: i32,
+    pub(crate) started_at: chrono::DateTime<chrono::Utc>,
+}
--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -1,5 +1,13 @@
 // @generated automatically by Diesel CLI.

+diesel::table! {
+    leader (hostname, port, started_at) {
+        hostname -> Varchar,
+        port -> Int4,
+        started_at -> Timestamptz,
+    }
+}
+
 diesel::table! {
    metadata_health (tenant_id, shard_number, shard_count) {
        tenant_id -> Varchar,
@@ -36,4 +44,4 @@ diesel::table! {
    }
 }

-diesel::allow_tables_to_appear_in_same_query!(metadata_health, nodes, tenant_shards,);
+diesel::allow_tables_to_appear_in_same_query!(leader, metadata_health, nodes, tenant_shards,);
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -16,7 +16,10 @@ use crate::{
    compute_hook::NotifyError,
    id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
    metrics::LeadershipStatusGroup,
-    persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter},
+    peer_client::{GlobalObservedState, PeerClient},
+    persistence::{
+        AbortShardSplitStatus, LeaderPersistence, MetadataHealthPersistence, TenantFilter,
+    },
    reconciler::{ReconcileError, ReconcileUnits},
    scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
    tenant_shard::{
@@ -82,7 +85,6 @@ use crate::{
        ReconcilerWaiter, TenantShard,
    },
 };
-use serde::{Deserialize, Serialize};

 // For operations that should be quick, like attaching a new tenant
 const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
@@ -223,6 +225,7 @@ impl ServiceState {
        tenants: BTreeMap<TenantShardId, TenantShard>,
        scheduler: Scheduler,
        delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
+        initial_leadership_status: LeadershipStatus,
    ) -> Self {
        let status = &crate::metrics::METRICS_REGISTRY
            .metrics_group
@@ -230,15 +233,13 @@ impl ServiceState {

        status.set(
            LeadershipStatusGroup {
-                status: LeadershipStatus::Leader,
+                status: initial_leadership_status,
            },
            1,
        );

        Self {
-            // TODO: Starting up as Leader is a transient state. Once we enable rolling
-            // upgrades on the k8s side, we should start up as Candidate.
-            leadership_status: LeadershipStatus::Leader,
+            leadership_status: initial_leadership_status,
            tenants,
            nodes: Arc::new(nodes),
            scheduler,
@@ -287,6 +288,33 @@ impl ServiceState {
            0,
        );
    }
+
+    fn become_leader(&mut self) {
+        self.leadership_status = LeadershipStatus::Leader;
+
+        let status = &crate::metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_leadership_status;
+
+        status.set(
+            LeadershipStatusGroup {
+                status: LeadershipStatus::Leader,
+            },
+            1,
+        );
+        status.set(
+            LeadershipStatusGroup {
+                status: LeadershipStatus::SteppedDown,
+            },
+            0,
+        );
+        status.set(
+            LeadershipStatusGroup {
+                status: LeadershipStatus::Candidate,
+            },
+            0,
+        );
+    }
 }

 #[derive(Clone)]
@@ -323,6 +351,10 @@ pub struct Config {

    // TODO: make this cfg(feature  = "testing")
    pub neon_local_repo_dir: Option<PathBuf>,
+
+    pub start_as_candidate: bool,
+
+    pub http_service_port: i32,
 }

 impl From<DatabaseError> for ApiError {
@@ -490,9 +522,10 @@ pub(crate) enum ReconcileResultRequest {
    Stop,
 }

-// TODO: move this into the storcon peer client when that gets added
-#[derive(Serialize, Deserialize, Debug, Default)]
-pub(crate) struct GlobalObservedState(HashMap<TenantShardId, ObservedState>);
+struct LeaderStepDownState {
+    observed: GlobalObservedState,
+    leader: LeaderPersistence,
+}

 impl Service {
    pub fn get_config(&self) -> &Config {
@@ -504,15 +537,11 @@ impl Service {
    #[instrument(skip_all)]
    async fn startup_reconcile(
        self: &Arc<Service>,
+        leader_step_down_state: Option<LeaderStepDownState>,
        bg_compute_notify_result_tx: tokio::sync::mpsc::Sender<
            Result<(), (TenantShardId, NotifyError)>,
        >,
    ) {
-        // For all tenant shards, a vector of observed states on nodes (where None means
-        // indeterminate, same as in [`ObservedStateLocation`])
-        let mut observed: HashMap<TenantShardId, Vec<(NodeId, Option<LocationConfig>)>> =
-            HashMap::new();
-
        // Startup reconciliation does I/O to other services: whether they
        // are responsive or not, we should aim to finish within our deadline, because:
        // - If we don't, a k8s readiness hook watching /ready will kill us.
@@ -526,26 +555,29 @@ impl Service {
            .checked_add(STARTUP_RECONCILE_TIMEOUT / 2)
            .expect("Reconcile timeout is a modest constant");

+        let (observed, current_leader) = if let Some(state) = leader_step_down_state {
+            tracing::info!(
+                "Using observed received from leader at {}:{}",
+                state.leader.hostname,
+                state.leader.port
+            );
+            (state.observed, Some(state.leader))
+        } else {
+            (
+                self.build_global_observed_state(node_scan_deadline).await,
+                None,
+            )
+        };
+
        // Accumulate a list of any tenant locations that ought to be detached
        let mut cleanup = Vec::new();

-        let node_listings = self.scan_node_locations(node_scan_deadline).await;
-        // Send initial heartbeat requests to nodes that replied to the location listing above.
-        let nodes_online = self.initial_heartbeat_round(node_listings.keys()).await;
-
-        for (node_id, list_response) in node_listings {
-            let tenant_shards = list_response.tenant_shards;
-            tracing::info!(
-                "Received {} shard statuses from pageserver {}, setting it to Active",
-                tenant_shards.len(),
-                node_id
-            );
-
-            for (tenant_shard_id, conf_opt) in tenant_shards {
-                let shard_observations = observed.entry(tenant_shard_id).or_default();
-                shard_observations.push((node_id, conf_opt));
-            }
-        }
+        // Send initial heartbeat requests to all nodes loaded from the database
+        let all_nodes = {
+            let locked = self.inner.read().unwrap();
+            locked.nodes.clone()
+        };
+        let nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await;

        // List of tenants for which we will attempt to notify compute of their location at startup
        let mut compute_notifications = Vec::new();
@@ -568,17 +600,16 @@ impl Service {
            }
            *nodes = Arc::new(new_nodes);

-            for (tenant_shard_id, shard_observations) in observed {
-                for (node_id, observed_loc) in shard_observations {
-                    let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else {
-                        cleanup.push((tenant_shard_id, node_id));
-                        continue;
-                    };
-                    tenant_shard
-                        .observed
-                        .locations
-                        .insert(node_id, ObservedStateLocation { conf: observed_loc });
-                }
+            for (tenant_shard_id, observed_state) in observed.0 {
+                let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else {
+                    for node_id in observed_state.locations.keys() {
+                        cleanup.push((tenant_shard_id, *node_id));
+                    }
+
+                    continue;
+                };
+
+                tenant_shard.observed = observed_state;
            }

            // Populate each tenant's intent state
@@ -612,6 +643,22 @@ impl Service {
            tenants.len()
        };

+        // Before making any obeservable changes to the cluster, persist self
+        // as leader in database and memory.
+
+        let proposed_leader = self.get_proposed_leader_info();
+
+        if let Err(err) = self
+            .persistence
+            .update_leader(current_leader, proposed_leader)
+            .await
+        {
+            tracing::error!("Failed to persist self as leader: {err}. Aborting start-up ...");
+            std::process::exit(1);
+        }
+
+        self.inner.write().unwrap().become_leader();
+
        // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
        // generation_pageserver in the database.

@@ -777,6 +824,31 @@ impl Service {
        node_results
    }

+    async fn build_global_observed_state(&self, deadline: Instant) -> GlobalObservedState {
+        let node_listings = self.scan_node_locations(deadline).await;
+        let mut observed = GlobalObservedState::default();
+
+        for (node_id, location_confs) in node_listings {
+            tracing::info!(
+                "Received {} shard statuses from pageserver {}",
+                location_confs.tenant_shards.len(),
+                node_id
+            );
+
+            for (tid, location_conf) in location_confs.tenant_shards {
+                let entry = observed.0.entry(tid).or_default();
+                entry.locations.insert(
+                    node_id,
+                    ObservedStateLocation {
+                        conf: location_conf,
+                    },
+                );
+            }
+        }
+
+        observed
+    }
+
    /// Used during [`Self::startup_reconcile`]: detach a list of unknown-to-us tenants from pageservers.
    ///
    /// This is safe to run in the background, because if we don't have this TenantShardId in our map of
@@ -1255,12 +1327,20 @@ impl Service {
            config.max_warming_up_interval,
            cancel.clone(),
        );
+
+        let initial_leadership_status = if config.start_as_candidate {
+            LeadershipStatus::Candidate
+        } else {
+            LeadershipStatus::Leader
+        };
+
        let this = Arc::new(Self {
            inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
                nodes,
                tenants,
                scheduler,
                delayed_reconcile_rx,
+                initial_leadership_status,
            ))),
            config: config.clone(),
            persistence,
@@ -1329,7 +1409,16 @@ impl Service {
                    return;
                };

-                this.startup_reconcile(bg_compute_notify_result_tx).await;
+                let leadership_status = this.inner.read().unwrap().get_leadership_status();
+                let peer_observed_state = match leadership_status {
+                    LeadershipStatus::Candidate => this.request_step_down().await,
+                    LeadershipStatus::Leader => None,
+                    LeadershipStatus::SteppedDown => unreachable!(),
+                };
+
+                this.startup_reconcile(peer_observed_state, bg_compute_notify_result_tx)
+                    .await;
+
                drop(startup_completion);
            }
        });
@@ -6179,4 +6268,88 @@ impl Service {

        global_observed
    }
+
+    /// Collect the details for the current proccess wishing to become the storage controller
+    /// leader.
+    ///
+    /// On failures to discover and resolve the hostname the process is killed and we rely on k8s to retry.
+    fn get_proposed_leader_info(&self) -> LeaderPersistence {
+        let hostname = match dns_lookup::get_hostname() {
+            Ok(name) => name,
+            Err(err) => {
+                tracing::error!("Failed to discover hostname: {err}. Aborting start-up ...");
+                std::process::exit(1);
+            }
+        };
+
+        let mut addrs = match dns_lookup::lookup_host(&hostname) {
+            Ok(addrs) => addrs,
+            Err(err) => {
+                tracing::error!("Failed to resolve hostname: {err}. Aborting start-up ...");
+                std::process::exit(1);
+            }
+        };
+
+        let addr = addrs
+            .pop()
+            .expect("k8s configured hostname always resolves");
+
+        let proposed = LeaderPersistence {
+            hostname: addr.to_string(),
+            port: self.get_config().http_service_port,
+            started_at: chrono::Utc::now(),
+        };
+
+        tracing::info!("Proposed leader details are: {proposed:?}");
+
+        proposed
+    }
+
+    /// Request step down from the currently registered leader in the database
+    ///
+    /// If such an entry is persisted, the success path returns the observed
+    /// state and details of the leader. Otherwise, None is returned indicating
+    /// there is no leader currently.
+    ///
+    /// On failures to query the database or step down error responses the process is killed
+    /// and we rely on k8s to retry.
+    async fn request_step_down(&self) -> Option<LeaderStepDownState> {
+        let leader = match self.persistence.get_leader().await {
+            Ok(leader) => leader,
+            Err(err) => {
+                tracing::error!(
+                    "Failed to query database for current leader: {err}. Aborting start-up ..."
+                );
+                std::process::exit(1);
+            }
+        };
+
+        match leader {
+            Some(leader) => {
+                // TODO: jwt token
+                let client = PeerClient::new(
+                    leader.hostname.to_owned(),
+                    leader.port,
+                    self.config.jwt_token.clone(),
+                );
+                let state = client.step_down(&self.cancel).await;
+                match state {
+                    Ok(state) => Some(LeaderStepDownState {
+                        observed: state,
+                        leader: leader.clone(),
+                    }),
+                    Err(err) => {
+                        tracing::error!(
+                            "Leader ({}:{}) did not respond to step-down request: {}",
+                            leader.hostname,
+                            leader.port,
+                            err
+                        );
+                        None
+                    }
+                }
+            }
+            None => None,
+        }
+    }
 }
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -40,6 +40,11 @@ impl TimelineAnalysis {
            garbage_keys: Vec::new(),
        }
    }
+
+    /// Whether a timeline is healthy.
+    pub(crate) fn is_healthy(&self) -> bool {
+        self.errors.is_empty() && self.warnings.is_empty()
+    }
 }

 pub(crate) async fn branch_cleanup_and_check_errors(
--- a/storage_scrubber/src/lib.rs
+++ b/storage_scrubber/src/lib.rs
@@ -32,6 +32,7 @@ use remote_storage::{
 };
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
+use storage_controller_client::control_api;
 use tokio::io::AsyncReadExt;
 use tokio_util::sync::CancellationToken;
 use tracing::error;
@@ -255,6 +256,12 @@ pub struct ControllerClientConfig {
    pub controller_jwt: String,
 }

+impl ControllerClientConfig {
+    pub fn build_client(self) -> control_api::Client {
+        control_api::Client::new(self.controller_api, Some(self.controller_jwt))
+    }
+}
+
 pub struct ConsoleConfig {
    pub token: String,
    pub base_url: Url,
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -1,7 +1,8 @@
 use anyhow::{anyhow, bail};
 use camino::Utf8PathBuf;
+use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse};
 use pageserver_api::shard::TenantShardId;
-use reqwest::Url;
+use reqwest::{Method, Url};
 use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
 use storage_scrubber::pageserver_physical_gc::GcMode;
 use storage_scrubber::scan_pageserver_metadata::scan_metadata;
@@ -61,6 +62,8 @@ enum Command {
        json: bool,
        #[arg(long = "tenant-id", num_args = 0..)]
        tenant_ids: Vec<TenantShardId>,
+        #[arg(long = "post", default_value_t = false)]
+        post_to_storage_controller: bool,
        #[arg(long, default_value = None)]
        /// For safekeeper node_kind only, points to db with debug dump
        dump_db_connstr: Option<String>,
@@ -116,11 +119,20 @@ async fn main() -> anyhow::Result<()> {
        chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
    ));

+    let controller_client_conf = cli.controller_api.map(|controller_api| {
+        ControllerClientConfig {
+            controller_api,
+            // Default to no key: this is a convenience when working in a development environment
+            controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
+        }
+    });
+
    match cli.command {
        Command::ScanMetadata {
            json,
            tenant_ids,
            node_kind,
+            post_to_storage_controller,
            dump_db_connstr,
            dump_db_table,
        } => {
@@ -159,6 +171,9 @@ async fn main() -> anyhow::Result<()> {
                }
                Ok(())
            } else {
+                if controller_client_conf.is_none() && post_to_storage_controller {
+                    return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
+                }
                match scan_metadata(bucket_config.clone(), tenant_ids).await {
                    Err(e) => {
                        tracing::error!("Failed: {e}");
@@ -170,6 +185,21 @@ async fn main() -> anyhow::Result<()> {
                        } else {
                            println!("{}", summary.summary_string());
                        }
+
+                        if post_to_storage_controller {
+                            if let Some(conf) = controller_client_conf {
+                                let controller_client = conf.build_client();
+                                let body = summary.build_health_update_request();
+                                controller_client
+                                    .dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
+                                        Method::POST,
+                                        "control/v1/metadata_health/update".to_string(),
+                                        Some(body),
+                                    )
+                                    .await?;
+                            }
+                        }
+
                        if summary.is_fatal() {
                            Err(anyhow::anyhow!("Fatal scrub errors detected"))
                        } else if summary.is_empty() {
@@ -217,14 +247,6 @@ async fn main() -> anyhow::Result<()> {
            min_age,
            mode,
        } => {
-            let controller_client_conf = cli.controller_api.map(|controller_api| {
-                ControllerClientConfig {
-                    controller_api,
-                    // Default to no key: this is a convenience when working in a development environment
-                    controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
-                }
-            });
-
            match (&controller_client_conf, mode) {
                (Some(_), _) => {
                    // Any mode may run when controller API is set
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -567,13 +567,7 @@ pub async fn pageserver_physical_gc(
    }

    // Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
-    let Some(controller_client) = controller_client_conf.as_ref().map(|c| {
-        let ControllerClientConfig {
-            controller_api,
-            controller_jwt,
-        } = c;
-        control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone()))
-    }) else {
+    let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else {
        tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified");
        return Ok(summary);
    };
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -9,12 +9,13 @@ use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimeline
 use aws_sdk_s3::Client;
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::remote_layer_path;
+use pageserver_api::controller_api::MetadataHealthUpdateRequest;
 use pageserver_api::shard::TenantShardId;
 use serde::Serialize;
 use utils::id::TenantId;
 use utils::shard::ShardCount;

-#[derive(Serialize)]
+#[derive(Serialize, Default)]
 pub struct MetadataSummary {
    tenant_count: usize,
    timeline_count: usize,
@@ -23,19 +24,16 @@ pub struct MetadataSummary {
    with_warnings: HashSet<TenantShardTimelineId>,
    with_orphans: HashSet<TenantShardTimelineId>,
    indices_by_version: HashMap<usize, usize>,
+
+    #[serde(skip)]
+    pub(crate) healthy_tenant_shards: HashSet<TenantShardId>,
+    #[serde(skip)]
+    pub(crate) unhealthy_tenant_shards: HashSet<TenantShardId>,
 }

 impl MetadataSummary {
    fn new() -> Self {
-        Self {
-            tenant_count: 0,
-            timeline_count: 0,
-            timeline_shard_count: 0,
-            with_errors: HashSet::new(),
-            with_warnings: HashSet::new(),
-            with_orphans: HashSet::new(),
-            indices_by_version: HashMap::new(),
-        }
+        Self::default()
    }

    fn update_data(&mut self, data: &S3TimelineBlobData) {
@@ -54,6 +52,13 @@ impl MetadataSummary {
    }

    fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
+        if analysis.is_healthy() {
+            self.healthy_tenant_shards.insert(id.tenant_shard_id);
+        } else {
+            self.healthy_tenant_shards.remove(&id.tenant_shard_id);
+            self.unhealthy_tenant_shards.insert(id.tenant_shard_id);
+        }
+
        if !analysis.errors.is_empty() {
            self.with_errors.insert(*id);
        }
@@ -101,6 +106,13 @@ Index versions: {version_summary}
    pub fn is_empty(&self) -> bool {
        self.timeline_shard_count == 0
    }
+
+    pub fn build_health_update_request(&self) -> MetadataHealthUpdateRequest {
+        MetadataHealthUpdateRequest {
+            healthy_tenant_shards: self.healthy_tenant_shards.clone(),
+            unhealthy_tenant_shards: self.unhealthy_tenant_shards.clone(),
+        }
+    }
 }

 /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4401,10 +4401,11 @@ class StorageScrubber:
        assert stdout is not None
        return stdout

-    def scan_metadata(self) -> Any:
-        stdout = self.scrubber_cli(
-            ["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30
-        )
+    def scan_metadata(self, post_to_storage_controller: bool = False) -> Any:
+        args = ["scan-metadata", "--node-kind", "pageserver", "--json"]
+        if post_to_storage_controller:
+            args.append("--post")
+        stdout = self.scrubber_cli(args, timeout=30)

        try:
            return json.loads(stdout)
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -440,10 +440,12 @@ def test_scrubber_scan_pageserver_metadata(
    assert len(index.layer_metadata) > 0
    it = iter(index.layer_metadata.items())

-    scan_summary = env.storage_scrubber.scan_metadata()
+    scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
    assert not scan_summary["with_warnings"]
    assert not scan_summary["with_errors"]

+    assert env.storage_controller.metadata_health_is_healthy()
+
    # Delete a layer file that is listed in the index.
    layer, metadata = next(it)
    log.info(f"Deleting {timeline_path}/{layer.to_str()}")
@@ -453,7 +455,17 @@ def test_scrubber_scan_pageserver_metadata(
    )
    log.info(f"delete response: {delete_response}")

-    # Check scan summary. Expect it to be a L0 layer so only emit warnings.
+    # Check scan summary without posting to storage controller. Expect it to be a L0 layer so only emit warnings.
    scan_summary = env.storage_scrubber.scan_metadata()
    log.info(f"{pprint.pformat(scan_summary)}")
    assert len(scan_summary["with_warnings"]) > 0
+
+    assert env.storage_controller.metadata_health_is_healthy()
+
+    # Now post to storage controller, expect seeing one unhealthy health record
+    scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
+    log.info(f"{pprint.pformat(scan_summary)}")
+    assert len(scan_summary["with_warnings"]) > 0
+
+    unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"]
+    assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id)
Author	SHA1	Message	Date
Vlad Lazar	71ff8f2433	storcon: implement graceful leader cutover	2024-07-30 17:58:18 +01:00
Vlad Lazar	56c43c4fae	storcon: add start-up sequence utilities	2024-07-30 17:58:17 +01:00
Vlad Lazar	4187657082	storcon: refactor building of observed state at start-up	2024-07-30 17:57:09 +01:00
Vlad Lazar	b690ba5838	storcon: decouple initial heartbeat round from location listing	2024-07-30 17:57:09 +01:00
Vlad Lazar	dd7cafdd97	storcon: add storage controller peer client	2024-07-30 17:57:08 +01:00
Vlad Lazar	c501a10612	storcon: gate starting-up as candidate behind a flag	2024-07-30 17:56:30 +01:00
Vlad Lazar	1fdbef9a44	storcon/persistence: add leader table primitives	2024-07-30 17:56:28 +01:00
Vlad Lazar	3ad1221e55	storcon/diesel: add leader table	2024-07-30 17:54:02 +01:00
Christian Schwarz	d95b46f3f3	cleanup(compact_level0_phase1): some commentary and wrapping into block expressions (#8544 ) Byproduct of scouting done for https://github.com/neondatabase/neon/issues/8184 refs https://github.com/neondatabase/neon/issues/8184	2024-07-30 18:13:18 +02:00
Yuchen Liang	85bef9f05d	feat(scrubber): post `scan_metadata` results to storage controller (#8502 ) Part of #8128, followup to #8480. closes #8421. Enable scrubber to optionally post metadata scan health results to storage controller. Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-07-30 16:07:34 +01:00