A few SK changes (#12577)

# TLDR 
This PR is a no-op by default: the new recovery path is gated behind the `--enable-pull-timeline-on-startup` flag.

## Problem
When a safekeeper (SK) loses a disk, it must recover all WALs from the very
beginning. Catching up to the latest WAL for all the timelines it owns can
take days or weeks.

## Summary of changes
When the SK starts up and finds that it has 0 timelines:
- it asks the storage controller (SC) for the timelines it owns,
- then pulls each timeline from its peer safekeepers to restore WAL redundancy right away.

Only after pulling the timelines completes does the SK become active and
accept new WAL. The flow is sketched below.

The current implementation is a prototype. It can be optimized further, e.g.
by pulling timelines in parallel.
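A minimal sketch of the startup gating logic, simplified from the `safekeeper.rs` and `hadron.rs` changes in the diff below (the function name and signature here are illustrative, not the actual wiring):

```rust
// Sketch only: error handling and the rest of startup are omitted.
use std::sync::Arc;

use safekeeper::{GlobalTimelines, SafeKeeperConf, hadron};

async fn start_safekeeper_sketch(
    conf: Arc<SafeKeeperConf>,
    global_timelines: Arc<GlobalTimelines>,
) -> anyhow::Result<()> {
    // Load whatever timelines survived on local disk.
    global_timelines.init().await?;

    // Disk-loss recovery: only when the flag is on and the disk came up empty.
    if conf.enable_pull_timeline_on_startup && global_timelines.timelines_count() == 0 {
        // Ask the SC (HCC) which timelines this node owns, then pull each one
        // from peer safekeepers, retrying each pull until it succeeds.
        hadron::hcc_pull_timelines(&conf, global_timelines.clone()).await?;
    }

    // Only now does the safekeeper start its services and accept new WAL.
    Ok(())
}
```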

---------

Co-authored-by: Haoyu Huang <haoyu.huang@databricks.com>
Commit f67a8a173e (parent 2288efae66), authored by HaoyuHuang, committed via GitHub on 2025-07-14 09:37:04 -07:00. 14 changed files with 808 additions and 36 deletions.


@@ -6,10 +6,10 @@
use std::error::Error as _;
use http_utils::error::HttpErrorBody;
use reqwest::{IntoUrl, Method, StatusCode};
use reqwest::{IntoUrl, Method, Response, StatusCode};
use safekeeper_api::models::{
self, PullTimelineRequest, PullTimelineResponse, SafekeeperStatus, SafekeeperUtilization,
TimelineCreateRequest, TimelineStatus,
TimelineCreateRequest,
};
use utils::id::{NodeId, TenantId, TimelineId};
use utils::logging::SecretString;
@@ -161,13 +161,12 @@ impl Client {
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<TimelineStatus> {
) -> Result<Response> {
let uri = format!(
"{}/v1/tenant/{}/timeline/{}",
self.mgmt_api_endpoint, tenant_id, timeline_id
);
let resp = self.get(&uri).await?;
resp.json().await.map_err(Error::ReceiveBody)
self.get(&uri).await
}
pub async fn snapshot(


@@ -23,6 +23,7 @@ use safekeeper::defaults::{
DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
DEFAULT_SSL_CERT_FILE, DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE,
};
use safekeeper::hadron;
use safekeeper::wal_backup::WalBackup;
use safekeeper::{
BACKGROUND_RUNTIME, BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf,
@@ -252,6 +253,10 @@ struct Args {
/// Run in development mode (disables security checks)
#[arg(long, help = "Run in development mode (disables security checks)")]
dev: bool,
/* BEGIN_HADRON */
#[arg(long)]
enable_pull_timeline_on_startup: bool,
/* END_HADRON */
}
// Like PathBufValueParser, but allows empty string.
@@ -435,6 +440,11 @@ async fn main() -> anyhow::Result<()> {
use_https_safekeeper_api: args.use_https_safekeeper_api,
enable_tls_wal_service_api: args.enable_tls_wal_service_api,
force_metric_collection_on_scrape: args.force_metric_collection_on_scrape,
/* BEGIN_HADRON */
advertise_pg_addr_tenant_only: None,
enable_pull_timeline_on_startup: args.enable_pull_timeline_on_startup,
hcc_base_url: None,
/* END_HADRON */
});
// initialize sentry if SENTRY_DSN is provided
@@ -529,6 +539,20 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
// Load all timelines from disk to memory.
global_timelines.init().await?;
/* BEGIN_HADRON */
if conf.enable_pull_timeline_on_startup && global_timelines.timelines_count() == 0 {
match hadron::hcc_pull_timelines(&conf, global_timelines.clone()).await {
Ok(_) => {
info!("Successfully pulled all timelines from peer safekeepers");
}
Err(e) => {
error!("Failed to pull timelines from peer safekeepers: {:?}", e);
return Err(e);
}
}
}
/* END_HADRON */
// Run everything in current thread rt, if asked.
if conf.current_thread_runtime {
info!("running in current thread runtime");

safekeeper/src/hadron.rs (new file, 388 lines)

@@ -0,0 +1,388 @@
use pem::Pem;
use safekeeper_api::models::PullTimelineRequest;
use std::{collections::HashMap, env::VarError, net::IpAddr, sync::Arc, time::Duration};
use tokio::time::sleep;
use tokio_util::sync::CancellationToken;
use url::Url;
use utils::{backoff, id::TenantTimelineId, ip_address};
use anyhow::Result;
use pageserver_api::controller_api::{
AvailabilityZone, NodeRegisterRequest, SafekeeperTimeline, SafekeeperTimelinesResponse,
};
use crate::{
GlobalTimelines, SafeKeeperConf,
metrics::{
SK_RECOVERY_PULL_TIMELINE_ERRORS, SK_RECOVERY_PULL_TIMELINE_OKS,
SK_RECOVERY_PULL_TIMELINE_SECONDS, SK_RECOVERY_PULL_TIMELINES_SECONDS,
},
pull_timeline,
timelines_global_map::DeleteOrExclude,
};
// Extract information in the SafeKeeperConf to build a NodeRegisterRequest used to register the safekeeper with the HCC.
fn build_node_registeration_request(
conf: &SafeKeeperConf,
node_ip_addr: Option<IpAddr>,
) -> Result<NodeRegisterRequest> {
let advertise_pg_addr_with_port = conf
.advertise_pg_addr_tenant_only
.as_deref()
.expect("advertise_pg_addr_tenant_only is required to register with HCC");
// Extract host/port from the string.
let (advertise_host_addr, pg_port_str) = advertise_pg_addr_with_port.split_at(
advertise_pg_addr_with_port
.rfind(':')
.ok_or(anyhow::anyhow!("Invalid advertise_pg_addr"))?,
);
// Need the `[1..]` to remove the leading ':'.
let pg_port = pg_port_str[1..]
.parse::<u16>()
.map_err(|e| anyhow::anyhow!("Cannot parse PG port: {}", e))?;
let (_, http_port_str) = conf.listen_http_addr.split_at(
conf.listen_http_addr
.rfind(':')
.ok_or(anyhow::anyhow!("Invalid listen_http_addr"))?,
);
let http_port = http_port_str[1..]
.parse::<u16>()
.map_err(|e| anyhow::anyhow!("Cannot parse HTTP port: {}", e))?;
Ok(NodeRegisterRequest {
node_id: conf.my_id,
listen_pg_addr: advertise_host_addr.to_string(),
listen_pg_port: pg_port,
listen_http_addr: advertise_host_addr.to_string(),
listen_http_port: http_port,
node_ip_addr,
availability_zone_id: AvailabilityZone("todo".to_string()),
listen_grpc_addr: None,
listen_grpc_port: None,
listen_https_port: None,
})
}
// Retrieve the JWT token used for authenticating with HCC from the environment variable.
// Returns None if the token cannot be retrieved.
fn get_hcc_auth_token() -> Option<String> {
match std::env::var("HCC_AUTH_TOKEN") {
Ok(v) => {
tracing::info!("Loaded JWT token for authentication with HCC");
Some(v)
}
Err(VarError::NotPresent) => {
tracing::info!("No JWT token for authentication with HCC detected");
None
}
Err(_) => {
tracing::info!(
"Failed to either load to detect non-present HCC_AUTH_TOKEN environment variable"
);
None
}
}
}
async fn send_safekeeper_register_request(
request_url: &Url,
auth_token: &Option<String>,
request: &NodeRegisterRequest,
) -> Result<()> {
let client = reqwest::Client::new();
let mut req_builder = client
.post(request_url.clone())
.header("Content-Type", "application/json");
if let Some(token) = auth_token {
req_builder = req_builder.bearer_auth(token);
}
req_builder
.json(&request)
.send()
.await?
.error_for_status()?;
Ok(())
}
/// Registers this safe keeper with the HCC.
pub async fn register(conf: &SafeKeeperConf) -> Result<()> {
match conf.hcc_base_url.as_ref() {
None => {
tracing::info!("HCC base URL is not set, skipping registration");
Ok(())
}
Some(hcc_base_url) => {
// The following operations acquiring the auth token and the node IP address both read environment
// variables. It's fine for now as this `register()` function is only called once during startup.
// If we start to talk to HCC more regularly in the safekeeper we should probably consider
// refactoring things into a "HadronClusterCoordinatorClient" struct.
let auth_token = get_hcc_auth_token();
let node_ip_addr =
ip_address::read_node_ip_addr_from_env().expect("Error reading node IP address.");
let request = build_node_registeration_request(conf, node_ip_addr)?;
let cancel = CancellationToken::new();
let request_url = hcc_base_url.clone().join("/hadron-internal/v1/sk")?;
backoff::retry(
|| async {
send_safekeeper_register_request(&request_url, &auth_token, &request).await
},
|_| false,
3,
u32::MAX,
"Calling the HCC safekeeper register API",
&cancel,
)
.await
.ok_or(anyhow::anyhow!(
"Error in forever retry loop. This error should never be surfaced."
))?
}
}
}
async fn safekeeper_list_timelines_request(
conf: &SafeKeeperConf,
) -> Result<pageserver_api::controller_api::SafekeeperTimelinesResponse> {
if conf.hcc_base_url.is_none() {
tracing::info!("HCC base URL is not set, skipping registration");
return Err(anyhow::anyhow!("HCC base URL is not set"));
}
// The following operations acquiring the auth token and the node IP address both read environment
// variables. It's fine for now as this `register()` function is only called once during startup.
// If we start to talk to HCC more regularly in the safekeeper we should probably consider
// refactoring things into a "HadronClusterCoordinatorClient" struct.
let auth_token = get_hcc_auth_token();
let method = format!("/control/v1/safekeeper/{}/timelines", conf.my_id.0);
let request_url = conf.hcc_base_url.as_ref().unwrap().clone().join(&method)?;
let client = reqwest::Client::new();
let mut req_builder = client
.get(request_url.clone())
.header("Content-Type", "application/json")
.query(&[("id", conf.my_id.0)]);
if let Some(token) = auth_token {
req_builder = req_builder.bearer_auth(token);
}
let response = req_builder
.send()
.await?
.error_for_status()?
.json::<pageserver_api::controller_api::SafekeeperTimelinesResponse>()
.await?;
Ok(response)
}
// Returns true on success, false otherwise.
pub async fn hcc_pull_timeline(
timeline: SafekeeperTimeline,
conf: &SafeKeeperConf,
global_timelines: Arc<GlobalTimelines>,
nodeid_http: &HashMap<u64, String>,
) -> bool {
let mut request = PullTimelineRequest {
tenant_id: timeline.tenant_id,
timeline_id: timeline.timeline_id,
http_hosts: Vec::new(),
ignore_tombstone: None,
};
for host in timeline.peers {
if host.0 == conf.my_id.0 {
continue;
}
if let Some(http_host) = nodeid_http.get(&host.0) {
request.http_hosts.push(http_host.clone());
}
}
let ca_certs = match conf
.ssl_ca_certs
.iter()
.map(Pem::contents)
.map(reqwest::Certificate::from_der)
.collect::<Result<Vec<_>, _>>()
{
Ok(result) => result,
Err(_) => {
return false;
}
};
match pull_timeline::handle_request(
request,
conf.sk_auth_token.clone(),
ca_certs,
global_timelines.clone(),
true,
)
.await
{
Ok(resp) => {
tracing::info!(
"Completed pulling tenant {} timeline {} from SK {:?}",
timeline.tenant_id,
timeline.timeline_id,
resp.safekeeper_host
);
return true;
}
Err(e) => {
tracing::error!(
"Failed to pull tenant {} timeline {} from SK {}",
timeline.tenant_id,
timeline.timeline_id,
e
);
let ttid = TenantTimelineId {
tenant_id: timeline.tenant_id,
timeline_id: timeline.timeline_id,
};
// Revert the failed timeline pull.
// Note that deleting a timeline that is not found also returns OK.
match global_timelines
.delete_or_exclude(&ttid, DeleteOrExclude::DeleteLocal)
.await
{
Ok(dr) => {
tracing::info!(
"Deleted tenant {} timeline {} DirExists: {}",
timeline.tenant_id,
timeline.timeline_id,
dr.dir_existed,
);
}
Err(e) => {
tracing::error!(
"Failed to delete tenant {} timeline {} from global_timelines: {}",
timeline.tenant_id,
timeline.timeline_id,
e
);
}
}
}
}
false
}
pub async fn hcc_pull_timeline_till_success(
timeline: SafekeeperTimeline,
conf: &SafeKeeperConf,
global_timelines: Arc<GlobalTimelines>,
nodeid_http: &HashMap<u64, String>,
) {
const MAX_PULL_TIMELINE_RETRIES: u64 = 100;
for i in 0..MAX_PULL_TIMELINE_RETRIES {
if hcc_pull_timeline(
timeline.clone(),
conf,
global_timelines.clone(),
nodeid_http,
)
.await
{
SK_RECOVERY_PULL_TIMELINE_OKS.inc();
return;
}
tracing::error!(
"Failed to pull timeline {} from SK peers, retrying {}/{}",
timeline.timeline_id,
i + 1,
MAX_PULL_TIMELINE_RETRIES
);
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
}
SK_RECOVERY_PULL_TIMELINE_ERRORS.inc();
}
pub async fn hcc_pull_timelines(
conf: &SafeKeeperConf,
global_timelines: Arc<GlobalTimelines>,
) -> Result<()> {
let _timer = SK_RECOVERY_PULL_TIMELINES_SECONDS.start_timer();
tracing::info!("Start pulling timelines from SK peers");
let mut response = SafekeeperTimelinesResponse {
timelines: Vec::new(),
safekeeper_peers: Vec::new(),
};
for i in 0..100 {
match safekeeper_list_timelines_request(conf).await {
Ok(timelines) => {
response = timelines;
}
Err(e) => {
tracing::error!("Failed to list timelines from HCC: {}", e);
if i == 99 {
return Err(e);
}
}
}
sleep(Duration::from_millis(100)).await;
}
let mut nodeid_http = HashMap::new();
for sk in response.safekeeper_peers {
nodeid_http.insert(
sk.node_id.0,
format!("http://{}:{}", sk.listen_http_addr, sk.http_port),
);
}
tracing::info!("Received {} timelines from HCC", response.timelines.len());
for timeline in response.timelines {
let _timer = SK_RECOVERY_PULL_TIMELINE_SECONDS
.with_label_values(&[
&timeline.tenant_id.to_string(),
&timeline.timeline_id.to_string(),
])
.start_timer();
hcc_pull_timeline_till_success(timeline, conf, global_timelines.clone(), &nodeid_http)
.await;
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use utils::id::NodeId;
#[test]
fn test_build_node_registeration_request() {
// Test that:
// 1. We always extract the host name and port used to register with the HCC from the
// `advertise_pg_addr` if it is set.
// 2. The correct ports are extracted from `advertise_pg_addr` and `listen_http_addr`.
let mut conf = SafeKeeperConf::dummy();
conf.my_id = NodeId(1);
conf.advertise_pg_addr_tenant_only =
Some("safe-keeper-1.safe-keeper.hadron.svc.cluster.local:5454".to_string());
// `listen_pg_addr` and `listen_pg_addr_tenant_only` are not used for node registration. Set them to a different
// host and port values and make sure that they don't show up in the node registration request.
conf.listen_pg_addr = "0.0.0.0:5456".to_string();
conf.listen_pg_addr_tenant_only = Some("0.0.0.0:5456".to_string());
conf.listen_http_addr = "0.0.0.0:7676".to_string();
let node_ip_addr: Option<IpAddr> = Some("127.0.0.1".parse().unwrap());
let request = build_node_registeration_request(&conf, node_ip_addr).unwrap();
assert_eq!(request.node_id, NodeId(1));
assert_eq!(
request.listen_pg_addr,
"safe-keeper-1.safe-keeper.hadron.svc.cluster.local"
);
assert_eq!(request.listen_pg_port, 5454);
assert_eq!(
request.listen_http_addr,
"safe-keeper-1.safe-keeper.hadron.svc.cluster.local"
);
assert_eq!(request.listen_http_port, 7676);
assert_eq!(
request.node_ip_addr,
Some(IpAddr::V4("127.0.0.1".parse().unwrap()))
);
}
}


@@ -241,9 +241,14 @@ async fn timeline_pull_handler(mut request: Request<Body>) -> Result<Response<Bo
ApiError::InternalServerError(anyhow::anyhow!("failed to parse CA certs: {e}"))
})?;
let resp =
pull_timeline::handle_request(data, conf.sk_auth_token.clone(), ca_certs, global_timelines)
.await?;
let resp = pull_timeline::handle_request(
data,
conf.sk_auth_token.clone(),
ca_certs,
global_timelines,
false,
)
.await?;
json_response(StatusCode::OK, resp)
}


@@ -10,6 +10,7 @@ use pem::Pem;
use remote_storage::RemoteStorageConfig;
use storage_broker::Uri;
use tokio::runtime::Runtime;
use url::Url;
use utils::auth::SwappableJwtAuth;
use utils::id::NodeId;
use utils::logging::SecretString;
@@ -20,6 +21,7 @@ pub mod control_file;
pub mod control_file_upgrade;
pub mod copy_timeline;
pub mod debug_dump;
pub mod hadron;
pub mod handler;
pub mod http;
pub mod metrics;
@@ -100,6 +102,11 @@ pub struct SafeKeeperConf {
pub advertise_pg_addr: Option<String>,
pub availability_zone: Option<String>,
pub no_sync: bool,
/* BEGIN_HADRON */
pub advertise_pg_addr_tenant_only: Option<String>,
pub enable_pull_timeline_on_startup: bool,
pub hcc_base_url: Option<Url>,
/* END_HADRON */
pub broker_endpoint: Uri,
pub broker_keepalive_interval: Duration,
pub heartbeat_timeout: Duration,
@@ -185,6 +192,11 @@ impl SafeKeeperConf {
use_https_safekeeper_api: false,
enable_tls_wal_service_api: false,
force_metric_collection_on_scrape: true,
/* BEGIN_HADRON */
advertise_pg_addr_tenant_only: None,
enable_pull_timeline_on_startup: false,
hcc_base_url: None,
/* END_HADRON */
}
}
}


@@ -85,6 +85,43 @@ pub static WAL_STORAGE_LIMIT_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
)
.expect("Failed to register safekeeper_wal_storage_limit_errors counter")
});
pub static SK_RECOVERY_PULL_TIMELINE_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"safekeeper_recovery_pull_timeline_errors",
concat!(
"Number of errors due to pull_timeline errors during SK lost disk recovery.",
"An increase in this metric indicates pull timelines runs into error."
)
)
.expect("Failed to register safekeeper_recovery_pull_timeline_errors counter")
});
pub static SK_RECOVERY_PULL_TIMELINE_OKS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"safekeeper_recovery_pull_timeline_oks",
concat!(
"Number of successful pull_timeline during SK lost disk recovery.",
"An increase in this metric indicates pull timelines is successful."
)
)
.expect("Failed to register safekeeper_recovery_pull_timeline_oks counter")
});
pub static SK_RECOVERY_PULL_TIMELINES_SECONDS: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"safekeeper_recovery_pull_timelines_seconds",
"Seconds to pull timelines",
DISK_FSYNC_SECONDS_BUCKETS.to_vec()
)
.expect("Failed to register safekeeper_recovery_pull_timelines_seconds histogram")
});
pub static SK_RECOVERY_PULL_TIMELINE_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"safekeeper_recovery_pull_timeline_seconds",
"Seconds to pull timeline",
&["tenant_id", "timeline_id"],
DISK_FSYNC_SECONDS_BUCKETS.to_vec()
)
.expect("Failed to register safekeeper_recovery_pull_timeline_seconds histogram vec")
});
/* END_HADRON */
pub static PERSIST_CONTROL_FILE_SECONDS: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(


@@ -8,6 +8,7 @@ use bytes::Bytes;
use camino::Utf8PathBuf;
use chrono::{DateTime, Utc};
use futures::{SinkExt, StreamExt, TryStreamExt};
use http::StatusCode;
use http_utils::error::ApiError;
use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo};
use remote_storage::GenericRemoteStorage;
@@ -21,10 +22,11 @@ use tokio::fs::OpenOptions;
use tokio::io::AsyncWrite;
use tokio::sync::mpsc;
use tokio::task;
use tokio::time::sleep;
use tokio_tar::{Archive, Builder, Header};
use tokio_util::io::{CopyToBytes, SinkWriter};
use tokio_util::sync::PollSender;
use tracing::{error, info, instrument};
use tracing::{error, info, instrument, warn};
use utils::crashsafe::fsync_async_opt;
use utils::id::{NodeId, TenantTimelineId};
use utils::logging::SecretString;
@@ -449,6 +451,7 @@ pub async fn handle_request(
sk_auth_token: Option<SecretString>,
ssl_ca_certs: Vec<Certificate>,
global_timelines: Arc<GlobalTimelines>,
wait_for_peer_timeline_status: bool,
) -> Result<PullTimelineResponse, ApiError> {
let existing_tli = global_timelines.get(TenantTimelineId::new(
request.tenant_id,
@@ -472,37 +475,100 @@ pub async fn handle_request(
let http_hosts = request.http_hosts.clone();
// Figure out statuses of potential donors.
let responses: Vec<Result<TimelineStatus, mgmt_api::Error>> =
futures::future::join_all(http_hosts.iter().map(|url| async {
let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone());
let info = cclient
.timeline_status(request.tenant_id, request.timeline_id)
.await?;
Ok(info)
}))
.await;
let mut statuses = Vec::new();
for (i, response) in responses.into_iter().enumerate() {
match response {
Ok(status) => {
statuses.push((status, i));
}
Err(e) => {
info!("error fetching status from {}: {e}", http_hosts[i]);
if !wait_for_peer_timeline_status {
let responses: Vec<Result<TimelineStatus, mgmt_api::Error>> =
futures::future::join_all(http_hosts.iter().map(|url| async {
let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone());
let resp = cclient
.timeline_status(request.tenant_id, request.timeline_id)
.await?;
let info: TimelineStatus = resp
.json()
.await
.context("Failed to deserialize timeline status")
.map_err(|e| mgmt_api::Error::ReceiveErrorBody(e.to_string()))?;
Ok(info)
}))
.await;
for (i, response) in responses.into_iter().enumerate() {
match response {
Ok(status) => {
statuses.push((status, i));
}
Err(e) => {
info!("error fetching status from {}: {e}", http_hosts[i]);
}
}
}
}
// Allow missing responses from up to one safekeeper (say due to downtime)
// e.g. if we created a timeline on PS A and B, with C being offline. Then B goes
// offline and C comes online. Then we want a pull on C with A and B as hosts to work.
let min_required_successful = (http_hosts.len() - 1).max(1);
if statuses.len() < min_required_successful {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"only got {} successful status responses. required: {min_required_successful}",
statuses.len()
)));
// Allow missing responses from up to one safekeeper (say due to downtime)
// e.g. if we created a timeline on PS A and B, with C being offline. Then B goes
// offline and C comes online. Then we want a pull on C with A and B as hosts to work.
let min_required_successful = (http_hosts.len() - 1).max(1);
if statuses.len() < min_required_successful {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"only got {} successful status responses. required: {min_required_successful}",
statuses.len()
)));
}
} else {
let mut retry = true;
// We must get status from all other peers.
// Otherwise, we may run into split-brain scenario.
while retry {
statuses.clear();
retry = false;
for (i, url) in http_hosts.iter().enumerate() {
let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone());
match cclient
.timeline_status(request.tenant_id, request.timeline_id)
.await
{
Ok(resp) => {
if resp.status() == StatusCode::NOT_FOUND {
warn!(
"Timeline {} not found on peer SK {}, no need to pull it",
TenantTimelineId::new(request.tenant_id, request.timeline_id),
url
);
return Ok(PullTimelineResponse {
safekeeper_host: None,
});
}
let info: TimelineStatus = resp
.json()
.await
.context("Failed to deserialize timeline status")
.map_err(ApiError::InternalServerError)?;
statuses.push((info, i));
}
Err(e) => {
match e {
// If we get a 404, it means the timeline doesn't exist on this safekeeper.
// We can ignore this error.
mgmt_api::Error::ApiError(status, _)
if status == StatusCode::NOT_FOUND =>
{
warn!(
"Timeline {} not found on peer SK {}, no need to pull it",
TenantTimelineId::new(request.tenant_id, request.timeline_id),
url
);
return Ok(PullTimelineResponse {
safekeeper_host: None,
});
}
_ => {}
}
retry = true;
error!("Failed to get timeline status from {}: {:#}", url, e);
}
}
}
sleep(std::time::Duration::from_millis(100)).await;
}
}
// Find the most advanced safekeeper
@@ -511,6 +577,12 @@ pub async fn handle_request(
.max_by_key(|(status, _)| {
(
status.acceptor_state.epoch,
/* BEGIN_HADRON */
// We need to pull from the SK with the highest term.
// This is because another compute may come online and vote the same highest term again on the other two SKs.
// Then, there will be 2 computes running on the same term.
status.acceptor_state.term,
/* END_HADRON */
status.flush_lsn,
status.commit_lsn,
)


@@ -191,6 +191,11 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
use_https_safekeeper_api: false,
enable_tls_wal_service_api: false,
force_metric_collection_on_scrape: true,
/* BEGIN_HADRON */
enable_pull_timeline_on_startup: false,
advertise_pg_addr_tenant_only: None,
hcc_base_url: None,
/* END_HADRON */
};
let mut global = GlobalMap::new(disk, conf.clone())?;