Replace callmemaybe with etcd subscriptions on safekeeper timeline info

2026-01-06 21:12:55 +00:00 · 2022-04-22 13:56:48 +03:00
parent 6623c5b9d5
commit e5cb727572
25 changed files with 1968 additions and 693 deletions
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -5,7 +5,7 @@ edition = "2021"

 [features]
 # It is simpler infra-wise to have failpoints enabled by default
-# It shouldn't affect perf in any way because failpoints
+# It shouldn't affect performance in any way because failpoints
 # are not placed in hot code paths
 default = ["failpoints"]
 profiling = ["pprof"]
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -480,6 +480,21 @@ impl PageServerConf {
        if let Some(pitr_interval) = item.get("pitr_interval") {
            t_conf.pitr_interval = Some(parse_toml_duration("pitr_interval", pitr_interval)?);
        }
+        if let Some(walreceiver_connect_timeout) = item.get("walreceiver_connect_timeout") {
+            t_conf.walreceiver_connect_timeout = Some(parse_toml_duration(
+                "walreceiver_connect_timeout",
+                walreceiver_connect_timeout,
+            )?);
+        }
+        if let Some(lagging_wal_timeout) = item.get("lagging_wal_timeout") {
+            t_conf.lagging_wal_timeout = Some(parse_toml_duration(
+                "lagging_wal_timeout",
+                lagging_wal_timeout,
+            )?);
+        }
+        if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") {
+            t_conf.max_lsn_wal_lag = Some(parse_toml_from_str("max_lsn_wal_lag", max_lsn_wal_lag)?);
+        }

        Ok(t_conf)
    }
--- a/pageserver/src/http/models.rs
+++ b/pageserver/src/http/models.rs
@@ -1,3 +1,5 @@
+use std::num::NonZeroU64;
+
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use utils::{
@@ -33,6 +35,9 @@ pub struct TenantCreateRequest {
    pub gc_period: Option<String>,
    pub image_creation_threshold: Option<usize>,
    pub pitr_interval: Option<String>,
+    pub walreceiver_connect_timeout: Option<String>,
+    pub lagging_wal_timeout: Option<String>,
+    pub max_lsn_wal_lag: Option<NonZeroU64>,
 }

 #[serde_as]
@@ -68,6 +73,9 @@ pub struct TenantConfigRequest {
    pub gc_period: Option<String>,
    pub image_creation_threshold: Option<usize>,
    pub pitr_interval: Option<String>,
+    pub walreceiver_connect_timeout: Option<String>,
+    pub lagging_wal_timeout: Option<String>,
+    pub max_lsn_wal_lag: Option<NonZeroU64>,
 }

 impl TenantConfigRequest {
@@ -82,6 +90,21 @@ impl TenantConfigRequest {
            gc_period: None,
            image_creation_threshold: None,
            pitr_interval: None,
+            walreceiver_connect_timeout: None,
+            lagging_wal_timeout: None,
+            max_lsn_wal_lag: None,
        }
    }
 }
+
+/// A WAL receiver's data stored inside the global `WAL_RECEIVERS`.
+/// We keep one WAL receiver active per timeline.
+#[serde_as]
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct WalReceiverEntry {
+    pub wal_producer_connstr: Option<String>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub last_received_msg_lsn: Option<Lsn>,
+    /// the timestamp (in microseconds) of the last received message
+    pub last_received_msg_ts: Option<u128>,
+}
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -229,23 +229,16 @@ async fn wal_receiver_get_handler(request: Request<Body>) -> Result<Response<Bod
    check_permission(&request, Some(tenant_id))?;

    let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
+    let wal_receiver_entry = crate::walreceiver::get_wal_receiver_entry(tenant_id, timeline_id)
+        .instrument(info_span!("wal_receiver_get", tenant = %tenant_id, timeline = %timeline_id))
+        .await
+        .ok_or_else(|| {
+            ApiError::NotFound(format!(
+                "WAL receiver data not found for tenant {tenant_id} and timeline {timeline_id}"
+            ))
+        })?;

-    let wal_receiver = tokio::task::spawn_blocking(move || {
-        let _enter =
-            info_span!("wal_receiver_get", tenant = %tenant_id, timeline = %timeline_id).entered();
-
-        crate::walreceiver::get_wal_receiver_entry(tenant_id, timeline_id)
-    })
-    .await
-    .map_err(ApiError::from_err)?
-    .ok_or_else(|| {
-        ApiError::NotFound(format!(
-            "WAL receiver not found for tenant {} and timeline {}",
-            tenant_id, timeline_id
-        ))
-    })?;
-
-    json_response(StatusCode::OK, wal_receiver)
+    json_response(StatusCode::OK, &wal_receiver_entry)
 }

 async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -402,6 +395,19 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
            Some(humantime::parse_duration(&pitr_interval).map_err(ApiError::from_err)?);
    }

+    if let Some(walreceiver_connect_timeout) = request_data.walreceiver_connect_timeout {
+        tenant_conf.walreceiver_connect_timeout = Some(
+            humantime::parse_duration(&walreceiver_connect_timeout).map_err(ApiError::from_err)?,
+        );
+    }
+    if let Some(lagging_wal_timeout) = request_data.lagging_wal_timeout {
+        tenant_conf.lagging_wal_timeout =
+            Some(humantime::parse_duration(&lagging_wal_timeout).map_err(ApiError::from_err)?);
+    }
+    if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
+        tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
+    }
+
    tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
    tenant_conf.compaction_target_size = request_data.compaction_target_size;
    tenant_conf.compaction_threshold = request_data.compaction_threshold;
@@ -449,6 +455,18 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
        tenant_conf.pitr_interval =
            Some(humantime::parse_duration(&pitr_interval).map_err(ApiError::from_err)?);
    }
+    if let Some(walreceiver_connect_timeout) = request_data.walreceiver_connect_timeout {
+        tenant_conf.walreceiver_connect_timeout = Some(
+            humantime::parse_duration(&walreceiver_connect_timeout).map_err(ApiError::from_err)?,
+        );
+    }
+    if let Some(lagging_wal_timeout) = request_data.lagging_wal_timeout {
+        tenant_conf.lagging_wal_timeout =
+            Some(humantime::parse_duration(&lagging_wal_timeout).map_err(ApiError::from_err)?);
+    }
+    if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
+        tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
+    }

    tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
    tenant_conf.compaction_target_size = request_data.compaction_target_size;
--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
@@ -25,6 +25,7 @@ use std::collections::{BTreeSet, HashSet};
 use std::fs;
 use std::fs::{File, OpenOptions};
 use std::io::Write;
+use std::num::NonZeroU64;
 use std::ops::{Bound::Included, Deref, Range};
 use std::path::{Path, PathBuf};
 use std::sync::atomic::{self, AtomicBool};
@@ -557,6 +558,27 @@ impl LayeredRepository {
            .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
    }

+    pub fn get_wal_receiver_connect_timeout(&self) -> Duration {
+        let tenant_conf = self.tenant_conf.read().unwrap();
+        tenant_conf
+            .walreceiver_connect_timeout
+            .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout)
+    }
+
+    pub fn get_lagging_wal_timeout(&self) -> Duration {
+        let tenant_conf = self.tenant_conf.read().unwrap();
+        tenant_conf
+            .lagging_wal_timeout
+            .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout)
+    }
+
+    pub fn get_max_lsn_wal_lag(&self) -> NonZeroU64 {
+        let tenant_conf = self.tenant_conf.read().unwrap();
+        tenant_conf
+            .max_lsn_wal_lag
+            .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag)
+    }
+
    pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) -> Result<()> {
        let mut tenant_conf = self.tenant_conf.write().unwrap();

--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -7,7 +7,6 @@
 //     *status* -- show actual info about this pageserver,
 //     *pagestream* -- enter mode where smgr and pageserver talk with their
 //  custom protocol.
-//     *callmemaybe <zenith timelineid> $url* -- ask pageserver to start walreceiver on $url
 //

 use anyhow::{bail, ensure, Context, Result};
@@ -38,7 +37,6 @@ use crate::repository::Timeline;
 use crate::tenant_mgr;
 use crate::thread_mgr;
 use crate::thread_mgr::ThreadKind;
-use crate::walreceiver;
 use crate::CheckpointConfig;
 use metrics::{register_histogram_vec, HistogramVec};
 use postgres_ffi::xlog_utils::to_pg_timestamp;
@@ -716,30 +714,6 @@ impl postgres_backend::Handler for PageServerHandler {

            // Check that the timeline exists
            self.handle_basebackup_request(pgb, timelineid, lsn, tenantid)?;
-            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-        } else if query_string.starts_with("callmemaybe ") {
-            // callmemaybe <zenith tenantid as hex string> <zenith timelineid as hex string> <connstr>
-            // TODO lazy static
-            let re = Regex::new(r"^callmemaybe ([[:xdigit:]]+) ([[:xdigit:]]+) (.*)$").unwrap();
-            let caps = re
-                .captures(query_string)
-                .with_context(|| format!("invalid callmemaybe: '{}'", query_string))?;
-
-            let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
-            let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
-            let connstr = caps.get(3).unwrap().as_str().to_owned();
-
-            self.check_permission(Some(tenantid))?;
-
-            let _enter =
-                info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered();
-
-            // Check that the timeline exists
-            tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
-                .context("Cannot load local timeline")?;
-
-            walreceiver::launch_wal_receiver(self.conf, tenantid, timelineid, &connstr)?;
-
            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else if query_string.to_ascii_lowercase().starts_with("set ") {
            // important because psycopg2 executes "SET datestyle TO 'ISO'"
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -469,6 +469,9 @@ pub mod repo_harness {
                gc_period: Some(tenant_conf.gc_period),
                image_creation_threshold: Some(tenant_conf.image_creation_threshold),
                pitr_interval: Some(tenant_conf.pitr_interval),
+                walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
+                lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
+                max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
            }
        }
    }
--- a/pageserver/src/tenant_config.rs
+++ b/pageserver/src/tenant_config.rs
@@ -10,6 +10,7 @@
 //!
 use crate::config::PageServerConf;
 use serde::{Deserialize, Serialize};
+use std::num::NonZeroU64;
 use std::path::PathBuf;
 use std::time::Duration;
 use utils::zid::ZTenantId;
@@ -34,6 +35,9 @@ pub mod defaults {
    pub const DEFAULT_GC_PERIOD: &str = "100 s";
    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
    pub const DEFAULT_PITR_INTERVAL: &str = "30 days";
+    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
+    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
+    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1_000_000;
 }

 /// Per-tenant configuration options
@@ -68,6 +72,17 @@ pub struct TenantConf {
    // Page versions older than this are garbage collected away.
    #[serde(with = "humantime_serde")]
    pub pitr_interval: Duration,
+    /// Maximum amount of time to wait while opening a connection to receive wal, before erroring.
+    #[serde(with = "humantime_serde")]
+    pub walreceiver_connect_timeout: Duration,
+    /// Considers safekeepers stalled after no WAL updates were received longer than this threshold.
+    /// A stalled safekeeper will be changed to a newer one when it appears.
+    #[serde(with = "humantime_serde")]
+    pub lagging_wal_timeout: Duration,
+    /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold.
+    /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
+    /// to avoid eager reconnects.
+    pub max_lsn_wal_lag: NonZeroU64,
 }

 /// Same as TenantConf, but this struct preserves the information about
@@ -85,6 +100,11 @@ pub struct TenantConfOpt {
    pub image_creation_threshold: Option<usize>,
    #[serde(with = "humantime_serde")]
    pub pitr_interval: Option<Duration>,
+    #[serde(with = "humantime_serde")]
+    pub walreceiver_connect_timeout: Option<Duration>,
+    #[serde(with = "humantime_serde")]
+    pub lagging_wal_timeout: Option<Duration>,
+    pub max_lsn_wal_lag: Option<NonZeroU64>,
 }

 impl TenantConfOpt {
@@ -108,6 +128,13 @@ impl TenantConfOpt {
                .image_creation_threshold
                .unwrap_or(global_conf.image_creation_threshold),
            pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval),
+            walreceiver_connect_timeout: self
+                .walreceiver_connect_timeout
+                .unwrap_or(global_conf.walreceiver_connect_timeout),
+            lagging_wal_timeout: self
+                .lagging_wal_timeout
+                .unwrap_or(global_conf.lagging_wal_timeout),
+            max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag),
        }
    }

@@ -136,6 +163,15 @@ impl TenantConfOpt {
        if let Some(pitr_interval) = other.pitr_interval {
            self.pitr_interval = Some(pitr_interval);
        }
+        if let Some(walreceiver_connect_timeout) = other.walreceiver_connect_timeout {
+            self.walreceiver_connect_timeout = Some(walreceiver_connect_timeout);
+        }
+        if let Some(lagging_wal_timeout) = other.lagging_wal_timeout {
+            self.lagging_wal_timeout = Some(lagging_wal_timeout);
+        }
+        if let Some(max_lsn_wal_lag) = other.max_lsn_wal_lag {
+            self.max_lsn_wal_lag = Some(max_lsn_wal_lag);
+        }
    }
 }

@@ -155,6 +191,14 @@ impl TenantConf {
            image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
            pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
                .expect("cannot parse default PITR interval"),
+            walreceiver_connect_timeout: humantime::parse_duration(
+                DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
+            )
+            .expect("cannot parse default walreceiver connect timeout"),
+            lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT)
+                .expect("cannot parse default walreceiver lagging wal timeout"),
+            max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
+                .expect("cannot parse default max walreceiver Lsn wal lag"),
        }
    }

@@ -175,6 +219,16 @@ impl TenantConf {
            gc_period: Duration::from_secs(10),
            image_creation_threshold: defaults::DEFAULT_IMAGE_CREATION_THRESHOLD,
            pitr_interval: Duration::from_secs(60 * 60),
+            walreceiver_connect_timeout: humantime::parse_duration(
+                defaults::DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
+            )
+            .unwrap(),
+            lagging_wal_timeout: humantime::parse_duration(
+                defaults::DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT,
+            )
+            .unwrap(),
+            max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
+                .unwrap(),
        }
    }
 }
--- a/pageserver/src/tenant_mgr.rs
+++ b/pageserver/src/tenant_mgr.rs
@@ -8,11 +8,10 @@ use crate::repository::{Repository, TimelineSyncStatusUpdate};
 use crate::storage_sync::index::RemoteIndex;
 use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData};
 use crate::tenant_config::TenantConfOpt;
-use crate::thread_mgr;
 use crate::thread_mgr::ThreadKind;
-use crate::timelines;
 use crate::timelines::CreateRepo;
 use crate::walredo::PostgresRedoManager;
+use crate::{thread_mgr, timelines, walreceiver};
 use crate::{DatadirTimelineImpl, RepositoryImpl};
 use anyhow::{bail, Context};
 use serde::{Deserialize, Serialize};
@@ -21,23 +20,30 @@ use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::fmt;
 use std::sync::Arc;
+use tokio::sync::mpsc;
 use tracing::*;
 use utils::lsn::Lsn;

-use utils::zid::{ZTenantId, ZTimelineId};
+use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};

 mod tenants_state {
+    use anyhow::ensure;
    use std::{
        collections::HashMap,
        sync::{RwLock, RwLockReadGuard, RwLockWriteGuard},
    };
+    use tokio::sync::mpsc;
+    use tracing::{debug, error};

    use utils::zid::ZTenantId;

-    use crate::tenant_mgr::Tenant;
+    use crate::tenant_mgr::{LocalTimelineUpdate, Tenant};

    lazy_static::lazy_static! {
        static ref TENANTS: RwLock<HashMap<ZTenantId, Tenant>> = RwLock::new(HashMap::new());
+        /// Sends updates to the local timelines (creation and deletion) to the WAL receiver,
+        /// so that it can enable/disable corresponding processes.
+        static ref TIMELINE_UPDATE_SENDER: RwLock<Option<mpsc::UnboundedSender<LocalTimelineUpdate>>> = RwLock::new(None);
    }

    pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap<ZTenantId, Tenant>> {
@@ -51,6 +57,39 @@ mod tenants_state {
            .write()
            .expect("Failed to write() tenants lock, it got poisoned")
    }
+
+    pub(super) fn set_timeline_update_sender(
+        timeline_updates_sender: mpsc::UnboundedSender<LocalTimelineUpdate>,
+    ) -> anyhow::Result<()> {
+        let mut sender_guard = TIMELINE_UPDATE_SENDER
+            .write()
+            .expect("Failed to write() timeline_update_sender lock, it got poisoned");
+        ensure!(sender_guard.is_none(), "Timeline update sender already set");
+        *sender_guard = Some(timeline_updates_sender);
+        Ok(())
+    }
+
+    pub(super) fn try_send_timeline_update(update: LocalTimelineUpdate) {
+        match TIMELINE_UPDATE_SENDER
+            .read()
+            .expect("Failed to read() timeline_update_sender lock, it got poisoned")
+            .as_ref()
+        {
+            Some(sender) => {
+                if let Err(e) = sender.send(update) {
+                    error!("Failed to send timeline update: {}", e);
+                }
+            }
+            None => debug!("Timeline update sender is not enabled, cannot send update {update:?}"),
+        }
+    }
+
+    pub(super) fn stop_timeline_update_sender() {
+        TIMELINE_UPDATE_SENDER
+            .write()
+            .expect("Failed to write() timeline_update_sender lock, it got poisoned")
+            .take();
+    }
 }

 struct Tenant {
@@ -87,10 +126,10 @@ pub enum TenantState {
 impl fmt::Display for TenantState {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
-            TenantState::Active => f.write_str("Active"),
-            TenantState::Idle => f.write_str("Idle"),
-            TenantState::Stopping => f.write_str("Stopping"),
-            TenantState::Broken => f.write_str("Broken"),
+            Self::Active => f.write_str("Active"),
+            Self::Idle => f.write_str("Idle"),
+            Self::Stopping => f.write_str("Stopping"),
+            Self::Broken => f.write_str("Broken"),
        }
    }
 }
@@ -99,6 +138,11 @@ impl fmt::Display for TenantState {
 /// Timelines that are only partially available locally (remote storage has more data than this pageserver)
 /// are scheduled for download and added to the repository once download is completed.
 pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result<RemoteIndex> {
+    let (timeline_updates_sender, timeline_updates_receiver) =
+        mpsc::unbounded_channel::<LocalTimelineUpdate>();
+    tenants_state::set_timeline_update_sender(timeline_updates_sender)?;
+    walreceiver::init_wal_receiver_main_thread(conf, timeline_updates_receiver)?;
+
    let SyncStartupData {
        remote_index,
        local_timeline_init_statuses,
@@ -113,16 +157,27 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result<RemoteIn
            // loading a tenant is serious, but it's better to complete the startup and
            // serve other tenants, than fail completely.
            error!("Failed to initialize local tenant {tenant_id}: {:?}", err);
-            let mut m = tenants_state::write_tenants();
-            if let Some(tenant) = m.get_mut(&tenant_id) {
-                tenant.state = TenantState::Broken;
-            }
+            set_tenant_state(tenant_id, TenantState::Broken)?;
        }
    }

    Ok(remote_index)
 }

+pub enum LocalTimelineUpdate {
+    Detach(ZTenantTimelineId),
+    Attach(ZTenantTimelineId, Arc<DatadirTimelineImpl>),
+}
+
+impl std::fmt::Debug for LocalTimelineUpdate {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Self::Detach(ttid) => f.debug_tuple("Remove").field(ttid).finish(),
+            Self::Attach(ttid, _) => f.debug_tuple("Add").field(ttid).finish(),
+        }
+    }
+}
+
 /// Updates tenants' repositories, changing their timelines state in memory.
 pub fn apply_timeline_sync_status_updates(
    conf: &'static PageServerConf,
@@ -160,6 +215,7 @@ pub fn apply_timeline_sync_status_updates(
 /// Shut down all tenants. This runs as part of pageserver shutdown.
 ///
 pub fn shutdown_all_tenants() {
+    tenants_state::stop_timeline_update_sender();
    let mut m = tenants_state::write_tenants();
    let mut tenantids = Vec::new();
    for (tenantid, tenant) in m.iter_mut() {
@@ -173,7 +229,7 @@ pub fn shutdown_all_tenants() {
    }
    drop(m);

-    thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiver), None, None);
+    thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiverManager), None, None);
    thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), None, None);
    thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), None, None);

@@ -247,32 +303,49 @@ pub fn get_tenant_state(tenantid: ZTenantId) -> Option<TenantState> {
    Some(tenants_state::read_tenants().get(&tenantid)?.state)
 }

-///
-/// Change the state of a tenant to Active and launch its compactor and GC
-/// threads. If the tenant was already in Active state or Stopping, does nothing.
-///
-pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> {
+pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow::Result<()> {
    let mut m = tenants_state::write_tenants();
    let tenant = m
        .get_mut(&tenant_id)
        .with_context(|| format!("Tenant not found for id {tenant_id}"))?;
+    let old_state = tenant.state;
+    tenant.state = new_state;
+    drop(m);

-    info!("activating tenant {tenant_id}");
-
-    match tenant.state {
-        // If the tenant is already active, nothing to do.
-        TenantState::Active => {}
-
-        // If it's Idle, launch the compactor and GC threads
-        TenantState::Idle => {
-            thread_mgr::spawn(
+    match (old_state, new_state) {
+        (TenantState::Broken, TenantState::Broken)
+        | (TenantState::Active, TenantState::Active)
+        | (TenantState::Idle, TenantState::Idle)
+        | (TenantState::Stopping, TenantState::Stopping) => {
+            debug!("tenant {tenant_id} already in state {new_state}");
+        }
+        (TenantState::Broken, ignored) => {
+            debug!("Ignoring {ignored} since tenant {tenant_id} is in broken state");
+        }
+        (_, TenantState::Broken) => {
+            debug!("Setting tenant {tenant_id} status to broken");
+        }
+        (TenantState::Stopping, ignored) => {
+            debug!("Ignoring {ignored} since tenant {tenant_id} is in stopping state");
+        }
+        (TenantState::Idle, TenantState::Active) => {
+            info!("activating tenant {tenant_id}");
+            let compactor_spawn_result = thread_mgr::spawn(
                ThreadKind::Compactor,
                Some(tenant_id),
                None,
                "Compactor thread",
                false,
                move || crate::tenant_threads::compact_loop(tenant_id),
-            )?;
+            );
+            if compactor_spawn_result.is_err() {
+                let mut m = tenants_state::write_tenants();
+                m.get_mut(&tenant_id)
+                    .with_context(|| format!("Tenant not found for id {tenant_id}"))?
+                    .state = old_state;
+                drop(m);
+            }
+            compactor_spawn_result?;

            let gc_spawn_result = thread_mgr::spawn(
                ThreadKind::GarbageCollector,
@@ -286,21 +359,31 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> {
            .with_context(|| format!("Failed to launch GC thread for tenant {tenant_id}"));

            if let Err(e) = &gc_spawn_result {
+                let mut m = tenants_state::write_tenants();
+                m.get_mut(&tenant_id)
+                    .with_context(|| format!("Tenant not found for id {tenant_id}"))?
+                    .state = old_state;
+                drop(m);
                error!("Failed to start GC thread for tenant {tenant_id}, stopping its checkpointer thread: {e:?}");
                thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None);
                return gc_spawn_result;
            }
-            tenant.state = TenantState::Active;
        }
-
-        TenantState::Stopping => {
-            // don't re-activate it if it's being stopped
+        (TenantState::Idle, TenantState::Stopping) => {
+            info!("stopping idle tenant {tenant_id}");
        }
-
-        TenantState::Broken => {
-            // cannot activate
+        (TenantState::Active, TenantState::Stopping | TenantState::Idle) => {
+            info!("stopping tenant {tenant_id} threads due to new state {new_state}");
+            thread_mgr::shutdown_threads(
+                Some(ThreadKind::WalReceiverManager),
+                Some(tenant_id),
+                None,
+            );
+            thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), Some(tenant_id), None);
+            thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None);
        }
    }
+
    Ok(())
 }

@@ -325,15 +408,15 @@ pub fn get_local_timeline_with_load(
        .with_context(|| format!("Tenant {tenant_id} not found"))?;

    if let Some(page_tline) = tenant.local_timelines.get(&timeline_id) {
-        return Ok(Arc::clone(page_tline));
+        Ok(Arc::clone(page_tline))
+    } else {
+        let page_tline = load_local_timeline(&tenant.repo, timeline_id)
+            .with_context(|| format!("Failed to load local timeline for tenant {tenant_id}"))?;
+        tenant
+            .local_timelines
+            .insert(timeline_id, Arc::clone(&page_tline));
+        Ok(page_tline)
    }
-
-    let page_tline = load_local_timeline(&tenant.repo, timeline_id)
-        .with_context(|| format!("Failed to load local timeline for tenant {tenant_id}"))?;
-    tenant
-        .local_timelines
-        .insert(timeline_id, Arc::clone(&page_tline));
-    Ok(page_tline)
 }

 pub fn detach_timeline(
@@ -351,6 +434,9 @@ pub fn detach_timeline(
                .detach_timeline(timeline_id)
                .context("Failed to detach inmem tenant timeline")?;
            tenant.local_timelines.remove(&timeline_id);
+            tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach(
+                ZTenantTimelineId::new(tenant_id, timeline_id),
+            ));
        }
        None => bail!("Tenant {tenant_id} not found in local tenant state"),
    }
@@ -379,6 +465,12 @@ fn load_local_timeline(
        repartition_distance,
    ));
    page_tline.init_logical_size()?;
+
+    tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach(
+        ZTenantTimelineId::new(repo.tenant_id(), timeline_id),
+        Arc::clone(&page_tline),
+    ));
+
    Ok(page_tline)
 }

--- a/pageserver/src/thread_mgr.rs
+++ b/pageserver/src/thread_mgr.rs
@@ -91,8 +91,8 @@ pub enum ThreadKind {
    // associated with one later, after receiving a command from the client.
    PageRequestHandler,

-    // Thread that connects to a safekeeper to fetch WAL for one timeline.
-    WalReceiver,
+    // Main walreceiver manager thread that ensures that every timeline spawns a connection to safekeeper, to fetch WAL.
+    WalReceiverManager,

    // Thread that handles compaction of all timelines for a tenant.
    Compactor,
--- a/pageserver/src/walreceiver.rs
+++ b/pageserver/src/walreceiver.rs
--- a/pageserver/src/walreceiver/connection_handler.rs
+++ b/pageserver/src/walreceiver/connection_handler.rs
@@ -0,0 +1,405 @@
+//! Actual Postgres connection handler to stream WAL to the server.
+//! Runs as a separate, cancellable Tokio task.
+use std::{
+    str::FromStr,
+    sync::Arc,
+    time::{Duration, SystemTime},
+};
+
+use anyhow::{bail, ensure, Context};
+use bytes::BytesMut;
+use fail::fail_point;
+use postgres::{SimpleQueryMessage, SimpleQueryRow};
+use postgres_ffi::waldecoder::WalStreamDecoder;
+use postgres_protocol::message::backend::ReplicationMessage;
+use postgres_types::PgLsn;
+use tokio::{pin, select, sync::watch, time};
+use tokio_postgres::{replication::ReplicationStream, Client};
+use tokio_stream::StreamExt;
+use tracing::{debug, error, info, info_span, trace, warn, Instrument};
+use utils::{
+    lsn::Lsn,
+    pq_proto::ZenithFeedback,
+    zid::{NodeId, ZTenantTimelineId},
+};
+
+use crate::{
+    http::models::WalReceiverEntry,
+    repository::{Repository, Timeline},
+    tenant_mgr,
+    walingest::WalIngest,
+};
+
+#[derive(Debug, Clone)]
+pub enum WalConnectionEvent {
+    Started,
+    NewWal(ZenithFeedback),
+    End(Result<(), String>),
+}
+
+/// A wrapper around standalone Tokio task, to poll its updates or cancel the task.
+#[derive(Debug)]
+pub struct WalReceiverConnection {
+    handle: tokio::task::JoinHandle<()>,
+    cancellation: watch::Sender<()>,
+    events_receiver: watch::Receiver<WalConnectionEvent>,
+}
+
+impl WalReceiverConnection {
+    /// Initializes the connection task, returning a set of handles on top of it.
+    /// The task is started immediately after the creation, fails if no connection is established during the timeout given.
+    pub fn open(
+        id: ZTenantTimelineId,
+        safekeeper_id: NodeId,
+        wal_producer_connstr: String,
+        connect_timeout: Duration,
+    ) -> Self {
+        let (cancellation, mut cancellation_receiver) = watch::channel(());
+        let (events_sender, events_receiver) = watch::channel(WalConnectionEvent::Started);
+
+        let handle = tokio::spawn(
+            async move {
+                let connection_result = handle_walreceiver_connection(
+                    id,
+                    &wal_producer_connstr,
+                    &events_sender,
+                    &mut cancellation_receiver,
+                    connect_timeout,
+                )
+                .await
+                .map_err(|e| {
+                    format!("Walreceiver connection for id {id} failed with error: {e:#}")
+                });
+
+                match &connection_result {
+                    Ok(()) => {
+                        debug!("Walreceiver connection for id {id} ended successfully")
+                    }
+                    Err(e) => warn!("{e}"),
+                }
+                events_sender
+                    .send(WalConnectionEvent::End(connection_result))
+                    .ok();
+            }
+            .instrument(info_span!("safekeeper_handle", sk = %safekeeper_id)),
+        );
+
+        Self {
+            handle,
+            cancellation,
+            events_receiver,
+        }
+    }
+
+    /// Polls for the next WAL receiver event, if there's any available since the last check.
+    /// Blocks if there's no new event available, returns `None` if no new events will ever occur.
+    /// Only the last event is returned, all events received between observatins are lost.
+    pub async fn next_event(&mut self) -> Option<WalConnectionEvent> {
+        match self.events_receiver.changed().await {
+            Ok(()) => Some(self.events_receiver.borrow().clone()),
+            Err(_cancellation_error) => None,
+        }
+    }
+
+    /// Gracefully aborts current WAL streaming task, waiting for the current WAL streamed.
+    pub async fn shutdown(&mut self) -> anyhow::Result<()> {
+        self.cancellation.send(()).ok();
+        let handle = &mut self.handle;
+        handle
+            .await
+            .context("Failed to join on a walreceiver connection task")?;
+        Ok(())
+    }
+}
+
+async fn handle_walreceiver_connection(
+    id: ZTenantTimelineId,
+    wal_producer_connstr: &str,
+    events_sender: &watch::Sender<WalConnectionEvent>,
+    cancellation: &mut watch::Receiver<()>,
+    connect_timeout: Duration,
+) -> anyhow::Result<()> {
+    // Connect to the database in replication mode.
+    info!("connecting to {wal_producer_connstr}");
+    let connect_cfg =
+        format!("{wal_producer_connstr} application_name=pageserver replication=true");
+
+    let (mut replication_client, connection) = time::timeout(
+        connect_timeout,
+        tokio_postgres::connect(&connect_cfg, postgres::NoTls),
+    )
+    .await
+    .context("Timed out while waiting for walreceiver connection to open")?
+    .context("Failed to open walreceiver conection")?;
+    // The connection object performs the actual communication with the database,
+    // so spawn it off to run on its own.
+    let mut connection_cancellation = cancellation.clone();
+    tokio::spawn(
+        async move {
+            info!("connected!");
+            select! {
+                    connection_result = connection => match connection_result{
+                            Ok(()) => info!("Walreceiver db connection closed"),
+                            Err(connection_error) => {
+                                if connection_error.is_closed() {
+                                    info!("Connection closed regularly: {connection_error}")
+                                } else {
+                                    warn!("Connection aborted: {connection_error}")
+                                }
+                            }
+                        },
+
+                    _ = connection_cancellation.changed() => info!("Connection cancelled"),
+            }
+        }
+        .instrument(info_span!("safekeeper_handle_db")),
+    );
+
+    // Immediately increment the gauge, then create a job to decrement it on task exit.
+    // One of the pros of `defer!` is that this will *most probably*
+    // get called, even in presence of panics.
+    let gauge = crate::LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]);
+    gauge.inc();
+    scopeguard::defer! {
+        gauge.dec();
+    }
+
+    let identify = identify_system(&mut replication_client).await?;
+    info!("{identify:?}");
+    let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
+    let mut caught_up = false;
+    let ZTenantTimelineId {
+        tenant_id,
+        timeline_id,
+    } = id;
+
+    let (repo, timeline) = tokio::task::spawn_blocking(move || {
+        let repo = tenant_mgr::get_repository_for_tenant(tenant_id)
+            .with_context(|| format!("no repository found for tenant {tenant_id}"))?;
+        let timeline = tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id)
+            .with_context(|| {
+                format!("local timeline {timeline_id} not found for tenant {tenant_id}")
+            })?;
+        Ok::<_, anyhow::Error>((repo, timeline))
+    })
+    .await
+    .with_context(|| format!("Failed to spawn blocking task to get repository and timeline for tenant {tenant_id} timeline {timeline_id}"))??;
+
+    //
+    // Start streaming the WAL, from where we left off previously.
+    //
+    // If we had previously received WAL up to some point in the middle of a WAL record, we
+    // better start from the end of last full WAL record, not in the middle of one.
+    let mut last_rec_lsn = timeline.get_last_record_lsn();
+    let mut startpoint = last_rec_lsn;
+
+    if startpoint == Lsn(0) {
+        bail!("No previous WAL position");
+    }
+
+    // There might be some padding after the last full record, skip it.
+    startpoint += startpoint.calc_padding(8u32);
+
+    info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, server is at {end_of_wal}...");
+
+    let query = format!("START_REPLICATION PHYSICAL {startpoint}");
+
+    let copy_stream = replication_client.copy_both_simple(&query).await?;
+    let physical_stream = ReplicationStream::new(copy_stream);
+    pin!(physical_stream);
+
+    let mut waldecoder = WalStreamDecoder::new(startpoint);
+
+    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint)?;
+
+    while let Some(replication_message) = {
+        select! {
+            // check for shutdown first
+            biased;
+            _ = cancellation.changed() => {
+                info!("walreceiver interrupted");
+                None
+            }
+            replication_message = physical_stream.next() => replication_message,
+        }
+    } {
+        let replication_message = replication_message?;
+        let status_update = match replication_message {
+            ReplicationMessage::XLogData(xlog_data) => {
+                // Pass the WAL data to the decoder, and see if we can decode
+                // more records as a result.
+                let data = xlog_data.data();
+                let startlsn = Lsn::from(xlog_data.wal_start());
+                let endlsn = startlsn + data.len() as u64;
+
+                trace!("received XLogData between {startlsn} and {endlsn}");
+
+                waldecoder.feed_bytes(data);
+
+                while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
+                    let _enter = info_span!("processing record", lsn = %lsn).entered();
+
+                    // It is important to deal with the aligned records as lsn in getPage@LSN is
+                    // aligned and can be several bytes bigger. Without this alignment we are
+                    // at risk of hitting a deadlock.
+                    ensure!(lsn.is_aligned());
+
+                    walingest.ingest_record(&timeline, recdata, lsn)?;
+
+                    fail_point!("walreceiver-after-ingest");
+
+                    last_rec_lsn = lsn;
+                }
+
+                if !caught_up && endlsn >= end_of_wal {
+                    info!("caught up at LSN {endlsn}");
+                    caught_up = true;
+                }
+
+                let timeline_to_check = Arc::clone(&timeline.tline);
+                tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance())
+                    .await
+                    .with_context(|| {
+                        format!("Spawned checkpoint check task panicked for timeline {id}")
+                    })?
+                    .with_context(|| {
+                        format!("Failed to check checkpoint distance for timeline {id}")
+                    })?;
+
+                Some(endlsn)
+            }
+
+            ReplicationMessage::PrimaryKeepAlive(keepalive) => {
+                let wal_end = keepalive.wal_end();
+                let timestamp = keepalive.timestamp();
+                let reply_requested = keepalive.reply() != 0;
+
+                trace!("received PrimaryKeepAlive(wal_end: {wal_end}, timestamp: {timestamp:?} reply: {reply_requested})");
+
+                if reply_requested {
+                    Some(last_rec_lsn)
+                } else {
+                    None
+                }
+            }
+
+            _ => None,
+        };
+
+        if let Some(last_lsn) = status_update {
+            let remote_index = repo.get_remote_index();
+            let timeline_remote_consistent_lsn = remote_index
+                .read()
+                .await
+                // here we either do not have this timeline in remote index
+                // or there were no checkpoints for it yet
+                .timeline_entry(&ZTenantTimelineId {
+                    tenant_id,
+                    timeline_id,
+                })
+                .map(|remote_timeline| remote_timeline.metadata.disk_consistent_lsn())
+                // no checkpoint was uploaded
+                .unwrap_or(Lsn(0));
+
+            // The last LSN we processed. It is not guaranteed to survive pageserver crash.
+            let write_lsn = u64::from(last_lsn);
+            // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data
+            let flush_lsn = u64::from(timeline.tline.get_disk_consistent_lsn());
+            // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash
+            // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`.
+            let apply_lsn = u64::from(timeline_remote_consistent_lsn);
+            let ts = SystemTime::now();
+
+            // Update the current WAL receiver's data stored inside the global hash table `WAL_RECEIVERS`
+            {
+                super::WAL_RECEIVER_ENTRIES.write().await.insert(
+                    id,
+                    WalReceiverEntry {
+                        wal_producer_connstr: Some(wal_producer_connstr.to_owned()),
+                        last_received_msg_lsn: Some(last_lsn),
+                        last_received_msg_ts: Some(
+                            ts.duration_since(SystemTime::UNIX_EPOCH)
+                                .expect("Received message time should be before UNIX EPOCH!")
+                                .as_micros(),
+                        ),
+                    },
+                );
+            }
+
+            // Send zenith feedback message.
+            // Regular standby_status_update fields are put into this message.
+            let zenith_status_update = ZenithFeedback {
+                current_timeline_size: timeline.get_current_logical_size() as u64,
+                ps_writelsn: write_lsn,
+                ps_flushlsn: flush_lsn,
+                ps_applylsn: apply_lsn,
+                ps_replytime: ts,
+            };
+
+            debug!("zenith_status_update {zenith_status_update:?}");
+
+            let mut data = BytesMut::new();
+            zenith_status_update.serialize(&mut data)?;
+            physical_stream
+                .as_mut()
+                .zenith_status_update(data.len() as u64, &data)
+                .await?;
+            if let Err(e) = events_sender.send(WalConnectionEvent::NewWal(zenith_status_update)) {
+                warn!("Wal connection event listener dropped, aborting the connection: {e}");
+                return Ok(());
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Data returned from the postgres `IDENTIFY_SYSTEM` command
+///
+/// See the [postgres docs] for more details.
+///
+/// [postgres docs]: https://www.postgresql.org/docs/current/protocol-replication.html
+#[derive(Debug)]
+// As of nightly 2021-09-11, fields that are only read by the type's `Debug` impl still count as
+// unused. Relevant issue: https://github.com/rust-lang/rust/issues/88900
+#[allow(dead_code)]
+struct IdentifySystem {
+    systemid: u64,
+    timeline: u32,
+    xlogpos: PgLsn,
+    dbname: Option<String>,
+}
+
+/// There was a problem parsing the response to
+/// a postgres IDENTIFY_SYSTEM command.
+#[derive(Debug, thiserror::Error)]
+#[error("IDENTIFY_SYSTEM parse error")]
+struct IdentifyError;
+
+/// Run the postgres `IDENTIFY_SYSTEM` command
+async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem> {
+    let query_str = "IDENTIFY_SYSTEM";
+    let response = client.simple_query(query_str).await?;
+
+    // get(N) from row, then parse it as some destination type.
+    fn get_parse<T>(row: &SimpleQueryRow, idx: usize) -> Result<T, IdentifyError>
+    where
+        T: FromStr,
+    {
+        let val = row.get(idx).ok_or(IdentifyError)?;
+        val.parse::<T>().or(Err(IdentifyError))
+    }
+
+    // extract the row contents into an IdentifySystem struct.
+    // written as a closure so I can use ? for Option here.
+    if let Some(SimpleQueryMessage::Row(first_row)) = response.get(0) {
+        Ok(IdentifySystem {
+            systemid: get_parse(first_row, 0)?,
+            timeline: get_parse(first_row, 1)?,
+            xlogpos: get_parse(first_row, 2)?,
+            dbname: get_parse(first_row, 3).ok(),
+        })
+    } else {
+        Err(IdentifyError.into())
+    }
+}