Merge remote-tracking branch 'origin/main' into problame/standby-horizon-removal-poc-rip-out

2026-05-25 09:00:37 +00:00 · 2025-07-09 11:10:53 +00:00
parent 6e5a83d34d 5ea0bb2d4f
commit ba4778831b
193 changed files with 8745 additions and 3023 deletions
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -18,9 +18,10 @@ use metrics::set_build_info_metric;
 use remote_storage::RemoteStorageConfig;
 use safekeeper::defaults::{
    DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT,
-    DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY,
-    DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_SSL_CERT_FILE,
-    DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE,
+    DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
+    DEFAULT_MAX_REELECT_OFFLOADER_LAG_BYTES, DEFAULT_MAX_TIMELINE_DISK_USAGE_BYTES,
+    DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
+    DEFAULT_SSL_CERT_FILE, DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE,
 };
 use safekeeper::wal_backup::WalBackup;
 use safekeeper::{
@@ -138,6 +139,15 @@ struct Args {
    /// Safekeeper won't be elected for WAL offloading if it is lagging for more than this value in bytes
    #[arg(long, default_value_t = DEFAULT_MAX_OFFLOADER_LAG_BYTES)]
    max_offloader_lag: u64,
+    /* BEGIN_HADRON */
+    /// Safekeeper will re-elect a new offloader if the current backup lagging for more than this value in bytes
+    #[arg(long, default_value_t = DEFAULT_MAX_REELECT_OFFLOADER_LAG_BYTES)]
+    max_reelect_offloader_lag_bytes: u64,
+    /// Safekeeper will stop accepting new WALs if the timeline disk usage exceeds this value in bytes.
+    /// Setting this value to 0 disables the limit.
+    #[arg(long, default_value_t = DEFAULT_MAX_TIMELINE_DISK_USAGE_BYTES)]
+    max_timeline_disk_usage_bytes: u64,
+    /* END_HADRON */
    /// Number of max parallel WAL segments to be offloaded to remote storage.
    #[arg(long, default_value = "5")]
    wal_backup_parallel_jobs: usize,
@@ -391,6 +401,10 @@ async fn main() -> anyhow::Result<()> {
        peer_recovery_enabled: args.peer_recovery,
        remote_storage: args.remote_storage,
        max_offloader_lag_bytes: args.max_offloader_lag,
+        /* BEGIN_HADRON */
+        max_reelect_offloader_lag_bytes: args.max_reelect_offloader_lag_bytes,
+        max_timeline_disk_usage_bytes: args.max_timeline_disk_usage_bytes,
+        /* END_HADRON */
        wal_backup_enabled: !args.disable_wal_backup,
        backup_parallel_jobs: args.wal_backup_parallel_jobs,
        pg_auth,
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -17,6 +17,7 @@ use utils::crashsafe::durable_rename;

 use crate::control_file_upgrade::{downgrade_v10_to_v9, upgrade_control_file};
 use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
+use crate::metrics::WAL_DISK_IO_ERRORS;
 use crate::state::{EvictionState, TimelinePersistentState};

 pub const SK_MAGIC: u32 = 0xcafeceefu32;
@@ -192,11 +193,14 @@ impl TimelinePersistentState {
 impl Storage for FileStorage {
    /// Persists state durably to the underlying storage.
    async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> {
+        // start timer for metrics
        let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer();
-
        // write data to safekeeper.control.partial
        let control_partial_path = self.timeline_dir.join(CONTROL_FILE_NAME_PARTIAL);
        let mut control_partial = File::create(&control_partial_path).await.with_context(|| {
+            /* BEGIN_HADRON */
+            WAL_DISK_IO_ERRORS.inc();
+            /*END_HADRON */
            format!(
                "failed to create partial control file at: {}",
                &control_partial_path
@@ -206,14 +210,24 @@ impl Storage for FileStorage {
        let buf: Vec<u8> = s.write_to_buf()?;

        control_partial.write_all(&buf).await.with_context(|| {
+            /* BEGIN_HADRON */
+            WAL_DISK_IO_ERRORS.inc();
+            /*END_HADRON */
            format!("failed to write safekeeper state into control file at: {control_partial_path}")
        })?;
        control_partial.flush().await.with_context(|| {
+            /* BEGIN_HADRON */
+            WAL_DISK_IO_ERRORS.inc();
+            /*END_HADRON */
            format!("failed to flush safekeeper state into control file at: {control_partial_path}")
        })?;

        let control_path = self.timeline_dir.join(CONTROL_FILE_NAME);
-        durable_rename(&control_partial_path, &control_path, !self.no_sync).await?;
+        durable_rename(&control_partial_path, &control_path, !self.no_sync)
+            .await
+            /* BEGIN_HADRON */
+            .inspect_err(|_| WAL_DISK_IO_ERRORS.inc())?;
+        /* END_HADRON */

        // update internal state
        self.state = s.clone();
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -61,6 +61,13 @@ pub mod defaults {

    pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms";
    pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20);
+    /* BEGIN_HADRON */
+    // Default leader re-elect is 0(disabled). SK will re-elect leader if the current leader is lagging this many bytes.
+    pub const DEFAULT_MAX_REELECT_OFFLOADER_LAG_BYTES: u64 = 0;
+    // Default disk usage limit is 0 (disabled). It means each timeline by default can use up to this many WAL
+    // disk space on this SK until SK begins to reject WALs.
+    pub const DEFAULT_MAX_TIMELINE_DISK_USAGE_BYTES: u64 = 0;
+    /* END_HADRON */
    pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m";
    pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s";
    pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5";
@@ -99,6 +106,10 @@ pub struct SafeKeeperConf {
    pub peer_recovery_enabled: bool,
    pub remote_storage: Option<RemoteStorageConfig>,
    pub max_offloader_lag_bytes: u64,
+    /* BEGIN_HADRON */
+    pub max_reelect_offloader_lag_bytes: u64,
+    pub max_timeline_disk_usage_bytes: u64,
+    /* END_HADRON */
    pub backup_parallel_jobs: usize,
    pub wal_backup_enabled: bool,
    pub pg_auth: Option<Arc<JwtAuth>>,
@@ -151,6 +162,10 @@ impl SafeKeeperConf {
            sk_auth_token: None,
            heartbeat_timeout: Duration::new(5, 0),
            max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
+            /* BEGIN_HADRON */
+            max_reelect_offloader_lag_bytes: defaults::DEFAULT_MAX_REELECT_OFFLOADER_LAG_BYTES,
+            max_timeline_disk_usage_bytes: defaults::DEFAULT_MAX_TIMELINE_DISK_USAGE_BYTES,
+            /* END_HADRON */
            current_thread_runtime: false,
            walsenders_keep_horizon: false,
            partial_backup_timeout: Duration::from_secs(0),
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -58,6 +58,25 @@ pub static FLUSH_WAL_SECONDS: Lazy<Histogram> = Lazy::new(|| {
    )
    .expect("Failed to register safekeeper_flush_wal_seconds histogram")
 });
+/* BEGIN_HADRON */
+pub static WAL_DISK_IO_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "safekeeper_wal_disk_io_errors",
+        "Number of disk I/O errors when creating and flushing WALs and control files"
+    )
+    .expect("Failed to register safekeeper_wal_disk_io_errors counter")
+});
+pub static WAL_STORAGE_LIMIT_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "safekeeper_wal_storage_limit_errors",
+        concat!(
+            "Number of errors due to timeline WAL storage utilization exceeding configured limit. ",
+            "An increase in this metric indicates issues backing up or removing WALs."
+        )
+    )
+    .expect("Failed to register safekeeper_wal_storage_limit_errors counter")
+});
+/* END_HADRON */
 pub static PERSIST_CONTROL_FILE_SECONDS: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "safekeeper_persist_control_file_seconds",
@@ -138,6 +157,15 @@ pub static BACKUP_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
    )
    .expect("Failed to register safekeeper_backup_errors_total counter")
 });
+/* BEGIN_HADRON */
+pub static BACKUP_REELECT_LEADER_COUNT: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "safekeeper_backup_reelect_leader_total",
+        "Number of times the backup leader was reelected"
+    )
+    .expect("Failed to register safekeeper_backup_reelect_leader_total counter")
+});
+/* END_HADRON */
 pub static BROKER_PUSH_ALL_UPDATES_SECONDS: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "safekeeper_broker_push_update_seconds",
--- a/safekeeper/src/send_interpreted_wal.rs
+++ b/safekeeper/src/send_interpreted_wal.rs
@@ -16,7 +16,7 @@ use tokio::sync::mpsc::error::SendError;
 use tokio::task::JoinHandle;
 use tokio::time::MissedTickBehavior;
 use tracing::{Instrument, error, info, info_span};
-use utils::critical;
+use utils::critical_timeline;
 use utils::lsn::Lsn;
 use utils::postgres_client::{Compression, InterpretedFormat};
 use wal_decoder::models::{InterpretedWalRecord, InterpretedWalRecords};
@@ -268,6 +268,8 @@ impl InterpretedWalReader {

        let (shard_notification_tx, shard_notification_rx) = tokio::sync::mpsc::unbounded_channel();

+        let ttid = wal_stream.ttid;
+
        let reader = InterpretedWalReader {
            wal_stream,
            shard_senders: HashMap::from([(
@@ -300,7 +302,11 @@ impl InterpretedWalReader {
                    .inspect_err(|err| match err {
                        // TODO: we may want to differentiate these errors further.
                        InterpretedWalReaderError::Decode(_) => {
-                            critical!("failed to decode WAL record: {err:?}");
+                            critical_timeline!(
+                                ttid.tenant_id,
+                                ttid.timeline_id,
+                                "failed to read WAL record: {err:?}"
+                            );
                        }
                        err => error!("failed to read WAL record: {err}"),
                    })
@@ -363,9 +369,14 @@ impl InterpretedWalReader {
            metric.dec();
        }

+        let ttid = self.wal_stream.ttid;
        match self.run_impl(start_pos).await {
            Err(err @ InterpretedWalReaderError::Decode(_)) => {
-                critical!("failed to decode WAL record: {err:?}");
+                critical_timeline!(
+                    ttid.tenant_id,
+                    ttid.timeline_id,
+                    "failed to decode WAL record: {err:?}"
+                );
            }
            Err(err) => error!("failed to read WAL record: {err}"),
            Ok(()) => info!("interpreted wal reader exiting"),
@@ -550,6 +561,20 @@ impl InterpretedWalReader {
                        // Update internal and external state, then reset the WAL stream
                        // if required.
                        let senders = self.shard_senders.entry(shard_id).or_default();
+
+                        // Clean up any shard senders that have dropped out before adding the new
+                        // one. This avoids a build up of dead senders.
+                        senders.retain(|sender| {
+                            let closed = sender.tx.is_closed();
+
+                            if closed {
+                                let sender_id = ShardSenderId::new(shard_id, sender.sender_id);
+                                tracing::info!("Removed shard sender {}", sender_id);
+                            }
+
+                            !closed
+                        });
+
                        let new_sender_id = match senders.last() {
                            Some(sender) => sender.sender_id.next(),
                            None => SenderId::first()
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -26,7 +26,9 @@ use utils::id::{NodeId, TenantId, TenantTimelineId};
 use utils::lsn::Lsn;
 use utils::sync::gate::Gate;

-use crate::metrics::{FullTimelineInfo, MISC_OPERATION_SECONDS, WalStorageMetrics};
+use crate::metrics::{
+    FullTimelineInfo, MISC_OPERATION_SECONDS, WAL_STORAGE_LIMIT_ERRORS, WalStorageMetrics,
+};
 use crate::rate_limit::RateLimiter;
 use crate::receive_wal::WalReceivers;
 use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn};
@@ -195,7 +197,7 @@ impl StateSK {
        Ok(TimelineMembershipSwitchResponse {
            previous_conf: result.previous_conf,
            current_conf: result.current_conf,
-            term: self.state().acceptor_state.term,
+            last_log_term: self.state().acceptor_state.term,
            flush_lsn: self.flush_lsn(),
        })
    }
@@ -1047,6 +1049,39 @@ impl WalResidentTimeline {
        Ok(ss)
    }

+    // BEGIN HADRON
+    // Check if disk usage by WAL segment files for this timeline exceeds the configured limit.
+    fn hadron_check_disk_usage(
+        &self,
+        shared_state_locked: &mut WriteGuardSharedState<'_>,
+    ) -> Result<()> {
+        // The disk usage is calculated based on the number of segments between `last_removed_segno`
+        // and the current flush LSN segment number. `last_removed_segno` is advanced after
+        // unneeded WAL files are physically removed from disk (see `update_wal_removal_end()`
+        // in `timeline_manager.rs`).
+        let max_timeline_disk_usage_bytes = self.conf.max_timeline_disk_usage_bytes;
+        if max_timeline_disk_usage_bytes > 0 {
+            let last_removed_segno = self.last_removed_segno.load(Ordering::Relaxed);
+            let flush_lsn = shared_state_locked.sk.flush_lsn();
+            let wal_seg_size = shared_state_locked.sk.state().server.wal_seg_size as u64;
+            let current_segno = flush_lsn.segment_number(wal_seg_size as usize);
+
+            let segno_count = current_segno - last_removed_segno;
+            let disk_usage_bytes = segno_count * wal_seg_size;
+
+            if disk_usage_bytes > max_timeline_disk_usage_bytes {
+                WAL_STORAGE_LIMIT_ERRORS.inc();
+                bail!(
+                    "WAL storage utilization exceeds configured limit of {} bytes: current disk usage: {} bytes",
+                    max_timeline_disk_usage_bytes,
+                    disk_usage_bytes
+                );
+            }
+        }
+        Ok(())
+    }
+    // END HADRON
+
    /// Pass arrived message to the safekeeper.
    pub async fn process_msg(
        &self,
@@ -1059,6 +1094,13 @@ impl WalResidentTimeline {
        let mut rmsg: Option<AcceptorProposerMessage>;
        {
            let mut shared_state = self.write_shared_state().await;
+            // BEGIN HADRON
+            // Errors from the `hadron_check_disk_usage()` function fail the process_msg() function, which
+            // gets propagated upward and terminates the entire WalAcceptor. This will cause postgres to
+            // disconnect from the safekeeper and reestablish another connection. Postgres will keep retrying
+            // safekeeper connections every second until it can successfully propose WAL to the SK again.
+            self.hadron_check_disk_usage(&mut shared_state)?;
+            // END HADRON
            rmsg = shared_state.sk.safekeeper().process_msg(msg).await?;

            // if this is AppendResponse, fill in proper hot standby feedback.
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -26,7 +26,9 @@ use utils::id::{NodeId, TenantTimelineId};
 use utils::lsn::Lsn;
 use utils::{backoff, pausable_failpoint};

-use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS};
+use crate::metrics::{
+    BACKED_UP_SEGMENTS, BACKUP_ERRORS, BACKUP_REELECT_LEADER_COUNT, WAL_BACKUP_TASKS,
+};
 use crate::timeline::WalResidentTimeline;
 use crate::timeline_manager::{Manager, StateSnapshot};
 use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME};
@@ -70,8 +72,9 @@ pub(crate) async fn update_task(
    need_backup: bool,
    state: &StateSnapshot,
 ) {
-    let (offloader, election_dbg_str) =
-        determine_offloader(&state.peers, state.backup_lsn, mgr.tli.ttid, &mgr.conf);
+    /* BEGIN_HADRON */
+    let (offloader, election_dbg_str) = hadron_determine_offloader(mgr, state);
+    /* END_HADRON */
    let elected_me = Some(mgr.conf.my_id) == offloader;

    let should_task_run = need_backup && elected_me;
@@ -127,6 +130,70 @@ async fn shut_down_task(entry: &mut Option<WalBackupTaskHandle>) {
    }
 }

+/* BEGIN_HADRON */
+// On top of the neon determine_offloader, we also check if the current offloader is lagging behind too much.
+// If it is, we re-elect a new offloader. This mitigates the below issue. It also helps distribute the load across SKs.
+//
+// We observe that the offloader fails to upload a segment due to race conditions on XLOG SWITCH and PG start streaming WALs.
+// wal_backup task continously failing to upload a full segment while the segment remains partial on the disk.
+// The consequence is that commit_lsn for all SKs move forward but backup_lsn stays the same. Then, all SKs run out of disk space.
+// See go/sk-ood-xlog-switch for more details.
+//
+// To mitigate this issue, we will re-elect a new offloader if the current offloader is lagging behind too much.
+// Each SK makes the decision locally but they are aware of each other's commit and backup lsns.
+//
+// determine_offloader will pick a SK. say SK-1.
+// Each SK checks
+// -- if commit_lsn - back_lsn > threshold,
+// -- -- remove SK-1 from the candidate and call determine_offloader again.
+// SK-1 will step down and all SKs will elect the same leader again.
+// After the backup is caught up, the leader will become SK-1 again.
+fn hadron_determine_offloader(mgr: &Manager, state: &StateSnapshot) -> (Option<NodeId>, String) {
+    let mut offloader: Option<NodeId>;
+    let mut election_dbg_str: String;
+    let caughtup_peers_count: usize;
+    (offloader, election_dbg_str, caughtup_peers_count) =
+        determine_offloader(&state.peers, state.backup_lsn, mgr.tli.ttid, &mgr.conf);
+
+    if offloader.is_none()
+        || caughtup_peers_count <= 1
+        || mgr.conf.max_reelect_offloader_lag_bytes == 0
+    {
+        return (offloader, election_dbg_str);
+    }
+
+    let offloader_sk_id = offloader.unwrap();
+
+    let backup_lag = state.commit_lsn.checked_sub(state.backup_lsn);
+    if backup_lag.is_none() {
+        info!("Backup lag is None. Skipping re-election.");
+        return (offloader, election_dbg_str);
+    }
+
+    let backup_lag = backup_lag.unwrap().0;
+
+    if backup_lag < mgr.conf.max_reelect_offloader_lag_bytes {
+        return (offloader, election_dbg_str);
+    }
+
+    info!(
+        "Electing a new leader: Backup lag is too high backup lsn lag {} threshold {}: {}",
+        backup_lag, mgr.conf.max_reelect_offloader_lag_bytes, election_dbg_str
+    );
+    BACKUP_REELECT_LEADER_COUNT.inc();
+    // Remove the current offloader if lag is too high.
+    let new_peers: Vec<_> = state
+        .peers
+        .iter()
+        .filter(|p| p.sk_id != offloader_sk_id)
+        .cloned()
+        .collect();
+    (offloader, election_dbg_str, _) =
+        determine_offloader(&new_peers, state.backup_lsn, mgr.tli.ttid, &mgr.conf);
+    (offloader, election_dbg_str)
+}
+/* END_HADRON */
+
 /// The goal is to ensure that normally only one safekeepers offloads. However,
 /// it is fine (and inevitable, as s3 doesn't provide CAS) that for some short
 /// time we have several ones as they PUT the same files. Also,
@@ -141,13 +208,13 @@ fn determine_offloader(
    wal_backup_lsn: Lsn,
    ttid: TenantTimelineId,
    conf: &SafeKeeperConf,
-) -> (Option<NodeId>, String) {
+) -> (Option<NodeId>, String, usize) {
    // TODO: remove this once we fill newly joined safekeepers since backup_lsn.
    let capable_peers = alive_peers
        .iter()
        .filter(|p| p.local_start_lsn <= wal_backup_lsn);
    match capable_peers.clone().map(|p| p.commit_lsn).max() {
-        None => (None, "no connected peers to elect from".to_string()),
+        None => (None, "no connected peers to elect from".to_string(), 0),
        Some(max_commit_lsn) => {
            let threshold = max_commit_lsn
                .checked_sub(conf.max_offloader_lag_bytes)
@@ -175,6 +242,7 @@ fn determine_offloader(
                    capable_peers_dbg,
                    caughtup_peers.len()
                ),
+                caughtup_peers.len(),
            )
        }
    }
@@ -346,6 +414,8 @@ async fn backup_lsn_range(
        anyhow::bail!("parallel_jobs must be >= 1");
    }

+    pausable_failpoint!("backup-lsn-range-pausable");
+
    let remote_timeline_path = &timeline.remote_path;
    let start_lsn = *backup_lsn;
    let segments = get_segments(start_lsn, end_lsn, wal_seg_size);
--- a/safekeeper/src/wal_reader_stream.rs
+++ b/safekeeper/src/wal_reader_stream.rs
@@ -1,15 +1,15 @@
 use std::pin::Pin;
 use std::task::{Context, Poll};

-use bytes::Bytes;
-use futures::stream::BoxStream;
-use futures::{Stream, StreamExt};
-use safekeeper_api::Term;
-use utils::lsn::Lsn;
-
 use crate::send_wal::EndWatch;
 use crate::timeline::WalResidentTimeline;
 use crate::wal_storage::WalReader;
+use bytes::Bytes;
+use futures::stream::BoxStream;
+use futures::{Stream, StreamExt};
+use safekeeper_api::Term;
+use utils::id::TenantTimelineId;
+use utils::lsn::Lsn;

 #[derive(PartialEq, Eq, Debug)]
 pub(crate) struct WalBytes {
@@ -37,6 +37,8 @@ struct PositionedWalReader {
 pub(crate) struct StreamingWalReader {
    stream: BoxStream<'static, WalOrReset>,
    start_changed_tx: tokio::sync::watch::Sender<Lsn>,
+    // HADRON: Added TenantTimelineId for instrumentation purposes.
+    pub(crate) ttid: TenantTimelineId,
 }

 pub(crate) enum WalOrReset {
@@ -63,6 +65,7 @@ impl StreamingWalReader {
        buffer_size: usize,
    ) -> Self {
        let (start_changed_tx, start_changed_rx) = tokio::sync::watch::channel(start);
+        let ttid = tli.ttid;

        let state = WalReaderStreamState {
            tli,
@@ -107,6 +110,7 @@ impl StreamingWalReader {
        Self {
            stream,
            start_changed_tx,
+            ttid,
        }
    }

--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -31,7 +31,8 @@ use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;

 use crate::metrics::{
-    REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, WalStorageMetrics, time_io_closure,
+    REMOVED_WAL_SEGMENTS, WAL_DISK_IO_ERRORS, WAL_STORAGE_OPERATION_SECONDS, WalStorageMetrics,
+    time_io_closure,
 };
 use crate::state::TimelinePersistentState;
 use crate::wal_backup::{WalBackup, read_object, remote_timeline_path};
@@ -293,9 +294,12 @@ impl PhysicalStorage {
            // half initialized segment, first bake it under tmp filename and
            // then rename.
            let tmp_path = self.timeline_dir.join("waltmp");
-            let file = File::create(&tmp_path)
-                .await
-                .with_context(|| format!("Failed to open tmp wal file {:?}", &tmp_path))?;
+            let file: File = File::create(&tmp_path).await.with_context(|| {
+                /* BEGIN_HADRON */
+                WAL_DISK_IO_ERRORS.inc();
+                /* END_HADRON */
+                format!("Failed to open tmp wal file {:?}", &tmp_path)
+            })?;

            fail::fail_point!("sk-zero-segment", |_| {
                info!("sk-zero-segment failpoint hit");
@@ -382,7 +386,11 @@ impl PhysicalStorage {

            let flushed = self
                .write_in_segment(segno, xlogoff, &buf[..bytes_write])
-                .await?;
+                .await
+                /* BEGIN_HADRON */
+                .inspect_err(|_| WAL_DISK_IO_ERRORS.inc())?;
+            /* END_HADRON */
+
            self.write_lsn += bytes_write as u64;
            if flushed {
                self.flush_lsn = self.write_lsn;
@@ -491,7 +499,11 @@ impl Storage for PhysicalStorage {
        }

        if let Some(unflushed_file) = self.file.take() {
-            self.fdatasync_file(&unflushed_file).await?;
+            self.fdatasync_file(&unflushed_file)
+                .await
+                /* BEGIN_HADRON */
+                .inspect_err(|_| WAL_DISK_IO_ERRORS.inc())?;
+            /* END_HADRON */
            self.file = Some(unflushed_file);
        } else {
            // We have unflushed data (write_lsn != flush_lsn), but no file. This
--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -159,6 +159,10 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
        heartbeat_timeout: Duration::from_secs(0),
        remote_storage: None,
        max_offloader_lag_bytes: 0,
+        /* BEGIN_HADRON */
+        max_reelect_offloader_lag_bytes: 0,
+        max_timeline_disk_usage_bytes: 0,
+        /* END_HADRON */
        wal_backup_enabled: false,
        listen_pg_addr_tenant_only: None,
        advertise_pg_addr: None,