Suppress wal lag timeout warnings right after tenant attachment (#9232)

As seen in https://github.com/neondatabase/cloud/issues/17335, during releases we can have ingest lags that exceed the threshold for warnings. However, such lags are a normal part of pageserver startup. Therefore, after tenant attach we compute a cooldown deadline until which lags up to a certain size are accepted without a warning. The heuristic scales with the time it took to fully load the tenant: both the cooldown window and the accepted lag grow with the attach duration, and a fixed grace period is added on top (60 seconds on the accepted lag, 120 seconds on the cooldown window).
2026-01-08 14:02:55 +00:00 · 2024-10-03 03:33:09 +02:00
parent 1b176fe74a
commit 2d8f6d7906
3 changed files with 50 additions and 1 deletions
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -97,6 +97,7 @@ use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
 use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::storage_layer::DeltaLayer;
 use crate::tenant::storage_layer::ImageLayer;
+use crate::walingest::WalLagCooldown;
 use crate::walredo;
 use crate::InitializationOrder;
 use std::collections::hash_map::Entry;
@@ -319,6 +320,9 @@ pub struct Tenant {
    /// background warmup.
    pub(crate) activate_now_sem: tokio::sync::Semaphore,

+    /// WAL lag warning cooldown derived from the attach timing. Unset until the attach attempt has returned.
+    attach_wal_lag_cooldown: Arc<std::sync::OnceLock<WalLagCooldown>>,
+
    // Cancellation token fires when we have entered shutdown().  This is a parent of
    // Timelines' cancellation token.
    pub(crate) cancel: CancellationToken,
@@ -1000,11 +1004,15 @@ impl Tenant {
                // Remote preload is complete.
                drop(remote_load_completion);

+
                // We will time the duration of the attach phase unless this is a creation (attach will do no work)
+                let attach_start = std::time::Instant::now();
                let attached = {
                    let _attach_timer = Some(TENANT.attach.start_timer());
                    tenant_clone.attach(preload, &ctx).await
                };
+                let attach_duration = attach_start.elapsed();
+                _ = tenant_clone.attach_wal_lag_cooldown.set(WalLagCooldown::new(attach_start, attach_duration));

                match attached {
                    Ok(()) => {
@@ -2754,6 +2762,7 @@ impl Tenant {
            pg_version,
            state,
            last_aux_file_policy,
+            self.attach_wal_lag_cooldown.clone(),
            self.cancel.child_token(),
        );

@@ -2860,6 +2869,7 @@ impl Tenant {
                Some(Duration::from_secs(3600 * 24)),
            )),
            activate_now_sem: tokio::sync::Semaphore::new(0),
+            attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()),
            cancel: CancellationToken::default(),
            gate: Gate::default(),
            timeline_get_throttle: Arc::new(throttle::Throttle::new(
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -48,7 +48,6 @@ use utils::{
    sync::gate::{Gate, GateGuard},
 };

-use std::pin::pin;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
@@ -62,6 +61,7 @@ use std::{
    collections::btree_map::Entry,
    ops::{Deref, Range},
 };
+use std::{pin::pin, sync::OnceLock};

 use crate::{
    aux_file::AuxFileSizeEstimator,
@@ -71,6 +71,7 @@ use crate::{
        metadata::TimelineMetadata,
        storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
    },
+    walingest::WalLagCooldown,
    walredo,
 };
 use crate::{
@@ -429,6 +430,8 @@ pub struct Timeline {
    pub(crate) l0_flush_global_state: L0FlushGlobalState,

    pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
+
+    pub(crate) attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
 }

 pub struct WalReceiverInfo {
@@ -2131,6 +2134,7 @@ impl Timeline {
        pg_version: u32,
        state: TimelineState,
        aux_file_policy: Option<AuxFilePolicy>,
+        attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
        cancel: CancellationToken,
    ) -> Arc<Self> {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2272,6 +2276,8 @@ impl Timeline {
                l0_flush_global_state: resources.l0_flush_global_state,

                handles: Default::default(),
+
+                attach_wal_lag_cooldown,
            };

            if aux_file_policy == Some(AuxFilePolicy::V1) {
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -21,7 +21,10 @@
 //! redo Postgres process, but some records it can handle directly with
 //! bespoken Rust code.

+use std::sync::Arc;
+use std::sync::OnceLock;
 use std::time::Duration;
+use std::time::Instant;
 use std::time::SystemTime;

 use pageserver_api::shard::ShardIdentity;
@@ -69,7 +72,29 @@ impl CheckPoint {
    }
 }

+/// Temporary suppression of WAL lag warnings after tenant attach.
+///
+/// WAL is not looked at until the attach is complete, which might take a
+/// while. Until `active_until`, lags of at most `max_lag` are not warned
+/// about; `new` scales both with the measured attach duration.
+pub struct WalLagCooldown {
+    /// Point in time until which the suppression applies at all.
+    active_until: std::time::Instant,
+    /// The maximum lag to suppress. Lags above this limit get reported anyway.
+    max_lag: Duration,
+}
+
+impl WalLagCooldown {
+    pub fn new(attach_start: Instant, attach_duration: Duration) -> Self {
+        Self {
+            active_until: attach_start + attach_duration * 3 + Duration::from_secs(120),
+            max_lag: attach_duration * 2 + Duration::from_secs(60),
+        }
+    }
+}
+
 pub struct WalIngest {
+    attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
    shard: ShardIdentity,
    checkpoint: CheckPoint,
    checkpoint_modified: bool,
@@ -103,6 +128,7 @@ impl WalIngest {
            shard: *timeline.get_shard_identity(),
            checkpoint,
            checkpoint_modified: false,
+            attach_wal_lag_cooldown: timeline.attach_wal_lag_cooldown.clone(),
            warn_ingest_lag: WarnIngestLag {
                lag_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
                future_lsn_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
@@ -1429,6 +1455,13 @@ impl WalIngest {
                    Ok(lag) => {
                        if lag > conf.wait_lsn_timeout {
                            rate_limits.lag_msg_ratelimit.call2(|rate_limit_stats| {
+                                if let Some(cooldown) = self.attach_wal_lag_cooldown.get() {
+                                    if std::time::Instant::now() < cooldown.active_until && lag <= cooldown.max_lag {
+                                        return;
+                                    }
+                                } else {
+                                    // Cooldown not set yet (attach presumably still in progress). We shouldn't be ingesting WAL then; fall through and warn.
+                                }
                                let lag = humantime::format_duration(lag);
                                warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout");
                            })