From 2d8f6d7906f622c5a4f911609a52c2a00738d1e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?=
Date: Thu, 3 Oct 2024 03:33:09 +0200
Subject: [PATCH] Suppress wal lag timeout warnings right after tenant
 attachment (#9232)

As seen in https://github.com/neondatabase/cloud/issues/17335, during
releases we can have ingest lags that are above the limits for
warnings. However, such lags are part of normal pageserver startup.

Therefore, calculate a cooldown deadline until which we accept lags up
to a certain size. The heuristic is chosen so that the tolerated lag
grows the longer it takes to fully load the tenant, and we add 60
seconds as a grace period on top of that (see the worked example after
the patch).
---
 pageserver/src/tenant.rs          | 10 ++++++++++
 pageserver/src/tenant/timeline.rs |  8 +++++++-
 pageserver/src/walingest.rs       | 33 +++++++++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index db88303f7b..29f682c62a 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -97,6 +97,7 @@ use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
 use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::storage_layer::DeltaLayer;
 use crate::tenant::storage_layer::ImageLayer;
+use crate::walingest::WalLagCooldown;
 use crate::walredo;
 use crate::InitializationOrder;
 use std::collections::hash_map::Entry;
@@ -319,6 +320,9 @@ pub struct Tenant {
     /// background warmup.
     pub(crate) activate_now_sem: tokio::sync::Semaphore,
 
+    /// Cooldown for WAL lag warnings, derived from attach duration. Unset until attach completes.
+    attach_wal_lag_cooldown: Arc<std::sync::OnceLock<WalLagCooldown>>,
+
     // Cancellation token fires when we have entered shutdown(). This is a parent of
     // Timelines' cancellation token.
     pub(crate) cancel: CancellationToken,
@@ -1000,11 +1004,15 @@ impl Tenant {
 
             // Remote preload is complete.
             drop(remote_load_completion);
 
+            // We will time the duration of the attach phase unless this is a creation (attach will do no work)
+            let attach_start = std::time::Instant::now();
             let attached = {
                 let _attach_timer = Some(TENANT.attach.start_timer());
                 tenant_clone.attach(preload, &ctx).await
             };
+            let attach_duration = attach_start.elapsed();
+            _ = tenant_clone.attach_wal_lag_cooldown.set(WalLagCooldown::new(attach_start, attach_duration));
 
             match attached {
                 Ok(()) => {
@@ -2754,6 +2762,7 @@ impl Tenant {
             pg_version,
             state,
             last_aux_file_policy,
+            self.attach_wal_lag_cooldown.clone(),
             self.cancel.child_token(),
         );
 
@@ -2860,6 +2869,7 @@ impl Tenant {
                 Some(Duration::from_secs(3600 * 24)),
             )),
             activate_now_sem: tokio::sync::Semaphore::new(0),
+            attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()),
             cancel: CancellationToken::default(),
             gate: Gate::default(),
             timeline_get_throttle: Arc::new(throttle::Throttle::new(
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 1438627080..1d79b2b74b 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -48,7 +48,6 @@ use utils::{
     sync::gate::{Gate, GateGuard},
 };
 
-use std::pin::pin;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
@@ -62,6 +61,7 @@ use std::{
     collections::btree_map::Entry,
     ops::{Deref, Range},
 };
+use std::{pin::pin, sync::OnceLock};
 
 use crate::{
     aux_file::AuxFileSizeEstimator,
@@ -71,6 +71,7 @@ use crate::{
         metadata::TimelineMetadata,
         storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
     },
+    walingest::WalLagCooldown,
     walredo,
 };
 use crate::{
@@ -429,6 +430,8 @@ pub struct Timeline {
     pub(crate) l0_flush_global_state: L0FlushGlobalState,
 
     pub(crate) handles: handle::PerTimelineState,
+
+    pub(crate) attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
 }
 
 pub struct WalReceiverInfo {
@@ -2131,6 +2134,7 @@ impl Timeline {
         pg_version: u32,
         state: TimelineState,
         aux_file_policy: Option<AuxFilePolicy>,
+        attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
         cancel: CancellationToken,
     ) -> Arc<Self> {
         let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2272,6 +2276,8 @@ impl Timeline {
             l0_flush_global_state: resources.l0_flush_global_state,
 
             handles: Default::default(),
+
+            attach_wal_lag_cooldown,
         };
 
         if aux_file_policy == Some(AuxFilePolicy::V1) {
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 229c01a681..95d1f76920 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -21,7 +21,10 @@
 //! redo Postgres process, but some records it can handle directly with
 //! bespoken Rust code.
 
+use std::sync::Arc;
+use std::sync::OnceLock;
 use std::time::Duration;
+use std::time::Instant;
 use std::time::SystemTime;
 
 use pageserver_api::shard::ShardIdentity;
@@ -69,7 +72,29 @@ impl CheckPoint {
     }
 }
 
+/// Temporary limitation of WAL lag warnings after attach
+///
+/// After tenant attach, we want to limit WAL lag warnings because
+/// we don't look at the WAL until the attach is complete, which
+/// might take a while.
+pub struct WalLagCooldown {
+    /// Until when should this limitation apply at all
+    active_until: std::time::Instant,
+    /// The maximum lag to suppress. Lags above this limit get reported anyway.
+    max_lag: Duration,
+}
+
+impl WalLagCooldown {
+    pub fn new(attach_start: Instant, attach_duration: Duration) -> Self {
+        Self {
+            active_until: attach_start + attach_duration * 3 + Duration::from_secs(120),
+            max_lag: attach_duration * 2 + Duration::from_secs(60),
+        }
+    }
+}
+
 pub struct WalIngest {
+    attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
     shard: ShardIdentity,
     checkpoint: CheckPoint,
     checkpoint_modified: bool,
@@ -103,6 +128,7 @@ impl WalIngest {
             shard: *timeline.get_shard_identity(),
             checkpoint,
             checkpoint_modified: false,
+            attach_wal_lag_cooldown: timeline.attach_wal_lag_cooldown.clone(),
             warn_ingest_lag: WarnIngestLag {
                 lag_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
                 future_lsn_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
@@ -1429,6 +1455,13 @@ impl WalIngest {
             Ok(lag) => {
                 if lag > conf.wait_lsn_timeout {
                     rate_limits.lag_msg_ratelimit.call2(|rate_limit_stats| {
+                        if let Some(cooldown) = self.attach_wal_lag_cooldown.get() {
+                            if std::time::Instant::now() < cooldown.active_until && lag <= cooldown.max_lag {
+                                return;
+                            }
+                        } else {
+                            // Still loading? We shouldn't be here
+                        }
                         let lag = humantime::format_duration(lag);
                         warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout");
                     })
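
For illustration, here is how the heuristic in WalLagCooldown::new works
out with concrete numbers. The sketch below is not part of the patch: it
copies the struct and constructor, and adds a hypothetical suppress()
helper that restates the check from the last hunk so the arithmetic can
be exercised standalone.

use std::time::{Duration, Instant};

/// Standalone copy of the patch's cooldown heuristic, for illustration.
struct WalLagCooldown {
    /// Warnings are only suppressed before this deadline.
    active_until: Instant,
    /// Lags above this bound are reported even within the window.
    max_lag: Duration,
}

impl WalLagCooldown {
    fn new(attach_start: Instant, attach_duration: Duration) -> Self {
        Self {
            active_until: attach_start + attach_duration * 3 + Duration::from_secs(120),
            max_lag: attach_duration * 2 + Duration::from_secs(60),
        }
    }

    /// Hypothetical helper: the suppression check from the last hunk.
    fn suppress(&self, lag: Duration) -> bool {
        Instant::now() < self.active_until && lag <= self.max_lag
    }
}

fn main() {
    // Suppose the attach phase started 90 seconds ago and just finished.
    let attach_duration = Duration::from_secs(90);
    let attach_start = Instant::now() - attach_duration;
    let cooldown = WalLagCooldown::new(attach_start, attach_duration);

    // Lags up to 2 * 90s + 60s = 240s are suppressed for now...
    assert!(cooldown.suppress(Duration::from_secs(240)));
    // ...but larger lags are reported even within the cooldown window.
    assert!(!cooldown.suppress(Duration::from_secs(241)));
    // The window itself closes 3 * 90s + 120s = 390s after attach start,
    // i.e. 300s from "now" in this example.
}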
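
A note on the plumbing, as visible in the hunks above: the cooldown is
stored in an Arc<std::sync::OnceLock<WalLagCooldown>> that the Tenant
hands to each Timeline at construction time, and from there it is cloned
into each WalIngest. The OnceLock is only filled in once the attach
phase finishes, so a WalIngest that checks before that point sees
get() == None and falls through to the unconditional warning, which is
what the "Still loading? We shouldn't be here" branch acknowledges.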