From 8c12ccf7291b435bd022bae39b3ea1cd5cced670 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 5 Mar 2025 12:20:18 +0000
Subject: [PATCH] pageserver: gate previous heatmap behind config flag (#11088)

## Problem

On unarchival, we update the previous heatmap with all visible layers.
When the primary generates a new heatmap it includes all those layers,
so the secondary will download them. Since they're not actually resident
on the primary (we didn't call the warm up API), they'll never be
evicted, so they remain in the heatmap.

This leads to oversized secondary locations like we saw in pre-prod.

## Summary of changes

Gate the loading of the previous heatmaps and the heatmap generation on
unarchival behind configuration
flags. They are disabled by default, but enabled in tests.
---
 libs/pageserver_api/src/config.rs     |  6 ++++++
 pageserver/src/config.rs              | 13 +++++++++++++
 pageserver/src/tenant.rs              |  6 +++++-
 test_runner/fixtures/neon_fixtures.py |  7 +++++--
 4 files changed, 29 insertions(+), 3 deletions(-)
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 039cc1319e..f387ff0579 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -123,6 +123,10 @@ pub struct ConfigToml {
     pub enable_read_path_debugging: Option<bool>,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub validate_wal_contiguity: Option<bool>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub load_previous_heatmap: Option<bool>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub generate_unarchival_heatmap: Option<bool>,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -523,6 +527,8 @@ impl Default for ConfigToml {
                 None
             },
             validate_wal_contiguity: None,
+            load_previous_heatmap: None,
+            generate_unarchival_heatmap: None,
         }
     }
 }
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 64d00882b9..33ae8c4790 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -194,6 +194,13 @@ pub struct PageServerConf {
     /// Interpreted protocol feature: if enabled, validate that the logical WAL received from
     /// safekeepers does not have gaps.
     pub validate_wal_contiguity: bool,
+
+    /// When set, the previously written to disk heatmap is loaded on tenant attach and used
+    /// to avoid clobbering the heatmap from new, cold, attached locations.
+    pub load_previous_heatmap: bool,
+
+    /// When set, include visible layers in the next uploaded heatmaps of an unarchived timeline.
+    pub generate_unarchival_heatmap: bool,
 }
 
 /// Token for authentication to safekeepers
@@ -358,6 +365,8 @@ impl PageServerConf {
             get_vectored_concurrent_io,
             enable_read_path_debugging,
             validate_wal_contiguity,
+            load_previous_heatmap,
+            generate_unarchival_heatmap,
         } = config_toml;
 
         let mut conf = PageServerConf {
@@ -447,6 +456,8 @@ impl PageServerConf {
             no_sync: no_sync.unwrap_or(false),
             enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false),
             validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false),
+            load_previous_heatmap: load_previous_heatmap.unwrap_or(false),
+            generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(false),
         };
 
         // ------------------------------------------------------------
@@ -493,6 +504,8 @@ impl PageServerConf {
             metric_collection_interval: Duration::from_secs(60),
             synthetic_size_calculation_interval: Duration::from_secs(60),
             background_task_maximum_delay: Duration::ZERO,
+            load_previous_heatmap: Some(true),
+            generate_unarchival_heatmap: Some(true),
             ..Default::default()
         };
         PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index fee007b2d7..3694381078 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1150,7 +1150,7 @@ impl Tenant {
         // a previous heatmap which contains all visible layers in the layer map.
         // This previous heatmap will be used whenever a fresh heatmap is generated
         // for the timeline.
-        if matches!(cause, LoadTimelineCause::Unoffload) {
+        if self.conf.generate_unarchival_heatmap && matches!(cause, LoadTimelineCause::Unoffload) {
             let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn()));
             while let Some((tline, end_lsn)) = tline_ending_at {
                 let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await;
@@ -1582,6 +1582,10 @@ impl Tenant {
     }
 
     async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> {
+        if !self.conf.load_previous_heatmap {
+            return None;
+        }
+
         let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id);
         match tokio::fs::read_to_string(on_disk_heatmap_path).await {
             Ok(heatmap) => match serde_json::from_str::<HeatMapTenant>(&heatmap) {
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 3aa018e99e..6171da52a0 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1169,6 +1169,8 @@ class NeonEnv:
                 # Disable pageserver disk syncs in tests: when running tests concurrently, this avoids
                 # the pageserver taking a long time to start up due to syncfs flushing other tests' data
                 "no_sync": True,
+                # Look for gaps in WAL received from safekeepeers
+                "validate_wal_contiguity": True,
             }
 
             # Batching (https://github.com/neondatabase/neon/issues/9377):
@@ -1181,11 +1183,12 @@ class NeonEnv:
 
             if config.test_may_use_compatibility_snapshot_binaries:
                 log.info(
-                    "Skipping WAL contiguity validation to avoid forward-compatibility related test failures"
+                    "Skipping prev heatmap settings to avoid forward-compatibility related test failures"
                 )
             else:
                 # Look for gaps in WAL received from safekeepeers
-                ps_cfg["validate_wal_contiguity"] = True
+                ps_cfg["load_previous_heatmap"] = True
+                ps_cfg["generate_unarchival_heatmap"] = True
 
             get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io
             if get_vectored_concurrent_io is not None: