pageserver: gate previous heatmap behind config flag (#11088)

## Problem

On unarchival, we update the previous heatmap with all visible layers. When the primary generates a new heatmap, it includes all of those layers, so the secondary downloads them. Since the layers are not actually resident on the primary (we didn't call the warm-up API), they are never evicted, so they remain in the heatmap indefinitely.

This leads to oversized secondary locations like we saw in pre-prod.

## Summary of changes

Gate the loading of the previous heatmap and the generation of the unarchival heatmap behind configuration flags. Both are disabled by default, but enabled in tests.
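
For reference, enabling the new behavior would look something like this in the pageserver TOML config (a sketch: the key names match the new `ConfigToml` fields, and both flags default to off when omitted):

```toml
# Both flags are read from the top level of the pageserver config.
# Omitting them leaves the features disabled.
load_previous_heatmap = true
generate_unarchival_heatmap = true
```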
Author: Vlad Lazar
Date: 2025-03-05 12:20:18 +00:00 (committed by GitHub)
Commit: 8c12ccf729 (parent: abae7637d6)
4 changed files with 29 additions and 3 deletions


```diff
@@ -123,6 +123,10 @@ pub struct ConfigToml {
     pub enable_read_path_debugging: Option<bool>,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub validate_wal_contiguity: Option<bool>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub load_previous_heatmap: Option<bool>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub generate_unarchival_heatmap: Option<bool>,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -523,6 +527,8 @@ impl Default for ConfigToml {
                 None
             },
             validate_wal_contiguity: None,
+            load_previous_heatmap: None,
+            generate_unarchival_heatmap: None,
         }
     }
 }
```
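
As an aside, the `Option<bool>` plus `skip_serializing_if` pattern is what keeps unset flags out of the serialized config entirely, while missing keys deserialize to `None`; this is how the flags stay invisible in existing configs. A minimal standalone sketch of that behavior, using a stand-in `FlagsToml` struct rather than the real `ConfigToml` (assumes the `serde` and `toml` crates):

```rust
use serde::{Deserialize, Serialize};

// Stand-in for the two new ConfigToml fields; not the real pageserver struct.
#[derive(Debug, Serialize, Deserialize)]
struct FlagsToml {
    #[serde(skip_serializing_if = "Option::is_none")]
    load_previous_heatmap: Option<bool>,
    #[serde(skip_serializing_if = "Option::is_none")]
    generate_unarchival_heatmap: Option<bool>,
}

fn main() {
    // Missing keys deserialize to None rather than erroring.
    let unset: FlagsToml = toml::from_str("").unwrap();
    assert_eq!(unset.load_previous_heatmap, None);

    // None fields are skipped on serialization, so the output stays empty.
    assert_eq!(toml::to_string(&unset).unwrap(), "");

    // An explicitly set key round-trips as Some(true).
    let set: FlagsToml = toml::from_str("load_previous_heatmap = true").unwrap();
    assert_eq!(set.load_previous_heatmap, Some(true));
}
```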


```diff
@@ -194,6 +194,13 @@ pub struct PageServerConf {
     /// Interpreted protocol feature: if enabled, validate that the logical WAL received from
     /// safekeepers does not have gaps.
     pub validate_wal_contiguity: bool,
+
+    /// When set, the previously written to disk heatmap is loaded on tenant attach and used
+    /// to avoid clobbering the heatmap from new, cold, attached locations.
+    pub load_previous_heatmap: bool,
+
+    /// When set, include visible layers in the next uploaded heatmaps of an unarchived timeline.
+    pub generate_unarchival_heatmap: bool,
 }
 
 /// Token for authentication to safekeepers
@@ -358,6 +365,8 @@ impl PageServerConf {
             get_vectored_concurrent_io,
             enable_read_path_debugging,
             validate_wal_contiguity,
+            load_previous_heatmap,
+            generate_unarchival_heatmap,
         } = config_toml;
 
         let mut conf = PageServerConf {
@@ -447,6 +456,8 @@ impl PageServerConf {
             no_sync: no_sync.unwrap_or(false),
             enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false),
             validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false),
+            load_previous_heatmap: load_previous_heatmap.unwrap_or(false),
+            generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(false),
         };
 
         // ------------------------------------------------------------
@@ -493,6 +504,8 @@ impl PageServerConf {
             metric_collection_interval: Duration::from_secs(60),
             synthetic_size_calculation_interval: Duration::from_secs(60),
             background_task_maximum_delay: Duration::ZERO,
+            load_previous_heatmap: Some(true),
+            generate_unarchival_heatmap: Some(true),
             ..Default::default()
         };
 
         PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()
```
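
The `unwrap_or(false)` calls are where the default comes from: a flag absent from the TOML materializes as `false` on the runtime config, while the test-only dummy config opts in with `Some(true)`. A simplified sketch of that flattening, using stand-in types rather than the real `ConfigToml`/`PageServerConf`:

```rust
// Optional TOML fields become concrete bools on the runtime config,
// defaulting to "off" when unset. Stand-in types, not pageserver's.
#[derive(Default)]
struct ConfigTomlSketch {
    load_previous_heatmap: Option<bool>,
    generate_unarchival_heatmap: Option<bool>,
}

struct PageServerConfSketch {
    load_previous_heatmap: bool,
    generate_unarchival_heatmap: bool,
}

fn parse(toml: ConfigTomlSketch) -> PageServerConfSketch {
    PageServerConfSketch {
        // Absent in the TOML means the feature stays disabled.
        load_previous_heatmap: toml.load_previous_heatmap.unwrap_or(false),
        generate_unarchival_heatmap: toml.generate_unarchival_heatmap.unwrap_or(false),
    }
}

fn main() {
    let conf = parse(ConfigTomlSketch::default());
    assert!(!conf.load_previous_heatmap && !conf.generate_unarchival_heatmap);
}
```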


```diff
@@ -1150,7 +1150,7 @@ impl Tenant {
         // a previous heatmap which contains all visible layers in the layer map.
         // This previous heatmap will be used whenever a fresh heatmap is generated
         // for the timeline.
-        if matches!(cause, LoadTimelineCause::Unoffload) {
+        if self.conf.generate_unarchival_heatmap && matches!(cause, LoadTimelineCause::Unoffload) {
             let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn()));
             while let Some((tline, end_lsn)) = tline_ending_at {
                 let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await;
@@ -1582,6 +1582,10 @@
     }
 
     async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> {
+        if !self.conf.load_previous_heatmap {
+            return None;
+        }
+
         let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id);
         match tokio::fs::read_to_string(on_disk_heatmap_path).await {
             Ok(heatmap) => match serde_json::from_str::<HeatMapTenant>(&heatmap) {
```
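
The second hunk is a plain config-gated early return: with `load_previous_heatmap` off, the tenant reports "no previous heatmap" without ever touching disk. A minimal sketch of the same guard shape, with an illustrative `Conf` type and the `tokio` crate assumed (not the pageserver's real signature):

```rust
// Illustrative stand-in for the gating config, not a pageserver type.
struct Conf {
    load_previous_heatmap: bool,
}

// Requires a tokio dependency with the "fs" feature enabled.
async fn read_on_disk_heatmap(conf: &Conf, path: &std::path::Path) -> Option<String> {
    if !conf.load_previous_heatmap {
        // Feature flag off: behave exactly as if no heatmap had been written.
        return None;
    }
    tokio::fs::read_to_string(path).await.ok()
}
```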


```diff
@@ -1169,6 +1169,8 @@ class NeonEnv:
             # Disable pageserver disk syncs in tests: when running tests concurrently, this avoids
             # the pageserver taking a long time to start up due to syncfs flushing other tests' data
             "no_sync": True,
+            # Look for gaps in WAL received from safekeepers
+            "validate_wal_contiguity": True,
         }
 
         # Batching (https://github.com/neondatabase/neon/issues/9377):
@@ -1181,11 +1183,12 @@
         if config.test_may_use_compatibility_snapshot_binaries:
             log.info(
-                "Skipping WAL contiguity validation to avoid forward-compatibility related test failures"
+                "Skipping prev heatmap settings to avoid forward-compatibility related test failures"
             )
         else:
-            # Look for gaps in WAL received from safekeepers
-            ps_cfg["validate_wal_contiguity"] = True
+            ps_cfg["load_previous_heatmap"] = True
+            ps_cfg["generate_unarchival_heatmap"] = True
 
         get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io
         if get_vectored_concurrent_io is not None:
```