pageserver: include visible layers in heatmaps after unarchival (#10880)

## Problem https://github.com/neondatabase/neon/pull/10788 introduced an API for warming up attached locations by downloading all layers in the heatmap. We intend to use it for warming up timelines after unarchival too, but it doesn't work. Any heatmap generated after the unarchival will not include our timeline, so we've lost all those layers. ## Summary of changes Generate a cheeky heatmap on unarchival. It includes all the visible layers. Use that as the `PreviousHeatmap`, which feeds into the actual heatmap generation. Closes: https://github.com/neondatabase/neon/issues/10541
2026-05-30 19:40:39 +00:00 · 2025-02-24 15:21:17 +00:00
parent 17724a19e6
commit 459446fcb8
5 changed files with 168 additions and 19 deletions
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -8,9 +8,10 @@ from pathlib import Path
 from typing import TYPE_CHECKING

 import pytest
-from fixtures.common_types import TenantId, TenantShardId, TimelineId
+from fixtures.common_types import TenantId, TenantShardId, TimelineArchivalState, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    DEFAULT_BRANCH_NAME,
    NeonEnvBuilder,
    NeonPageserver,
    StorageControllerMigrationConfig,
@@ -927,8 +928,12 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
    workload.write_rows(128, upload=True)
    workload.write_rows(128, upload=True)
    workload.write_rows(128, upload=True)
+
+    child_timeline_id = env.create_branch(
+        "foo", tenant_id, ancestor_branch_name=DEFAULT_BRANCH_NAME
+    )
+
    workload.write_rows(128, upload=True)
-    workload.stop()

    # Expect lots of layers
    assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10
@@ -937,9 +942,19 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
    for ps in env.pageservers:
        ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")])

+    def timeline_heatmap(tlid):
+        assert env.pageserver_remote_storage is not None
+
+        heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id)
+        for htl in heatmap["timelines"]:
+            if htl["timeline_id"] == str(tlid):
+                return htl
+
+        raise RuntimeError(f"No heatmap for timeline: {tlid}")
+
    # Upload a heatmap, so that secondaries have something to download
    ps_attached.http_client().tenant_heatmap_upload(tenant_id)
-    heatmap_before_migration = env.pageserver_remote_storage.heatmap_content(tenant_id)
+    heatmap_before_migration = timeline_heatmap(timeline_id)

    # This has no chance to succeed: we have lots of layers and each one takes at least 1000ms.
    # However, it pulls the heatmap, which will be important later.
@@ -971,17 +986,12 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
    assert env.storage_controller.locate(tenant_id)[0]["node_id"] == ps_secondary.id

    ps_secondary.http_client().tenant_heatmap_upload(tenant_id)
-    heatmap_after_migration = env.pageserver_remote_storage.heatmap_content(tenant_id)
+    heatmap_after_migration = timeline_heatmap(timeline_id)

-    assert len(heatmap_before_migration["timelines"][0]["layers"]) > 0
+    assert len(heatmap_before_migration["layers"]) > 0

-    # The new layer map should contain all the layers in the pre-migration one
-    # and a new in memory layer
-    after_migration_heatmap_layers_count = len(heatmap_after_migration["timelines"][0]["layers"])
-    assert (
-        len(heatmap_before_migration["timelines"][0]["layers"]) + 1
-        == after_migration_heatmap_layers_count
-    )
+    after_migration_heatmap_layers_count = len(heatmap_after_migration["layers"])
+    assert len(heatmap_before_migration["layers"]) <= after_migration_heatmap_layers_count

    log.info(f"Heatmap size after cold migration is {after_migration_heatmap_layers_count}")

@@ -989,10 +999,71 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
        TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id
    )

-    def all_layers_downloaded():
+    # Now simulate the case where a child timeline is archived, the parent's layers
+    # are evicted, and the child is unarchived. When the child is unarchived, both
+    # it and its parent update their heatmaps to contain the layers needed by the
+    # child. The timeline hierarchy can then be warmed up, since the heatmaps are ready.
+
+    def all_layers_downloaded(expected_layer_count: int):
        local_layers_count = len(ps_secondary.list_layers(tenant_id, timeline_id))

        log.info(f"{local_layers_count=} {after_migration_heatmap_layers_count=}")
-        assert local_layers_count == after_migration_heatmap_layers_count
+        assert local_layers_count >= expected_layer_count

-    wait_until(all_layers_downloaded)
+    wait_until(lambda: all_layers_downloaded(after_migration_heatmap_layers_count))
+    ps_secondary.http_client().tenant_heatmap_upload(tenant_id)
+
+    before = (
+        ps_secondary.http_client()
+        .get_metrics()
+        .query_one("pageserver_remote_ondemand_downloaded_layers_total")
+        .value
+    )
+    workload.validate()
+    after = (
+        ps_secondary.http_client()
+        .get_metrics()
+        .query_one("pageserver_remote_ondemand_downloaded_layers_total")
+        .value
+    )
+
+    workload.stop()
+    assert before == after
+
+    def check_archival_state(state: TimelineArchivalState, tline):
+        timelines = (
+            timeline["timeline_id"]
+            for timeline in ps_secondary.http_client().timeline_list(tenant_id=tenant_id)
+        )
+
+        if state == TimelineArchivalState.ARCHIVED:
+            assert str(tline) not in timelines
+        elif state == TimelineArchivalState.UNARCHIVED:
+            assert str(tline) in timelines
+
+    ps_secondary.http_client().timeline_archival_config(
+        tenant_id, child_timeline_id, TimelineArchivalState.ARCHIVED
+    )
+    ps_secondary.http_client().timeline_offload(tenant_id, child_timeline_id)
+    wait_until(lambda: check_archival_state(TimelineArchivalState.ARCHIVED, child_timeline_id))
+
+    ps_secondary.http_client().evict_all_layers(tenant_id, timeline_id)
+    ps_secondary.http_client().tenant_heatmap_upload(tenant_id)
+    assert len(timeline_heatmap(timeline_id)["layers"]) == 0
+
+    ps_secondary.http_client().timeline_archival_config(
+        tenant_id, child_timeline_id, TimelineArchivalState.UNARCHIVED
+    )
+    wait_until(lambda: check_archival_state(TimelineArchivalState.UNARCHIVED, child_timeline_id))
+
+    ps_secondary.http_client().tenant_heatmap_upload(tenant_id)
+    log.info(f"Parent timeline heatmap size: {len(timeline_heatmap(timeline_id)['layers'])}")
+    log.info(f"Child timeline heatmap size: {len(timeline_heatmap(child_timeline_id)['layers'])}")
+
+    expected_locally = len(timeline_heatmap(timeline_id)["layers"])
+    assert expected_locally > 0
+
+    env.storage_controller.download_heatmap_layers(
+        TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id
+    )
+    wait_until(lambda: all_layers_downloaded(expected_locally))