From 0e00faf52891f36db29b471a5a7ad89b8a123b21 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 16 Apr 2025 17:31:23 +0100
Subject: [PATCH] tests: stability fixes for `test_migration_to_cold_secondary`
 (#11606)

1. Compute may generate WAL on shutdown. The test assumes that after
shutdown,
no further ingest happens. Tweak the compute shutdown to make the
assumption true.
2. Assertion of local layer count post cold migration is not right since
we may have downloaded
layers due to ingest. Remove it.

Closes https://github.com/neondatabase/neon/issues/11587
---
 test_runner/regress/test_pageserver_secondary.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index d48e731394..3aa0c63979 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -14,6 +14,7 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     NeonPageserver,
     StorageControllerMigrationConfig,
+    flush_ep_to_pageserver,
 )
 from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.utils import (
@@ -997,10 +998,6 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
     ps_secondary.http_client().tenant_heatmap_upload(tenant_id)
     heatmap_after_migration = timeline_heatmap(timeline_id)
 
-    local_layers = ps_secondary.list_layers(tenant_id, timeline_id)
-    # We download 1 layer per second and give up within 5 seconds.
-    assert len(local_layers) < 10
-
     after_migration_heatmap_layers_count = len(heatmap_after_migration["layers"])
     log.info(f"Heatmap size after cold migration is {after_migration_heatmap_layers_count}")
 
@@ -1038,9 +1035,14 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
         .value
     )
 
-    workload.stop()
     assert before == after
 
+    # Stop the endpoint and wait until any finally written WAL propagates to
+    # the pageserver and is uploaded to remote storage.
+    flush_ep_to_pageserver(env, workload.endpoint(), tenant_id, timeline_id)
+    ps_secondary.http_client().timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
+    workload.stop()
+
     # Now simulate the case where a child timeline is archived, parent layers
     # are evicted and the child is unarchived. When the child is unarchived,
     # itself and the parent update their heatmaps to contain layers needed by the