tests: further stabilize test_deletion_queue_recovery (#7335)

This is the other main failure mode called out in #6092 , that the test can shut down the pageserver while it has "future layers" in the index, and that this results in unexpected stats after restart. We can avoid this nondeterminism by shutting down the endpoint, flushing everything from SK to PS, checkpointing, and then waiting for that final LSN to be uploaded. This is more heavyweight than most of our tests require, but useful in the case of tests that expect a particular behavior after restart wrt layer deletions.
2026-01-09 14:32:57 +00:00 · 2024-04-07 22:21:18 +01:00
parent 74b2314a5d
commit 0788760451
2 changed files with 21 additions and 1 deletions
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -22,6 +22,7 @@ from fixtures.neon_fixtures import (
    NeonPageserver,
    PgBin,
    S3Scrubber,
+    flush_ep_to_pageserver,
    last_flush_lsn_upload,
 )
 from fixtures.pageserver.http import PageserverApiException
@@ -30,6 +31,7 @@ from fixtures.pageserver.utils import (
    list_prefix,
    wait_for_last_record_lsn,
    wait_for_upload,
+    wait_for_upload_queue_empty,
 )
 from fixtures.remote_storage import (
    RemoteStorageKind,
@@ -120,6 +122,17 @@ def generate_uploads_and_deletions(
        print_gc_result(gc_result)
        assert gc_result["layers_removed"] > 0

+        # Stop endpoint and flush all data to pageserver, then checkpoint it: this
+        # ensures that the pageserver is in a fully idle state: there will be no more
+        # background ingest, no more uploads pending, and therefore no non-determinism
+        # in subsequent actions like pageserver restarts.
+        final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id)
+        ps_http.timeline_checkpoint(tenant_id, timeline_id)
+        # Finish uploads
+        wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn)
+        # Finish all remote writes (including deletions)
+        wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
+

 def read_all(
    env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -1187,7 +1187,14 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
    storcon_cli(["node-configure", "--node-id", "1", "--scheduling", "pause"])
    assert "Pause" in storcon_cli(["nodes"])[3]

-    # Make a node offline
+    # We will simulate a node death and then marking it offline
+    env.pageservers[0].stop(immediate=True)
+    # Sleep to make it unlikely that the controller's heartbeater will race handling
+    # a /utilization response internally, such that it marks the node back online.  IRL
+    # there would always be a longer delay than this before a node failing and a human
+    # intervening.
+    time.sleep(2)
+
    storcon_cli(["node-configure", "--node-id", "1", "--availability", "offline"])
    assert "Offline" in storcon_cli(["nodes"])[3]