ondemand_download_large_rel: solve flakyness (#3697)

Disable background tasks to not get compaction downloading all layers but also stop safekeepers before checkpointing, use a readonly endpoint. Fixes: #3666 Co-authored-by: Christian Schwarz <christian@neon.tech>
2026-01-07 05:22:56 +00:00 · 2023-05-17 17:19:02 +03:00
parent 9767432cff
commit 918cd25453
1 changed files with 19 additions and 4 deletions
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -64,12 +64,15 @@ def test_ondemand_download_large_rel(
    tenant, _ = env.neon_cli.create_tenant(
        conf={
            # disable background GC
-            "gc_period": "10 m",
+            "gc_period": "0s",
            "gc_horizon": f"{10 * 1024 ** 3}",  # 10 GB
            # small checkpoint distance to create more delta layer files
            "checkpoint_distance": f"{10 * 1024 ** 2}",  # 10 MB
+            # allow compaction with the checkpoint
            "compaction_threshold": "3",
            "compaction_target_size": f"{10 * 1024 ** 2}",  # 10 MB
+            # but don't run compaction in background or on restart
+            "compaction_period": "0s",
        }
    )
    env.initial_tenant = tenant
@@ -96,9 +99,17 @@ def test_ondemand_download_large_rel(

        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))

-    # wait until pageserver receives that data
    wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)

+    # stop endpoint before checkpoint to stop wal generation
+    endpoint.stop()
+
+    # stopping of safekeepers now will help us not to calculate logical size
+    # after startup, so page requests should be the only one on-demand
+    # downloading the layers
+    for sk in env.safekeepers:
+        sk.stop()
+
    # run checkpoint manually to be sure that data landed in remote storage
    client.timeline_checkpoint(tenant_id, timeline_id)

@@ -107,7 +118,6 @@ def test_ondemand_download_large_rel(
    log.info("uploads have finished")

    ##### Stop the first pageserver instance, erase all its data
-    endpoint.stop()
    env.pageserver.stop()

    # remove all the layer files
@@ -118,8 +128,13 @@ def test_ondemand_download_large_rel(
    ##### Second start, restore the data and ensure it's the same
    env.pageserver.start()

-    endpoint.start()
+    # start a readonly endpoint which we'll use to check the database.
+    # readonly (with lsn=) is required so that we don't try to connect to
+    # safekeepers, that have now been shut down.
+    endpoint = env.endpoints.create_start("main", lsn=current_lsn)
+
    before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
+    assert before_downloads != 0, "basebackup should on-demand non-zero layers"

    # Probe in the middle of the table. There's a high chance that the beginning
    # and end of the table was stored together in the same layer files with data