From 918cd25453e8b57b11eb2ef36d07de73a4d3a9c4 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 17 May 2023 17:19:02 +0300
Subject: [PATCH] ondemand_download_large_rel: solve flakyness (#3697)

Disable background tasks to not get compaction downloading all layers
but also stop safekeepers before checkpointing, use a readonly endpoint.

Fixes: #3666

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 test_runner/regress/test_ondemand_download.py | 23 +++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index 31f6c1f3d9..1414b4ed8e 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -64,12 +64,15 @@ def test_ondemand_download_large_rel(
     tenant, _ = env.neon_cli.create_tenant(
         conf={
             # disable background GC
-            "gc_period": "10 m",
+            "gc_period": "0s",
             "gc_horizon": f"{10 * 1024 ** 3}",  # 10 GB
             # small checkpoint distance to create more delta layer files
             "checkpoint_distance": f"{10 * 1024 ** 2}",  # 10 MB
+            # allow compaction with the checkpoint
             "compaction_threshold": "3",
             "compaction_target_size": f"{10 * 1024 ** 2}",  # 10 MB
+            # but don't run compaction in background or on restart
+            "compaction_period": "0s",
         }
     )
     env.initial_tenant = tenant
@@ -96,9 +99,17 @@ def test_ondemand_download_large_rel(
 
         current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
 
-    # wait until pageserver receives that data
     wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
 
+    # stop endpoint before checkpoint to stop wal generation
+    endpoint.stop()
+
+    # stopping of safekeepers now will help us not to calculate logical size
+    # after startup, so page requests should be the only one on-demand
+    # downloading the layers
+    for sk in env.safekeepers:
+        sk.stop()
+
     # run checkpoint manually to be sure that data landed in remote storage
     client.timeline_checkpoint(tenant_id, timeline_id)
 
@@ -107,7 +118,6 @@ def test_ondemand_download_large_rel(
     log.info("uploads have finished")
 
     ##### Stop the first pageserver instance, erase all its data
-    endpoint.stop()
     env.pageserver.stop()
 
     # remove all the layer files
@@ -118,8 +128,13 @@ def test_ondemand_download_large_rel(
     ##### Second start, restore the data and ensure it's the same
     env.pageserver.start()
 
-    endpoint.start()
+    # start a readonly endpoint which we'll use to check the database.
+    # readonly (with lsn=) is required so that we don't try to connect to
+    # safekeepers, that have now been shut down.
+    endpoint = env.endpoints.create_start("main", lsn=current_lsn)
+
     before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
+    assert before_downloads != 0, "basebackup should on-demand non-zero layers"
 
     # Probe in the middle of the table. There's a high chance that the beginning
     # and end of the table was stored together in the same layer files with data