From 918cd25453e8b57b11eb2ef36d07de73a4d3a9c4 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 17 May 2023 17:19:02 +0300 Subject: [PATCH] ondemand_download_large_rel: solve flakyness (#3697) Disable background tasks to not get compaction downloading all layers but also stop safekeepers before checkpointing, use a readonly endpoint. Fixes: #3666 Co-authored-by: Christian Schwarz --- test_runner/regress/test_ondemand_download.py | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 31f6c1f3d9..1414b4ed8e 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -64,12 +64,15 @@ def test_ondemand_download_large_rel( tenant, _ = env.neon_cli.create_tenant( conf={ # disable background GC - "gc_period": "10 m", + "gc_period": "0s", "gc_horizon": f"{10 * 1024 ** 3}", # 10 GB # small checkpoint distance to create more delta layer files "checkpoint_distance": f"{10 * 1024 ** 2}", # 10 MB + # allow compaction with the checkpoint "compaction_threshold": "3", "compaction_target_size": f"{10 * 1024 ** 2}", # 10 MB + # but don't run compaction in background or on restart + "compaction_period": "0s", } ) env.initial_tenant = tenant @@ -96,9 +99,17 @@ def test_ondemand_download_large_rel( current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) - # wait until pageserver receives that data wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + # stop endpoint before checkpoint to stop wal generation + endpoint.stop() + + # stopping of safekeepers now will help us not to calculate logical size + # after startup, so page requests should be the only one on-demand + # downloading the layers + for sk in env.safekeepers: + sk.stop() + # run checkpoint manually to be sure that data landed in remote storage client.timeline_checkpoint(tenant_id, timeline_id) @@ -107,7 +118,6 @@ def test_ondemand_download_large_rel( log.info("uploads have finished") ##### Stop the first pageserver instance, erase all its data - endpoint.stop() env.pageserver.stop() # remove all the layer files @@ -118,8 +128,13 @@ def test_ondemand_download_large_rel( ##### Second start, restore the data and ensure it's the same env.pageserver.start() - endpoint.start() + # start a readonly endpoint which we'll use to check the database. + # readonly (with lsn=) is required so that we don't try to connect to + # safekeepers, that have now been shut down. + endpoint = env.endpoints.create_start("main", lsn=current_lsn) + before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) + assert before_downloads != 0, "basebackup should on-demand non-zero layers" # Probe in the middle of the table. There's a high chance that the beginning # and end of the table was stored together in the same layer files with data