ondemand_download_large_rel: solve flakyness (#3697)

Disable background tasks to not get compaction downloading all layers
but also stop safekeepers before checkpointing, use a readonly endpoint.

Fixes: #3666

Co-authored-by: Christian Schwarz <christian@neon.tech>
This commit is contained in:
Joonas Koivunen
2023-05-17 17:19:02 +03:00
committed by GitHub
parent 9767432cff
commit 918cd25453

View File

@@ -64,12 +64,15 @@ def test_ondemand_download_large_rel(
tenant, _ = env.neon_cli.create_tenant(
conf={
# disable background GC
"gc_period": "10 m",
"gc_period": "0s",
"gc_horizon": f"{10 * 1024 ** 3}", # 10 GB
# small checkpoint distance to create more delta layer files
"checkpoint_distance": f"{10 * 1024 ** 2}", # 10 MB
# allow compaction with the checkpoint
"compaction_threshold": "3",
"compaction_target_size": f"{10 * 1024 ** 2}", # 10 MB
# but don't run compaction in background or on restart
"compaction_period": "0s",
}
)
env.initial_tenant = tenant
@@ -96,9 +99,17 @@ def test_ondemand_download_large_rel(
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
# wait until pageserver receives that data
wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)
# stop endpoint before checkpoint to stop wal generation
endpoint.stop()
# stopping of safekeepers now will help us not to calculate logical size
# after startup, so page requests should be the only one on-demand
# downloading the layers
for sk in env.safekeepers:
sk.stop()
# run checkpoint manually to be sure that data landed in remote storage
client.timeline_checkpoint(tenant_id, timeline_id)
@@ -107,7 +118,6 @@ def test_ondemand_download_large_rel(
log.info("uploads have finished")
##### Stop the first pageserver instance, erase all its data
endpoint.stop()
env.pageserver.stop()
# remove all the layer files
@@ -118,8 +128,13 @@ def test_ondemand_download_large_rel(
##### Second start, restore the data and ensure it's the same
env.pageserver.start()
endpoint.start()
# start a readonly endpoint which we'll use to check the database.
# readonly (with lsn=) is required so that we don't try to connect to
# safekeepers, that have now been shut down.
endpoint = env.endpoints.create_start("main", lsn=current_lsn)
before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
assert before_downloads != 0, "basebackup should on-demand non-zero layers"
# Probe in the middle of the table. There's a high chance that the beginning
# and end of the table was stored together in the same layer files with data