From a221ecb0da7aedfdf5a9bb659add500024d29b22 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sat, 16 Sep 2023 18:27:19 +0300 Subject: [PATCH] test: test_download_remote_layers_api again (#5322) The test is still flaky, perhaps more after #5233, see #3831. Do one more `timeline_checkpoint` *after* shutting down safekeepers *before* shutting down pageserver. Put more effort into not compacting or creating image layers. --- test_runner/regress/test_ondemand_download.py | 46 ++++++++----------- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 4041b8aab7..a38a517100 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -323,8 +323,8 @@ def test_download_remote_layers_api( "compaction_period": "0s", # small checkpoint distance to create more delta layer files "checkpoint_distance": f"{1 * 1024 ** 2}", # 1 MB - "compaction_threshold": "1", - "image_creation_threshold": "1", + "compaction_threshold": "999999", + "image_creation_threshold": "999999", "compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB } ) @@ -357,8 +357,20 @@ def test_download_remote_layers_api( tenant_id, timeline_id, "pageserver_resident_physical_size" ) + # Shut down safekeepers before doing the final checkpoint. + # If we don't, they might stream us more WAL after it, changing the sizes we assert on. 
+ for sk in env.safekeepers: + sk.stop() + + # It is unfortunate that we cannot flush the in-memory layer without also + # triggering compaction, so work around that with very high L0 count and + # image layer creation thresholds. + client.timeline_checkpoint(tenant_id, timeline_id) + + wait_for_upload_queue_empty(client, tenant_id, timeline_id) + filled_current_physical = get_api_current_physical_size() - log.info(filled_current_physical) + log.info(f"filled_current_physical: {filled_current_physical}") filled_size = get_resident_physical_size() log.info(f"filled_size: {filled_size}") assert filled_current_physical == filled_size, "we don't yet do layer eviction" @@ -366,18 +378,10 @@ def test_download_remote_layers_api( env.pageserver.stop() # remove all the layer files - # XXX only delete some of the layer files, to show that it really just downloads all the layers for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"): log.info(f"unlinking layer {layer.name}") layer.unlink() - # Shut down safekeepers before starting the pageserver. - # If we don't, the tenant's walreceiver handler will trigger the - # the logical size computation task, and that downloads layes, - # which makes our assertions on size fail. - for sk in env.safekeepers: - sk.stop(immediate=True) - ##### Second start, restore the data and ensure it's the same env.pageserver.start(extra_env_vars={"FAILPOINTS": "remote-storage-download-pre-rename=return"}) env.pageserver.allowed_errors.extend( @@ -391,32 +395,21 @@ def test_download_remote_layers_api( ###### Phase 1: exercise download error code path - # comparison here is requiring the size to be at least the previous size, because it's possible received WAL after last_flush_lsn_upload - # witnessed for example difference of 29827072 (filled_current_physical) to 29868032 (here) is no good reason to fail a test. 
this_time = get_api_current_physical_size() assert ( - filled_current_physical <= this_time + filled_current_physical == this_time ), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote" - if filled_current_physical != this_time: - log.info( - f"fixing up filled_current_physical from {filled_current_physical} to {this_time} ({this_time - filled_current_physical})" - ) - filled_current_physical = this_time post_unlink_size = get_resident_physical_size() log.info(f"post_unlink_size: {post_unlink_size}") assert ( post_unlink_size < filled_size ), "we just deleted layers and didn't cause anything to re-download them yet" - assert filled_size - post_unlink_size > 5 * ( - 1024**2 - ), "we may be downloading some layers as part of tenant activation" # issue downloads that we know will fail info = client.timeline_download_remote_layers( tenant_id, timeline_id, - # allow some concurrency to unveil potential concurrency bugs max_concurrent_downloads=10, errors_ok=True, at_least_one_download=False, @@ -425,9 +418,9 @@ def test_download_remote_layers_api( assert info["state"] == "Completed" assert info["total_layer_count"] > 0 assert info["successful_download_count"] == 0 - assert ( - info["failed_download_count"] > 0 - ) # can't assert == total_layer_count because attach + tenant status downloads some layers + # can't assert == total_layer_count because timeline_detail also tries to + # download layers for logical size, but this might not always hold. 
+ assert info["failed_download_count"] > 0 assert ( info["total_layer_count"] == info["successful_download_count"] + info["failed_download_count"] @@ -436,7 +429,6 @@ def test_download_remote_layers_api( assert ( get_resident_physical_size() == post_unlink_size ), "didn't download anything new due to failpoint" - # would be nice to assert that the layers in the layer map are still RemoteLayer ##### Retry, this time without failpoints client.configure_failpoints(("remote-storage-download-pre-rename", "off"))