From 51a43b121c0409ab49f443c1a0f93645199a50bb Mon Sep 17 00:00:00 2001
From: Arpad Müller
Date: Mon, 26 Feb 2024 13:21:40 +0100
Subject: [PATCH] Fix test_remote_storage_upload_queue_retries flakiness
 (#6898)

* decrease the checkpointing and compaction targets so that even more
  layer files are generated

* write 10 thousand rows 2 times instead of writing 20 thousand rows 1
  time, so that there is more to GC. Before, the layer file count was
  noisily jumping between 1 and 0; now it jumps between 19 and 20. The 0
  caused an assertion error that was responsible for most of the test's
  flakiness.

* use a larger timeout when joining the churn-while-failpoints-are-active
  thread, mostly so that the test is more robust on systems under heavier
  load

Fixes #3051
---
 test_runner/regress/test_remote_storage.py | 37 ++++++++++++----------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 95f912ccc5..176a5e57dc 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -228,9 +228,9 @@ def test_remote_storage_upload_queue_retries(
     tenant_id, timeline_id = env.neon_cli.create_tenant(
         conf={
             # small checkpointing and compaction targets to ensure we generate many upload operations
-            "checkpoint_distance": f"{128 * 1024}",
+            "checkpoint_distance": f"{64 * 1024}",
             "compaction_threshold": "1",
-            "compaction_target_size": f"{128 * 1024}",
+            "compaction_target_size": f"{64 * 1024}",
             # no PITR horizon, we specify the horizon when we request on-demand GC
             "pitr_interval": "0s",
             # disable background compaction and GC. We invoke it manually when we want it to happen.
@@ -256,21 +256,24 @@ def test_remote_storage_upload_queue_retries(
         ]
     )
 
+    FOO_ROWS_COUNT = 4000
+
     def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data):
         # create initial set of layers & upload them with failpoints configured
-        endpoint.safe_psql_many(
-            [
-                f"""
-                INSERT INTO foo (id, val)
-                SELECT g, '{data}'
-                FROM generate_series(1, 20000) g
-                ON CONFLICT (id) DO UPDATE
-                SET val = EXCLUDED.val
-                """,
-                # to ensure that GC can actually remove some layers
-                "VACUUM foo",
-            ]
-        )
+        for _v in range(2):
+            endpoint.safe_psql_many(
+                [
+                    f"""
+                    INSERT INTO foo (id, val)
+                    SELECT g, '{data}'
+                    FROM generate_series(1, {FOO_ROWS_COUNT}) g
+                    ON CONFLICT (id) DO UPDATE
+                    SET val = EXCLUDED.val
+                    """,
+                    # to ensure that GC can actually remove some layers
+                    "VACUUM foo",
+                ]
+            )
         wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
 
     def get_queued_count(file_kind, op_kind):
@@ -333,7 +336,7 @@ def test_remote_storage_upload_queue_retries(
 
     # The churn thread doesn't make progress once it blocks on the first wait_completion() call,
     # so, give it some time to wrap up.
-    churn_while_failpoints_active_thread.join(30)
+    churn_while_failpoints_active_thread.join(60)
     assert not churn_while_failpoints_active_thread.is_alive()
     assert churn_thread_result[0]
 
@@ -365,7 +368,7 @@ def test_remote_storage_upload_queue_retries(
     log.info("restarting postgres to validate")
    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
     with endpoint.cursor() as cur:
-        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 20000
+        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == FOO_ROWS_COUNT
 
 
 def test_remote_timeline_client_calls_started_metric(
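
The join() timeout bump from 30 to 60 seconds matters because Python's
threading.Thread.join(timeout) returns silently when the timeout expires:
a stuck churn thread is only detected by the is_alive() assertion that
follows the join. Below is a minimal, standalone sketch of that pattern
(hypothetical worker, not taken from the test itself):

    import threading
    import time

    def slow_worker():
        # stands in for the churn workload; runs longer than the first timeout
        time.sleep(5)

    t = threading.Thread(target=slow_worker)
    t.start()

    t.join(1)                # timeout expires: join() just returns, nothing is raised
    assert t.is_alive()      # only this check reveals the worker is still running

    t.join()                 # wait for the worker to actually finish
    assert not t.is_alive()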