From 51a43b121c0409ab49f443c1a0f93645199a50bb Mon Sep 17 00:00:00 2001
From: Arpad Müller
Date: Mon, 26 Feb 2024 13:21:40 +0100
Subject: [PATCH] Fix test_remote_storage_upload_queue_retries flakiness
 (#6898)

* decrease the checkpointing and compaction targets so that even more
  layer files are generated

* write 10 thousand rows 2 times instead of writing 20 thousand rows 1
  time, so that there is more to GC. Before, the layer file count was
  noisily jumping between 1 and 0; now it jumps between 19 and 20. The 0
  caused an assertion error that was responsible for most of the test's
  flakiness.

* use a larger timeout when joining the churn-while-failpoints-are-active
  thread, mostly so that the test is more robust on systems under heavier
  load

Fixes #3051
---
 test_runner/regress/test_remote_storage.py | 37 ++++++++++++----------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 95f912ccc5..176a5e57dc 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -228,9 +228,9 @@ def test_remote_storage_upload_queue_retries(
     tenant_id, timeline_id = env.neon_cli.create_tenant(
         conf={
             # small checkpointing and compaction targets to ensure we generate many upload operations
-            "checkpoint_distance": f"{128 * 1024}",
+            "checkpoint_distance": f"{64 * 1024}",
             "compaction_threshold": "1",
-            "compaction_target_size": f"{128 * 1024}",
+            "compaction_target_size": f"{64 * 1024}",
             # no PITR horizon, we specify the horizon when we request on-demand GC
             "pitr_interval": "0s",
             # disable background compaction and GC. We invoke it manually when we want it to happen.
@@ -256,21 +256,24 @@ def test_remote_storage_upload_queue_retries(
         ]
     )
 
+    FOO_ROWS_COUNT = 4000
+
     def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data):
         # create initial set of layers & upload them with failpoints configured
-        endpoint.safe_psql_many(
-            [
-                f"""
-                INSERT INTO foo (id, val)
-                SELECT g, '{data}'
-                FROM generate_series(1, 20000) g
-                ON CONFLICT (id) DO UPDATE
-                SET val = EXCLUDED.val
-                """,
-                # to ensure that GC can actually remove some layers
-                "VACUUM foo",
-            ]
-        )
+        for _v in range(2):
+            endpoint.safe_psql_many(
+                [
+                    f"""
+                    INSERT INTO foo (id, val)
+                    SELECT g, '{data}'
+                    FROM generate_series(1, {FOO_ROWS_COUNT}) g
+                    ON CONFLICT (id) DO UPDATE
+                    SET val = EXCLUDED.val
+                    """,
+                    # to ensure that GC can actually remove some layers
+                    "VACUUM foo",
+                ]
+            )
         wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
 
     def get_queued_count(file_kind, op_kind):
@@ -333,7 +336,7 @@ def test_remote_storage_upload_queue_retries(
 
     # The churn thread doesn't make progress once it blocks on the first wait_completion() call,
     # so, give it some time to wrap up.
-    churn_while_failpoints_active_thread.join(30)
+    churn_while_failpoints_active_thread.join(60)
     assert not churn_while_failpoints_active_thread.is_alive()
     assert churn_thread_result[0]
 
@@ -365,7 +368,7 @@ def test_remote_storage_upload_queue_retries(
     log.info("restarting postgres to validate")
    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
     with endpoint.cursor() as cur:
-        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 20000
+        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == FOO_ROWS_COUNT
 
 
 def test_remote_timeline_client_calls_started_metric(
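
The join() timeout bump from 30 to 60 seconds matters because Python's
threading.Thread.join(timeout) returns silently when the timeout expires:
a stuck churn thread is only detected by the is_alive() assertion that
follows the join. Below is a minimal, standalone sketch of that pattern
(hypothetical worker, not taken from the test itself):

    import threading
    import time

    def slow_worker():
        # stands in for the churn workload; runs longer than the first timeout
        time.sleep(5)

    t = threading.Thread(target=slow_worker)
    t.start()

    t.join(1)                # timeout expires: join() just returns, nothing is raised
    assert t.is_alive()      # only this check reveals the worker is still running

    t.join()                 # wait for the worker to actually finish
    assert not t.is_alive()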