Fix test_remote_storage_upload_queue_retries flakiness (#6898)

* decrease checkpointing and compaction targets so that even more layer
  files are generated
* write 10 thousand rows 2 times instead of writing 20 thousand rows 1
  time, so that there is more to GC. Before, the layer file count was noisily
  jumping between 1 and 0; now it jumps between 19 and 20. The 0 caused an
  assertion error that was responsible for most of the test's flakiness.
* use a larger timeout for the "churn while failpoints are active" thread:
  this is mostly so that the test stays robust on systems under heavier load

Fixes #3051
Author: Arpad Müller
Date: 2024-02-26 13:21:40 +01:00
Committed by: GitHub
Parent: 256058f2ab
Commit: 51a43b121c


@@ -228,9 +228,9 @@ def test_remote_storage_upload_queue_retries(
     tenant_id, timeline_id = env.neon_cli.create_tenant(
         conf={
             # small checkpointing and compaction targets to ensure we generate many upload operations
-            "checkpoint_distance": f"{128 * 1024}",
+            "checkpoint_distance": f"{64 * 1024}",
             "compaction_threshold": "1",
-            "compaction_target_size": f"{128 * 1024}",
+            "compaction_target_size": f"{64 * 1024}",
             # no PITR horizon, we specify the horizon when we request on-demand GC
             "pitr_interval": "0s",
             # disable background compaction and GC. We invoke it manually when we want it to happen.
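
Note: as I understand the pageserver tenant config, checkpoint_distance caps how much WAL is buffered in the open layer before a new L0 layer is flushed, and compaction_target_size bounds the size of layers produced by compaction, so halving both should roughly double the number of layer files the same workload produces. A back-of-the-envelope sketch only; the per-row WAL figure is a made-up placeholder:

    # rough estimate only; BYTES_OF_WAL_PER_ROW is a hypothetical placeholder
    BYTES_OF_WAL_PER_ROW = 100
    rows_written = 2 * 4000  # two upsert passes over FOO_ROWS_COUNT rows

    for checkpoint_distance in (128 * 1024, 64 * 1024):
        approx_l0_layers = rows_written * BYTES_OF_WAL_PER_ROW // checkpoint_distance
        print(f"checkpoint_distance={checkpoint_distance}: ~{approx_l0_layers} L0 layers")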
@@ -256,21 +256,24 @@ def test_remote_storage_upload_queue_retries(
         ]
     )
 
+    FOO_ROWS_COUNT = 4000
+
     def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data):
         # create initial set of layers & upload them with failpoints configured
-        endpoint.safe_psql_many(
-            [
-                f"""
-                INSERT INTO foo (id, val)
-                SELECT g, '{data}'
-                FROM generate_series(1, 20000) g
-                ON CONFLICT (id) DO UPDATE
-                SET val = EXCLUDED.val
-                """,
-                # to ensure that GC can actually remove some layers
-                "VACUUM foo",
-            ]
-        )
+        for _v in range(2):
+            endpoint.safe_psql_many(
+                [
+                    f"""
+                    INSERT INTO foo (id, val)
+                    SELECT g, '{data}'
+                    FROM generate_series(1, {FOO_ROWS_COUNT}) g
+                    ON CONFLICT (id) DO UPDATE
+                    SET val = EXCLUDED.val
+                    """,
+                    # to ensure that GC can actually remove some layers
+                    "VACUUM foo",
+                ]
+            )
         wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
 
     def get_queued_count(file_kind, op_kind):
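
Note: running the upsert twice means every row version written by the first pass is superseded by the second, so after VACUUM and a flush the pageserver is left with data it can actually garbage-collect, which is the point of this change. A standalone sketch of the same write pattern outside the test harness; psycopg2, the connection string, and the table definition are illustrative assumptions, not part of the test:

    import psycopg2  # illustrative; the test drives this through endpoint.safe_psql_many

    ROWS = 4000  # mirrors FOO_ROWS_COUNT
    conn = psycopg2.connect("postgresql://localhost/postgres")  # hypothetical DSN
    conn.autocommit = True  # VACUUM cannot run inside a transaction block
    with conn.cursor() as cur:
        cur.execute("CREATE TABLE IF NOT EXISTS foo (id integer PRIMARY KEY, val text)")
        for pass_no in range(2):
            # the second pass overwrites every row from the first, creating garbage
            cur.execute(
                """
                INSERT INTO foo (id, val)
                SELECT g, %s FROM generate_series(1, %s) g
                ON CONFLICT (id) DO UPDATE SET val = EXCLUDED.val
                """,
                (f"pass-{pass_no}", ROWS),
            )
            cur.execute("VACUUM foo")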
@@ -333,7 +336,7 @@ def test_remote_storage_upload_queue_retries(
     # The churn thread doesn't make progress once it blocks on the first wait_completion() call,
     # so, give it some time to wrap up.
-    churn_while_failpoints_active_thread.join(30)
+    churn_while_failpoints_active_thread.join(60)
     assert not churn_while_failpoints_active_thread.is_alive()
     assert churn_thread_result[0]
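
Note: the bump from 30 to 60 seconds only changes how long the main thread is willing to wait. Thread.join(timeout) returns silently when the timeout elapses, which is why the explicit is_alive() assertion follows it. A minimal sketch of the pattern, with the churn body and result passing simplified and the failpoint setup elided:

    import threading

    churn_thread_result = [False]  # assumption: the thread reports success through this list

    def churn_while_failpoints_active(result):
        # ... exercise uploads while failpoints force the queue to retry ...
        result[0] = True

    churn_while_failpoints_active_thread = threading.Thread(
        target=churn_while_failpoints_active, args=[churn_thread_result]
    )
    churn_while_failpoints_active_thread.start()
    # ... configure failpoints, wait, clear failpoints (not shown) ...
    churn_while_failpoints_active_thread.join(60)  # returns even if the thread is still running
    assert not churn_while_failpoints_active_thread.is_alive()
    assert churn_thread_result[0]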
@@ -365,7 +368,7 @@ def test_remote_storage_upload_queue_retries(
     log.info("restarting postgres to validate")
     endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
     with endpoint.cursor() as cur:
-        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 20000
+        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == FOO_ROWS_COUNT
 
 
 def test_remote_timeline_client_calls_started_metric(
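
Note: the validation above uses query_scalar, which is, as far as I can tell, the test-suite helper for running a query that returns a single value. A minimal sketch of the assumed behaviour, not the actual fixtures implementation:

    def query_scalar(cur, query):
        # assumed behaviour: execute a query that yields exactly one row with
        # one column and return that single value
        cur.execute(query)
        return cur.fetchone()[0]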