From 2e5eab69c6161bfbf380df355f1ab195171d8601 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 2 Feb 2024 18:20:18 +0000
Subject: [PATCH] tests: remove test_gc_cutoff (#6587)

This test became flaky when postgres retry handling was fixed to use
backoff delays -- each iteration in this test's loop was taking much
longer because pgbench doesn't fail until postgres has given up on its
retries to the pageserver.

We are just removing it, because the condition it tests is no longer
risky: we reload all metadata from remote storage on restart, so
crashing directly between making local changes and doing remote uploads
isn't interesting any more.

Closes: https://github.com/neondatabase/neon/issues/2856
Closes: https://github.com/neondatabase/neon/issues/5329
---
 pageserver/src/tenant/timeline.rs     |  4 ---
 test_runner/regress/test_gc_cutoff.py | 47 ---------------------------
 2 files changed, 51 deletions(-)
 delete mode 100644 test_runner/regress/test_gc_cutoff.py

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index e779f6f32e..0ffe0b6418 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4388,10 +4388,6 @@ impl Timeline {
 
             guard.finish_gc_timeline(&gc_layers);
 
-            if result.layers_removed != 0 {
-                fail_point!("after-timeline-gc-removed-layers");
-            }
-
             #[cfg(feature = "testing")]
             {
                 result.doomed_layers = gc_layers;
diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py
deleted file mode 100644
index 284a8c3563..0000000000
--- a/test_runner/regress/test_gc_cutoff.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import subprocess
-
-import pytest
-from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
-
-
-# Test gc_cutoff
-#
-# This test sets fail point at the end of GC, and checks that pageserver
-# normally restarts after it. Also, there should be GC ERRORs in the log,
-# but the fixture checks the log for any unexpected ERRORs after every
-# test anyway, so it doesn't need any special attention here.
-@pytest.mark.timeout(600)
-def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
-    env = neon_env_builder.init_start(
-        initial_tenant_conf={
-            "gc_period": "10 s",
-            "gc_horizon": f"{1024 ** 2}",
-            "checkpoint_distance": f"{1024 ** 2}",
-            "compaction_period": "5 s",
-            # set PITR interval to be small, so we can do GC
-            "pitr_interval": "1 s",
-            "compaction_threshold": "3",
-            "image_creation_threshold": "2",
-        }
-    )
-
-    pageserver_http = env.pageserver.http_client()
-
-    # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
-    tenant_id = env.initial_tenant
-    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
-    connstr = endpoint.connstr(options="-csynchronous_commit=off")
-    pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
-
-    pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
-
-    # Because this test does a rapid series of restarts of the same node, it's possible that
-    # we are restarted again before we can clean up deletion lists form the previous generation,
-    # resulting in a subsequent startup logging a warning.
-    env.pageserver.allowed_errors.append(".*Dropping stale deletions for tenant.*")
-
-    for _ in range(5):
-        with pytest.raises(subprocess.SubprocessError):
-            pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr])
-        env.pageserver.stop()
-        env.pageserver.start(extra_env_vars={"FAILPOINTS": "after-timeline-gc-removed-layers=exit"})