From fc4ea3553ebed97a11503e970f77196f7d39a4ca Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 21 Oct 2022 02:39:55 +0300 Subject: [PATCH] test_gc_cutoff.py fixes (#2655) * Fix bogus early exit from GC. Commit 91411c415a added this failpoint, but the early exit was not intentional. * Cleanup test_gc_cutoff.py test. - Remove the 'scale' parameter, this isn't a benchmark - Tweak pgbench and pageserver options to create garbage faster than the GC can collect away. The test used to take just under 5 minutes, which was uncomfortably close to the default 5 minute test timeout, and annoyingly slow even without the hard limit. These changes bring it down to about 1-2 minutes. - Improve comments, fix typos - Rename the failpoint. The old name, 'gc-before-save-metadata' implied that the failpoint was before the metadata update, but it was in fact much later in the function. - Move the call to persist the metadata outside the lock, to avoid holding it for too long. To verify that this test still covers the original bug, https://github.com/neondatabase/neon/issues/2539, I commented out updating the metadata file like this: ``` diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1e857a9a..f8a9f34a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1962,7 +1962,7 @@ impl Timeline { } // Persist the new GC cutoff value in the metadata file, before // we actually remove anything. - self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; + //self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; info!("GC starting"); ``` It doesn't fail every time with that, but it did fail after about 5 runs. 
--- pageserver/src/tenant/timeline.rs | 15 ++++++--------- test_runner/regress/test_gc_cutoff.py | 27 ++++++++++++++------------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8f325d31ec..a0ac0adea2 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1984,10 +1984,10 @@ impl Timeline { new_gc_cutoff ); write_guard.store_and_unlock(new_gc_cutoff).wait(); - - // Persist metadata file - self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; } + // Persist the new GC cutoff value in the metadata file, before + // we actually remove anything. + self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; info!("GC starting"); @@ -2114,15 +2114,12 @@ impl Timeline { } info!( - "GC completed removing {} layers, cuttof {}", + "GC completed removing {} layers, cutoff {}", result.layers_removed, new_gc_cutoff ); + if result.layers_removed != 0 { - fail_point!("gc-before-save-metadata", |_| { - info!("Abnormaly terinate pageserver at gc-before-save-metadata fail point"); - std::process::abort(); - }); - return Ok(result); + fail_point!("after-timeline-gc-removed-layers"); } if self.upload_layers.load(atomic::Ordering::Relaxed) { diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py index 946c689a30..22b77d2cf1 100644 --- a/test_runner/regress/test_gc_cutoff.py +++ b/test_runner/regress/test_gc_cutoff.py @@ -1,14 +1,13 @@ -import pytest from fixtures.neon_fixtures import NeonEnvBuilder, PgBin -from performance.test_perf_pgbench import get_scales_matrix -# Test gc_cuttoff +# Test gc_cutoff # -# This test set fail point after at the end of GC and checks -# that pageserver normally restarts after it -@pytest.mark.parametrize("scale", get_scales_matrix(10)) -def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, scale: int): +# This test sets fail point at the end 
of GC, and checks that pageserver +# normally restarts after it. Also, there should be GC ERRORs in the log, +# but the fixture checks the log for any unexpected ERRORs after every +# test anyway, so it doesn't need any special attention here. +def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -18,21 +17,23 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, scale: int): "gc_period": "10 s", "gc_horizon": f"{1024 ** 2}", "checkpoint_distance": f"{1024 ** 2}", - "compaction_target_size": f"{1024 ** 2}", + "compaction_period": "5 s", # set PITR interval to be small, so we can do GC "pitr_interval": "1 s", + "compaction_threshold": "3", + "image_creation_threshold": "2", } ) pg = env.postgres.create_start("main", tenant_id=tenant_id) - connstr = pg.connstr() - pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + connstr = pg.connstr(options="-csynchronous_commit=off") + pg_bin.run_capture(["pgbench", "-i", "-s10", connstr]) - pageserver_http.configure_failpoints(("gc-before-save-metadata", "return")) + pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit")) for i in range(5): try: - pg_bin.run_capture(["pgbench", "-T100", connstr]) + pg_bin.run_capture(["pgbench", "-N", "-c5", "-T100", "-Mprepared", connstr]) except Exception: env.pageserver.stop() env.pageserver.start() - pageserver_http.configure_failpoints(("gc-before-save-metadata", "return")) + pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))