test_gc_cutoff.py fixes (#2655)

* Fix bogus early exit from GC. Commit 91411c415a added this failpoint, but the early exit was not intentional. * Cleanup test_gc_cutoff.py test. - Remove the 'scale' parameter, this isn't a benchmark - Tweak pgbench and pageserver options to create garbage faster that the the GC can collect away. The test used to take just under 5 minutes, which was uncomfortably close to the default 5 minute test timeout, and annoyingly even without the hard limit. These changes bring it down to about 1-2 minutes. - Improve comments, fix typos - Rename the failpoint. The old name, 'gc-before-save-metadata' implied that the failpoint was before the metadata update, but it was in fact much later in the function. - Move the call to persist the metadata outside the lock, to avoid holding it for too long. To verify that this test still covers the original bug, https://github.com/neondatabase/neon/issues/2539, I commenting out updating the metadata file like this: ``` diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1e857a9a..f8a9f34a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1962,7 +1962,7 @@ impl Timeline { } // Persist the new GC cutoff value in the metadata file, before // we actually remove anything. - self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; + //self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; info!("GC starting"); ``` It doesn't fail every time with that, but it did fail after about 5 runs.
2026-01-08 22:12:56 +00:00 · 2022-10-21 02:39:55 +03:00
parent cca1ace651
commit fc4ea3553e
2 changed files with 20 additions and 22 deletions
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1984,10 +1984,10 @@ impl Timeline {
                new_gc_cutoff
            );
            write_guard.store_and_unlock(new_gc_cutoff).wait();
-
-            // Persist metadata file
-            self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
        }
+        // Persist the new GC cutoff value in the metadata file, before
+        // we actually remove anything.
+        self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;

        info!("GC starting");

@@ -2114,15 +2114,12 @@ impl Timeline {
        }

        info!(
-            "GC completed removing {} layers, cuttof {}",
+            "GC completed removing {} layers, cutoff {}",
            result.layers_removed, new_gc_cutoff
        );
+
        if result.layers_removed != 0 {
-            fail_point!("gc-before-save-metadata", |_| {
-                info!("Abnormaly terinate pageserver at gc-before-save-metadata fail point");
-                std::process::abort();
-            });
-            return Ok(result);
+            fail_point!("after-timeline-gc-removed-layers");
        }

        if self.upload_layers.load(atomic::Ordering::Relaxed) {
--- a/test_runner/regress/test_gc_cutoff.py
+++ b/test_runner/regress/test_gc_cutoff.py
@@ -1,14 +1,13 @@
-import pytest
 from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
-from performance.test_perf_pgbench import get_scales_matrix


-# Test gc_cuttoff
+# Test gc_cutoff
 #
-# This test set fail point after at the end of GC and checks
-# that pageserver normally restarts after it
-@pytest.mark.parametrize("scale", get_scales_matrix(10))
-def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, scale: int):
+# This test sets fail point at the end of GC, and checks that pageserver
+# normally restarts after it. Also, there should be GC ERRORs in the log,
+# but the fixture checks the log for any unexpected ERRORs after every
+# test anyway, so it doesn't need any special attention here.
+def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

@@ -18,21 +17,23 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, scale: int):
            "gc_period": "10 s",
            "gc_horizon": f"{1024 ** 2}",
            "checkpoint_distance": f"{1024 ** 2}",
-            "compaction_target_size": f"{1024 ** 2}",
+            "compaction_period": "5 s",
            # set PITR interval to be small, so we can do GC
            "pitr_interval": "1 s",
+            "compaction_threshold": "3",
+            "image_creation_threshold": "2",
        }
    )
    pg = env.postgres.create_start("main", tenant_id=tenant_id)
-    connstr = pg.connstr()
-    pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
+    connstr = pg.connstr(options="-csynchronous_commit=off")
+    pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])

-    pageserver_http.configure_failpoints(("gc-before-save-metadata", "return"))
+    pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))

    for i in range(5):
        try:
-            pg_bin.run_capture(["pgbench", "-T100", connstr])
+            pg_bin.run_capture(["pgbench", "-N", "-c5", "-T100", "-Mprepared", connstr])
        except Exception:
            env.pageserver.stop()
            env.pageserver.start()
-            pageserver_http.configure_failpoints(("gc-before-save-metadata", "return"))
+            pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))