From fc4ea3553ebed97a11503e970f77196f7d39a4ca Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 21 Oct 2022 02:39:55 +0300 Subject: [PATCH] test_gc_cutoff.py fixes (#2655) * Fix bogus early exit from GC. Commit 91411c415a added this failpoint, but the early exit was not intentional. * Cleanup test_gc_cutoff.py test. - Remove the 'scale' parameter, this isn't a benchmark - Tweak pgbench and pageserver options to create garbage faster than the GC can collect away. The test used to take just under 5 minutes, which was uncomfortably close to the default 5 minute test timeout, and annoyingly slow even without the hard limit. These changes bring it down to about 1-2 minutes. - Improve comments, fix typos - Rename the failpoint. The old name, 'gc-before-save-metadata' implied that the failpoint was before the metadata update, but it was in fact much later in the function. - Move the call to persist the metadata outside the lock, to avoid holding it for too long. To verify that this test still covers the original bug, https://github.com/neondatabase/neon/issues/2539, I commented out updating the metadata file like this: ``` diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1e857a9a..f8a9f34a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1962,7 +1962,7 @@ impl Timeline { } // Persist the new GC cutoff value in the metadata file, before // we actually remove anything. - self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; + //self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; info!("GC starting"); ``` It doesn't fail every time with that, but it did fail after about 5 runs. 
--- pageserver/src/tenant/timeline.rs | 15 ++++++--------- test_runner/regress/test_gc_cutoff.py | 27 ++++++++++++++------------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8f325d31ec..a0ac0adea2 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1984,10 +1984,10 @@ impl Timeline { new_gc_cutoff ); write_guard.store_and_unlock(new_gc_cutoff).wait(); - - // Persist metadata file - self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; } + // Persist the new GC cutoff value in the metadata file, before + // we actually remove anything. + self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?; info!("GC starting"); @@ -2114,15 +2114,12 @@ impl Timeline { } info!( - "GC completed removing {} layers, cuttof {}", + "GC completed removing {} layers, cutoff {}", result.layers_removed, new_gc_cutoff ); + if result.layers_removed != 0 { - fail_point!("gc-before-save-metadata", |_| { - info!("Abnormaly terinate pageserver at gc-before-save-metadata fail point"); - std::process::abort(); - }); - return Ok(result); + fail_point!("after-timeline-gc-removed-layers"); } if self.upload_layers.load(atomic::Ordering::Relaxed) { diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py index 946c689a30..22b77d2cf1 100644 --- a/test_runner/regress/test_gc_cutoff.py +++ b/test_runner/regress/test_gc_cutoff.py @@ -1,14 +1,13 @@ -import pytest from fixtures.neon_fixtures import NeonEnvBuilder, PgBin -from performance.test_perf_pgbench import get_scales_matrix -# Test gc_cuttoff +# Test gc_cutoff # -# This test set fail point after at the end of GC and checks -# that pageserver normally restarts after it -@pytest.mark.parametrize("scale", get_scales_matrix(10)) -def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, scale: int): +# This test sets fail point at the end 
of GC, and checks that pageserver +# normally restarts after it. Also, there should be GC ERRORs in the log, +# but the fixture checks the log for any unexpected ERRORs after every +# test anyway, so it doesn't need any special attention here. +def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -18,21 +17,23 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, scale: int): "gc_period": "10 s", "gc_horizon": f"{1024 ** 2}", "checkpoint_distance": f"{1024 ** 2}", - "compaction_target_size": f"{1024 ** 2}", + "compaction_period": "5 s", # set PITR interval to be small, so we can do GC "pitr_interval": "1 s", + "compaction_threshold": "3", + "image_creation_threshold": "2", } ) pg = env.postgres.create_start("main", tenant_id=tenant_id) - connstr = pg.connstr() - pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + connstr = pg.connstr(options="-csynchronous_commit=off") + pg_bin.run_capture(["pgbench", "-i", "-s10", connstr]) - pageserver_http.configure_failpoints(("gc-before-save-metadata", "return")) + pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit")) for i in range(5): try: - pg_bin.run_capture(["pgbench", "-T100", connstr]) + pg_bin.run_capture(["pgbench", "-N", "-c5", "-T100", "-Mprepared", connstr]) except Exception: env.pageserver.stop() env.pageserver.start() - pageserver_http.configure_failpoints(("gc-before-save-metadata", "return")) + pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))