test_gc_cutoff.py fixes (#2655)

* Fix bogus early exit from GC.

Commit 91411c415a added this failpoint, but the early exit was not
intentional.

* Cleanup test_gc_cutoff.py test.

- Remove the 'scale' parameter, this isn't a benchmark
- Tweak pgbench and pageserver options to create garbage faster that the
  the GC can collect away. The test used to take just under 5 minutes,
  which was uncomfortably close to the default 5 minute test timeout, and
  annoyingly even without the hard limit. These changes bring it down to
  about 1-2 minutes.
- Improve comments, fix typos
- Rename the failpoint. The old name, 'gc-before-save-metadata' implied
  that the failpoint was before the metadata update, but it was in fact
  much later in the function.
- Move the call to persist the metadata outside the lock, to avoid
  holding it for too long.

To verify that this test still covers the original bug,
https://github.com/neondatabase/neon/issues/2539, I commenting out
updating the metadata file like this:
```
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 1e857a9a..f8a9f34a 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1962,7 +1962,7 @@ impl Timeline {
         }
         // Persist the new GC cutoff value in the metadata file, before
         // we actually remove anything.
-        self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
+        //self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;

         info!("GC starting");

```
It doesn't fail every time with that, but it did fail after about 5
runs.
This commit is contained in:
Heikki Linnakangas
2022-10-21 02:39:55 +03:00
committed by GitHub
parent cca1ace651
commit fc4ea3553e
2 changed files with 20 additions and 22 deletions

View File

@@ -1984,10 +1984,10 @@ impl Timeline {
new_gc_cutoff
);
write_guard.store_and_unlock(new_gc_cutoff).wait();
// Persist metadata file
self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
}
// Persist the new GC cutoff value in the metadata file, before
// we actually remove anything.
self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
info!("GC starting");
@@ -2114,15 +2114,12 @@ impl Timeline {
}
info!(
"GC completed removing {} layers, cuttof {}",
"GC completed removing {} layers, cutoff {}",
result.layers_removed, new_gc_cutoff
);
if result.layers_removed != 0 {
fail_point!("gc-before-save-metadata", |_| {
info!("Abnormaly terinate pageserver at gc-before-save-metadata fail point");
std::process::abort();
});
return Ok(result);
fail_point!("after-timeline-gc-removed-layers");
}
if self.upload_layers.load(atomic::Ordering::Relaxed) {

View File

@@ -1,14 +1,13 @@
import pytest
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
from performance.test_perf_pgbench import get_scales_matrix
# Test gc_cuttoff
# Test gc_cutoff
#
# This test set fail point after at the end of GC and checks
# that pageserver normally restarts after it
@pytest.mark.parametrize("scale", get_scales_matrix(10))
def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, scale: int):
# This test sets fail point at the end of GC, and checks that pageserver
# normally restarts after it. Also, there should be GC ERRORs in the log,
# but the fixture checks the log for any unexpected ERRORs after every
# test anyway, so it doesn't need any special attention here.
def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
@@ -18,21 +17,23 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, scale: int):
"gc_period": "10 s",
"gc_horizon": f"{1024 ** 2}",
"checkpoint_distance": f"{1024 ** 2}",
"compaction_target_size": f"{1024 ** 2}",
"compaction_period": "5 s",
# set PITR interval to be small, so we can do GC
"pitr_interval": "1 s",
"compaction_threshold": "3",
"image_creation_threshold": "2",
}
)
pg = env.postgres.create_start("main", tenant_id=tenant_id)
connstr = pg.connstr()
pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
connstr = pg.connstr(options="-csynchronous_commit=off")
pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
pageserver_http.configure_failpoints(("gc-before-save-metadata", "return"))
pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
for i in range(5):
try:
pg_bin.run_capture(["pgbench", "-T100", connstr])
pg_bin.run_capture(["pgbench", "-N", "-c5", "-T100", "-Mprepared", connstr])
except Exception:
env.pageserver.stop()
env.pageserver.start()
pageserver_http.configure_failpoints(("gc-before-save-metadata", "return"))
pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))