test_gc_cutoff.py fixes (#2655)
* Fix bogus early exit from GC.
Commit 91411c415a added this failpoint, but the early exit was not
intentional; see the sketch after this list.
* Clean up the test_gc_cutoff.py test.
- Remove the 'scale' parameter; this isn't a benchmark.
- Tweak pgbench and pageserver options to create garbage faster than the
GC can collect it away. The test used to take just under 5 minutes,
which was uncomfortably close to the default 5 minute test timeout, and
annoyingly long even without the hard limit. These changes bring it down
to about 1-2 minutes.
- Improve comments, fix typos.
- Rename the failpoint. The old name, 'gc-before-save-metadata', implied
that the failpoint was before the metadata update, but it was in fact
much later in the function.
- Move the call to persist the metadata outside the lock, to avoid
holding it for too long.
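
For the first point above (the bogus early exit), here is a minimal, self-contained sketch of the two `fail_point!` forms involved. It is not the Neon code: it assumes only the `fail` crate that provides the macro, and the function names, the `String` error type, and `upload_remaining_layers` are hypothetical stand-ins.

```rust
use fail::fail_point;

// Hypothetical stand-in for the GC work that still follows layer removal.
fn upload_remaining_layers() {}

// Old shape: the closure only runs when the failpoint is activated (e.g. via
// fail::cfg("gc-before-save-metadata", "return")), but the `return` written
// after the macro is unconditional, so in the real function everything past
// this point was always skipped -- the bogus early exit.
#[allow(clippy::needless_return)]
fn gc_iteration_old(layers_removed: usize) -> Result<usize, String> {
    fail_point!("gc-before-save-metadata", |_| {
        // In the pageserver this aborted the process to simulate a crash.
        std::process::abort()
    });
    return Ok(layers_removed);
}

// New shape: a plain, renamed failpoint placed after the layers have been
// removed. It is a no-op unless a test activates it (the test uses the "exit"
// action to kill the pageserver here), and there is no early return.
fn gc_iteration_new(layers_removed: usize) -> Result<usize, String> {
    fail_point!("after-timeline-gc-removed-layers");
    upload_remaining_layers();
    Ok(layers_removed)
}

fn main() {
    // With no failpoints configured, both variants return Ok, but only the
    // new one reaches the remaining GC work.
    assert_eq!(gc_iteration_old(3), Ok(3));
    assert_eq!(gc_iteration_new(3), Ok(3));
}
```

With the plain form, the process-killing behaviour comes entirely from the test side, which configures the failpoint over the pageserver HTTP API (see the test diff below).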
To verify that this test still covers the original bug
(https://github.com/neondatabase/neon/issues/2539), I commented out the
metadata file update like this:
```
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 1e857a9a..f8a9f34a 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1962,7 +1962,7 @@ impl Timeline {
         }
         // Persist the new GC cutoff value in the metadata file, before
         // we actually remove anything.
-        self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
+        //self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
         info!("GC starting");
```
It doesn't fail every time with that, but it did fail after about 5
runs.
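
The ordering that the update_metadata_file() call protects is: publish the new cutoff in memory under the lock, release the lock, persist the cutoff to the metadata file, and only then delete layer files. A simplified sketch of that shape, using plain std types rather than Neon's actual Timeline/Rcu/Lsn types (the struct and method names below are hypothetical):

```rust
use std::sync::RwLock;

// Simplified stand-ins: Neon's Timeline uses an Rcu over an Lsn and
// update_metadata_file(); the names below are hypothetical.
struct TimelineSketch {
    latest_gc_cutoff: RwLock<u64>,
}

impl TimelineSketch {
    fn persist_metadata(&self, _cutoff: u64) -> std::io::Result<()> {
        // Stand-in for writing the new cutoff into the metadata file on disk.
        Ok(())
    }

    fn remove_layers_below(&self, _cutoff: u64) -> usize {
        // Stand-in for deleting layer files that fall below the cutoff.
        0
    }

    fn gc(&self, new_gc_cutoff: u64) -> std::io::Result<usize> {
        {
            // Hold the lock only long enough to publish the new cutoff in
            // memory ...
            let mut guard = self.latest_gc_cutoff.write().unwrap();
            *guard = new_gc_cutoff;
        } // ... and release it before doing any file I/O.

        // Persist the new cutoff *before* removing anything. If the process
        // crashes between these two steps (which is what the failpoint
        // simulates), some garbage merely survives until the next GC run,
        // and the persisted cutoff is never older than what has actually
        // been deleted, so after a restart the pageserver does not promise
        // reads it can no longer serve.
        self.persist_metadata(new_gc_cutoff)?;

        Ok(self.remove_layers_below(new_gc_cutoff))
    }
}

fn main() {
    let tl = TimelineSketch {
        latest_gc_cutoff: RwLock::new(0),
    };
    let removed = tl.gc(42).unwrap();
    println!("removed {removed} layers at cutoff 42");
}
```

With the metadata write commented out, as in the patch above, a crash injected right after layer removal can leave the persisted state behind the deletions, which is the kind of inconsistency the test's crash-and-restart loop is meant to catch.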
Committed by GitHub. Parent: cca1ace651, commit: fc4ea3553e.
pageserver/src/tenant/timeline.rs:
```
@@ -1984,10 +1984,10 @@ impl Timeline {
                 new_gc_cutoff
             );
             write_guard.store_and_unlock(new_gc_cutoff).wait();
-
-            // Persist metadata file
-            self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;
         }
+        // Persist the new GC cutoff value in the metadata file, before
+        // we actually remove anything.
+        self.update_metadata_file(self.disk_consistent_lsn.load(), HashMap::new())?;

         info!("GC starting");

@@ -2114,15 +2114,12 @@ impl Timeline {
         }

         info!(
-            "GC completed removing {} layers, cuttof {}",
+            "GC completed removing {} layers, cutoff {}",
             result.layers_removed, new_gc_cutoff
         );

         if result.layers_removed != 0 {
-            fail_point!("gc-before-save-metadata", |_| {
-                info!("Abnormaly terinate pageserver at gc-before-save-metadata fail point");
-                std::process::abort();
-            });
-            return Ok(result);
+            fail_point!("after-timeline-gc-removed-layers");
         }

         if self.upload_layers.load(atomic::Ordering::Relaxed) {
```
test_gc_cutoff.py:
```
@@ -1,14 +1,13 @@
-import pytest
 from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
-from performance.test_perf_pgbench import get_scales_matrix


-# Test gc_cuttoff
+# Test gc_cutoff
 #
-# This test set fail point after at the end of GC and checks
-# that pageserver normally restarts after it
-@pytest.mark.parametrize("scale", get_scales_matrix(10))
-def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, scale: int):
+# This test sets fail point at the end of GC, and checks that pageserver
+# normally restarts after it. Also, there should be GC ERRORs in the log,
+# but the fixture checks the log for any unexpected ERRORs after every
+# test anyway, so it doesn't need any special attention here.
+def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()

@@ -18,21 +17,23 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, scale: int):
             "gc_period": "10 s",
             "gc_horizon": f"{1024 ** 2}",
             "checkpoint_distance": f"{1024 ** 2}",
             "compaction_target_size": f"{1024 ** 2}",
             "compaction_period": "5 s",
             # set PITR interval to be small, so we can do GC
             "pitr_interval": "1 s",
+            "compaction_threshold": "3",
+            "image_creation_threshold": "2",
         }
     )
     pg = env.postgres.create_start("main", tenant_id=tenant_id)
-    connstr = pg.connstr()
-    pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
+    connstr = pg.connstr(options="-csynchronous_commit=off")
+    pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])

-    pageserver_http.configure_failpoints(("gc-before-save-metadata", "return"))
+    pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))

     for i in range(5):
         try:
-            pg_bin.run_capture(["pgbench", "-T100", connstr])
+            pg_bin.run_capture(["pgbench", "-N", "-c5", "-T100", "-Mprepared", connstr])
         except Exception:
             env.pageserver.stop()
             env.pageserver.start()
-            pageserver_http.configure_failpoints(("gc-before-save-metadata", "return"))
+            pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
```