tests: remove test_gc_cutoff (#6587)

This test became flaky when postgres retry handling was fixed to use
backoff delays -- each iteration in this test's loop was taking much
longer because pgbench doesn't fail until postgres has given up on
retrying to the pageserver.

We are just removing it, because the condition it tests is no longer
risky: we reload all metadata from remote storage on restart, so
crashing directly between making local changes and doing remote uploads
isn't interesting any more.

Closes:  https://github.com/neondatabase/neon/issues/2856
Closes: https://github.com/neondatabase/neon/issues/5329
This commit is contained in:
John Spray
2024-02-02 18:20:18 +00:00
committed by GitHub
parent caf868e274
commit 2e5eab69c6
2 changed files with 0 additions and 51 deletions

View File

@@ -4388,10 +4388,6 @@ impl Timeline {
guard.finish_gc_timeline(&gc_layers);
if result.layers_removed != 0 {
fail_point!("after-timeline-gc-removed-layers");
}
#[cfg(feature = "testing")]
{
result.doomed_layers = gc_layers;

View File

@@ -1,47 +0,0 @@
import subprocess
import pytest
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
# Test gc_cutoff
#
# This test sets fail point at the end of GC, and checks that pageserver
# normally restarts after it. Also, there should be GC ERRORs in the log,
# but the fixture checks the log for any unexpected ERRORs after every
# test anyway, so it doesn't need any special attention here.
@pytest.mark.timeout(600)
def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
env = neon_env_builder.init_start(
initial_tenant_conf={
"gc_period": "10 s",
"gc_horizon": f"{1024 ** 2}",
"checkpoint_distance": f"{1024 ** 2}",
"compaction_period": "5 s",
# set PITR interval to be small, so we can do GC
"pitr_interval": "1 s",
"compaction_threshold": "3",
"image_creation_threshold": "2",
}
)
pageserver_http = env.pageserver.http_client()
# Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
tenant_id = env.initial_tenant
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
connstr = endpoint.connstr(options="-csynchronous_commit=off")
pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
# Because this test does a rapid series of restarts of the same node, it's possible that
# we are restarted again before we can clean up deletion lists form the previous generation,
# resulting in a subsequent startup logging a warning.
env.pageserver.allowed_errors.append(".*Dropping stale deletions for tenant.*")
for _ in range(5):
with pytest.raises(subprocess.SubprocessError):
pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr])
env.pageserver.stop()
env.pageserver.start(extra_env_vars={"FAILPOINTS": "after-timeline-gc-removed-layers=exit"})