From 8142edda0166318ad8f868584657905ddcfa17be Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 5 Jun 2023 15:43:52 +0300 Subject: [PATCH] test: Less flaky gc (#4416) Solves a flaky test error in the wild[^1] by: - Make the gc shutdown signal reading an `allowed_error` - Note the gc shutdown signal readings as being in `allowed_error`s - Allow passing tenant conf to init_start to avoid unnecessary tenants [^1]: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4399/5176432780/index.html#suites/b97efae3a617afb71cb8142f5afa5224/2cd76021ea011f93 --- pageserver/src/tenant.rs | 1 + pageserver/src/tenant/timeline.rs | 1 + test_runner/fixtures/neon_fixtures.py | 7 +++++-- test_runner/regress/test_ondemand_download.py | 8 ++------ .../regress/test_pageserver_restarts_under_workload.py | 6 ------ 5 files changed, 9 insertions(+), 14 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a895b57092..bcf4495ac2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1395,6 +1395,7 @@ impl Tenant { pitr: Duration, ctx: &RequestContext, ) -> anyhow::Result { + // there is a global allowed_error for this anyhow::ensure!( self.is_active(), "Cannot run GC iteration on inactive tenant" diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3db78401f6..fdaad58e16 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3749,6 +3749,7 @@ impl Timeline { // Is the timeline being deleted? 
let state = *self.state.borrow(); if state == TimelineState::Stopping { + // there's a global allowed_error for this anyhow::bail!("timeline is Stopping"); } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1007cb11b5..5017c8dcd3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -629,7 +629,7 @@ class NeonEnvBuilder: assert self.env is not None, "environment is not already initialized, call init() first" self.env.start() - def init_start(self) -> NeonEnv: + def init_start(self, initial_tenant_conf: Optional[Dict[str, str]] = None) -> NeonEnv: env = self.init_configs() self.start() @@ -638,7 +638,9 @@ class NeonEnvBuilder: log.info( f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline" ) - initial_tenant, initial_timeline = env.neon_cli.create_tenant(tenant_id=env.initial_tenant) + initial_tenant, initial_timeline = env.neon_cli.create_tenant( + tenant_id=env.initial_tenant, conf=initial_tenant_conf + ) env.initial_timeline = initial_timeline log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully") @@ -1613,6 +1615,7 @@ class NeonPageserver(PgProtocol): ".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed ".*wait for layer upload ops to complete.*", # .*Caused by:.*wait_completion aborted because upload queue was stopped ".*gc_loop.*Gc failed, retrying in.*timeline is Stopping", # When gc checks timeline state after acquiring layer_removal_cs + ".*gc_loop.*Gc failed, retrying in.*: Cannot run GC iteration on inactive tenant", # Tenant::gc precondition ".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs ".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete 
doesn't hold the tenant's timelines.lock() ".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 1414b4ed8e..c26ec76172 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -58,11 +58,8 @@ def test_ondemand_download_large_rel( ) ##### First start, insert secret data and upload it to the remote storage - env = neon_env_builder.init_start() - - # Override defaults, to create more layers - tenant, _ = env.neon_cli.create_tenant( - conf={ + env = neon_env_builder.init_start( + initial_tenant_conf={ # disable background GC "gc_period": "0s", "gc_horizon": f"{10 * 1024 ** 3}", # 10 GB @@ -75,7 +72,6 @@ def test_ondemand_download_large_rel( "compaction_period": "0s", } ) - env.initial_tenant = tenant endpoint = env.endpoints.create_start("main") diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py b/test_runner/regress/test_pageserver_restarts_under_workload.py index bc3f3f2be4..fc93dcffbb 100644 --- a/test_runner/regress/test_pageserver_restarts_under_workload.py +++ b/test_runner/regress/test_pageserver_restarts_under_workload.py @@ -17,12 +17,6 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB n_restarts = 10 scale = 10 - # the background task may complete the init task delay after finding an - # active tenant, but shutdown starts right before Tenant::gc_iteration - env.pageserver.allowed_errors.append( - r".*Gc failed, retrying in \S+: Cannot run GC iteration on inactive tenant" - ) - def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])