test: Less flaky gc (#4416)

Solves a flaky test error in the wild[^1] by:

- Make the gc shutdown signal reading an `allowed_error`
- Add code comments noting that the gc shutdown signal readings are covered by `allowed_error`s
- Allow passing tenant conf to init_start to avoid unnecessary tenants

[^1]:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4399/5176432780/index.html#suites/b97efae3a617afb71cb8142f5afa5224/2cd76021ea011f93
This commit is contained in:
Joonas Koivunen
2023-06-05 15:43:52 +03:00
committed by GitHub
parent b9871158ba
commit 8142edda01
5 changed files with 9 additions and 14 deletions

View File

@@ -1395,6 +1395,7 @@ impl Tenant {
pitr: Duration,
ctx: &RequestContext,
) -> anyhow::Result<GcResult> {
// there is a global allowed_error for this
anyhow::ensure!(
self.is_active(),
"Cannot run GC iteration on inactive tenant"

View File

@@ -3749,6 +3749,7 @@ impl Timeline {
// Is the timeline being deleted?
let state = *self.state.borrow();
if state == TimelineState::Stopping {
// there's a global allowed_error for this
anyhow::bail!("timeline is Stopping");
}

View File

@@ -629,7 +629,7 @@ class NeonEnvBuilder:
assert self.env is not None, "environment is not already initialized, call init() first"
self.env.start()
def init_start(self) -> NeonEnv:
def init_start(self, initial_tenant_conf: Optional[Dict[str, str]] = None) -> NeonEnv:
env = self.init_configs()
self.start()
@@ -638,7 +638,9 @@ class NeonEnvBuilder:
log.info(
f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
)
initial_tenant, initial_timeline = env.neon_cli.create_tenant(tenant_id=env.initial_tenant)
initial_tenant, initial_timeline = env.neon_cli.create_tenant(
tenant_id=env.initial_tenant, conf=initial_tenant_conf
)
env.initial_timeline = initial_timeline
log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully")
@@ -1613,6 +1615,7 @@ class NeonPageserver(PgProtocol):
".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed
".*wait for layer upload ops to complete.*", # .*Caused by:.*wait_completion aborted because upload queue was stopped
".*gc_loop.*Gc failed, retrying in.*timeline is Stopping", # When gc checks timeline state after acquiring layer_removal_cs
".*gc_loop.*Gc failed, retrying in.*: Cannot run GC iteration on inactive tenant", # Tenant::gc precondition
".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs
".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress

View File

@@ -58,11 +58,8 @@ def test_ondemand_download_large_rel(
)
##### First start, insert secret data and upload it to the remote storage
env = neon_env_builder.init_start()
# Override defaults, to create more layers
tenant, _ = env.neon_cli.create_tenant(
conf={
env = neon_env_builder.init_start(
initial_tenant_conf={
# disable background GC
"gc_period": "0s",
"gc_horizon": f"{10 * 1024 ** 3}", # 10 GB
@@ -75,7 +72,6 @@ def test_ondemand_download_large_rel(
"compaction_period": "0s",
}
)
env.initial_tenant = tenant
endpoint = env.endpoints.create_start("main")

View File

@@ -17,12 +17,6 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB
n_restarts = 10
scale = 10
# the background task may complete the init task delay after finding an
# active tenant, but shutdown starts right before Tenant::gc_iteration
env.pageserver.allowed_errors.append(
r".*Gc failed, retrying in \S+: Cannot run GC iteration on inactive tenant"
)
def run_pgbench(connstr: str):
log.info(f"Start a pgbench workload on pg {connstr}")
pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])