mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 22:12:56 +00:00
test: Less flaky gc (#4416)
Solves a flaky test error seen in the wild[^1] by:

- Making the gc shutdown signal reading an `allowed_error`
- Noting the gc shutdown signal readings as being in `allowed_error`s
- Allowing a tenant conf to be passed to `init_start` to avoid creating unnecessary tenants

[^1]: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4399/5176432780/index.html#suites/b97efae3a617afb71cb8142f5afa5224/2cd76021ea011f93
This commit is contained in:
@@ -1395,6 +1395,7 @@ impl Tenant {
|
||||
pitr: Duration,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<GcResult> {
|
||||
// there is a global allowed_error for this
|
||||
anyhow::ensure!(
|
||||
self.is_active(),
|
||||
"Cannot run GC iteration on inactive tenant"
|
||||
|
||||
@@ -3749,6 +3749,7 @@ impl Timeline {
|
||||
// Is the timeline being deleted?
|
||||
let state = *self.state.borrow();
|
||||
if state == TimelineState::Stopping {
|
||||
// there's a global allowed_error for this
|
||||
anyhow::bail!("timeline is Stopping");
|
||||
}
|
||||
|
||||
|
||||
@@ -629,7 +629,7 @@ class NeonEnvBuilder:
|
||||
assert self.env is not None, "environment is not already initialized, call init() first"
|
||||
self.env.start()
|
||||
|
||||
def init_start(self) -> NeonEnv:
|
||||
def init_start(self, initial_tenant_conf: Optional[Dict[str, str]] = None) -> NeonEnv:
|
||||
env = self.init_configs()
|
||||
self.start()
|
||||
|
||||
@@ -638,7 +638,9 @@ class NeonEnvBuilder:
|
||||
log.info(
|
||||
f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
|
||||
)
|
||||
initial_tenant, initial_timeline = env.neon_cli.create_tenant(tenant_id=env.initial_tenant)
|
||||
initial_tenant, initial_timeline = env.neon_cli.create_tenant(
|
||||
tenant_id=env.initial_tenant, conf=initial_tenant_conf
|
||||
)
|
||||
env.initial_timeline = initial_timeline
|
||||
log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully")
|
||||
|
||||
@@ -1613,6 +1615,7 @@ class NeonPageserver(PgProtocol):
|
||||
".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed
|
||||
".*wait for layer upload ops to complete.*", # .*Caused by:.*wait_completion aborted because upload queue was stopped
|
||||
".*gc_loop.*Gc failed, retrying in.*timeline is Stopping", # When gc checks timeline state after acquiring layer_removal_cs
|
||||
".*gc_loop.*Gc failed, retrying in.*: Cannot run GC iteration on inactive tenant", # Tenant::gc precondition
|
||||
".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs
|
||||
".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
|
||||
".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress
|
||||
|
||||
@@ -58,11 +58,8 @@ def test_ondemand_download_large_rel(
|
||||
)
|
||||
|
||||
##### First start, insert secret data and upload it to the remote storage
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# Override defaults, to create more layers
|
||||
tenant, _ = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_conf={
|
||||
# disable background GC
|
||||
"gc_period": "0s",
|
||||
"gc_horizon": f"{10 * 1024 ** 3}", # 10 GB
|
||||
@@ -75,7 +72,6 @@ def test_ondemand_download_large_rel(
|
||||
"compaction_period": "0s",
|
||||
}
|
||||
)
|
||||
env.initial_tenant = tenant
|
||||
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
|
||||
|
||||
@@ -17,12 +17,6 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB
|
||||
n_restarts = 10
|
||||
scale = 10
|
||||
|
||||
# the background task may complete the init task delay after finding an
|
||||
# active tenant, but shutdown starts right before Tenant::gc_iteration
|
||||
env.pageserver.allowed_errors.append(
|
||||
r".*Gc failed, retrying in \S+: Cannot run GC iteration on inactive tenant"
|
||||
)
|
||||
|
||||
def run_pgbench(connstr: str):
|
||||
log.info(f"Start a pgbench workload on pg {connstr}")
|
||||
pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
|
||||
|
||||
Reference in New Issue
Block a user