fix(tests): improve test_scrubber_tenant_snapshot stability (#11471)

## Problem
`test_scrubber_tenant_snapshot` is flaky with `request was dropped`
errors. More details are in the issue.
- Closes: https://github.com/neondatabase/neon/issues/11278

## Summary of changes
- Disable shard scheduling while the pageservers are restarted
- Add `reconcile_until_idle` at the end of the test
This commit is contained in:
Dmitrii Kovalkov
2025-04-08 14:03:38 +04:00
committed by GitHub
parent 8a6d0dccaa
commit 7791a49dd4
2 changed files with 16 additions and 2 deletions

View File

@@ -7270,7 +7270,7 @@ impl Service {
}
// Eventual consistency: if an earlier reconcile job failed, and the shard is still
// dirty, spawn another rone
// dirty, spawn another one
if self
.maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal)
.is_some()

View File

@@ -75,7 +75,7 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count:
tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)]
# Let shards finish rescheduling to other pageservers: this makes the rest of the test more stable
# is it won't overlap with migrations
# as it won't overlap with migrations
env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120)
output_path = neon_env_builder.test_output_dir / "snapshot"
@@ -87,6 +87,13 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count:
workload.stop()
# Disable scheduling, so the storage controller doesn't migrate shards around
# while we are stopping pageservers
env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Stop"})
env.storage_controller.allowed_errors.extend(
[".*Scheduling is disabled by policy Stop.*", ".*Skipping reconcile for policy Stop.*"]
)
# Stop pageservers
for pageserver in env.pageservers:
pageserver.stop()
@@ -127,9 +134,16 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count:
for pageserver in env.pageservers:
pageserver.start()
# Turn scheduling back on.
# We don't care about optimizations, so enable only essential scheduling
env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Essential"})
# Check we can read everything
workload.validate()
# Reconcile to avoid a race between test shutdown and background reconciliation (#11278)
env.storage_controller.reconcile_until_idle()
def drop_local_state(env: NeonEnv, tenant_id: TenantId):
env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"})