mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-05 12:32:54 +00:00
fix(tests): improve test_scrubber_tenant_snapshot stability (#11471)
## Problem

`test_scrubber_tenant_snapshot` is flaky with `request was dropped` errors. More details are in the issue.

- Closes: https://github.com/neondatabase/neon/issues/11278

## Summary of changes

- Disable shard scheduling while the pageservers are restarted
- Add `reconcile_until_idle` at the end of the test
This commit is contained in:
@@ -7270,7 +7270,7 @@ impl Service {
|
||||
}
|
||||
|
||||
// Eventual consistency: if an earlier reconcile job failed, and the shard is still
|
||||
// dirty, spawn another rone
|
||||
// dirty, spawn another one
|
||||
if self
|
||||
.maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal)
|
||||
.is_some()
|
||||
|
||||
@@ -75,7 +75,7 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count:
|
||||
tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)]
|
||||
|
||||
# Let shards finish rescheduling to other pageservers: this makes the rest of the test more stable
|
||||
# is it won't overlap with migrations
|
||||
# as it won't overlap with migrations
|
||||
env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120)
|
||||
|
||||
output_path = neon_env_builder.test_output_dir / "snapshot"
|
||||
@@ -87,6 +87,13 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count:
|
||||
|
||||
workload.stop()
|
||||
|
||||
# Disable scheduling, so the storage controller doesn't migrate shards around
|
||||
# while we are stopping pageservers
|
||||
env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Stop"})
|
||||
env.storage_controller.allowed_errors.extend(
|
||||
[".*Scheduling is disabled by policy Stop.*", ".*Skipping reconcile for policy Stop.*"]
|
||||
)
|
||||
|
||||
# Stop pageservers
|
||||
for pageserver in env.pageservers:
|
||||
pageserver.stop()
|
||||
@@ -127,9 +134,16 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count:
|
||||
for pageserver in env.pageservers:
|
||||
pageserver.start()
|
||||
|
||||
# Turn scheduling back on.
|
||||
# We don't care about optimizations, so enable only essential scheduling
|
||||
env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Essential"})
|
||||
|
||||
# Check we can read everything
|
||||
workload.validate()
|
||||
|
||||
# Reconcile to avoid a race between test shutdown and background reconciliation (#11278)
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
|
||||
|
||||
def drop_local_state(env: NeonEnv, tenant_id: TenantId):
|
||||
env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"})
|
||||
|
||||
Reference in New Issue
Block a user