diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 50f642deaf..5e53051727 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -7270,7 +7270,7 @@ impl Service { } // Eventual consistency: if an earlier reconcile job failed, and the shard is still - // dirty, spawn another rone + // dirty, spawn another one if self .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal) .is_some() diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 70af299de3..03cd133ccb 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -75,7 +75,7 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)] # Let shards finish rescheduling to other pageservers: this makes the rest of the test more stable - # is it won't overlap with migrations + # as it won't overlap with migrations env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) output_path = neon_env_builder.test_output_dir / "snapshot" @@ -87,6 +87,13 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: workload.stop() + # Disable scheduling, so the storage controller doesn't migrate shards around + # while we are stopping pageservers + env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Stop"}) + env.storage_controller.allowed_errors.extend( + [".*Scheduling is disabled by policy Stop.*", ".*Skipping reconcile for policy Stop.*"] + ) + # Stop pageservers for pageserver in env.pageservers: pageserver.stop() @@ -127,9 +134,16 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: for pageserver in env.pageservers: pageserver.start() + # Turn scheduling back on. + # We don't care about optimizations, so enable only essential scheduling + env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Essential"}) + # Check we can read everything workload.validate() + # Reconcile to avoid a race between test shutdown and background reconciliation (#11278) + env.storage_controller.reconcile_until_idle() + def drop_local_state(env: NeonEnv, tenant_id: TenantId): env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"})