Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-04 12:02:55 +00:00
storcon: Ignore stuck reconciles when considering optimizations (#12589)
## Problem

The `keep_failing_reconciles` counter was introduced in #12391, but there is a special case:

> if a reconciliation loop claims to have succeeded, but maybe_reconcile still thinks the tenant is in need of reconciliation, then that's a probable bug and we should activate a similar backoff to prevent flapping.

This PR redefines "flapping" to include not just repeated failures, but consecutive reconciliations of any kind, success or failure.

## Summary of Changes

- Replace `keep_failing_reconciles` with a new `stuck_reconciles` metric
- Replace `MAX_CONSECUTIVE_RECONCILIATION_ERRORS` with `MAX_CONSECUTIVE_RECONCILES`, raising the threshold from 5 to 10
- Increment the consecutive-reconciles counter for all reconciles, not just failures
- Reset the counter in `reconcile_all` when no reconcile is needed for a shard
- Improve and fix the related test

---------

Co-authored-by: Aleksandr Sarantsev <aleksandr.sarantsev@databricks.com>
commit f0c0733a64 (parent 8862e7c4bf), committed by GitHub
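As context for the diff below, here is a minimal, self-contained sketch of the policy this commit adopts. Only `MAX_CONSECUTIVE_RECONCILES` and the counter name come from the PR; `ShardState` and `observe_pass` are hypothetical stand-ins, not the controller's actual types.

```rust
// Illustrative sketch of the stuck-shard policy introduced by this PR.
// `MAX_CONSECUTIVE_RECONCILES` and `consecutive_reconciles_count` mirror the
// real names; `ShardState`/`observe_pass` are invented for this example.
const MAX_CONSECUTIVE_RECONCILES: usize = 10;

#[derive(Default)]
struct ShardState {
    // Consecutive reconcile_all passes that launched a reconcile for this shard.
    consecutive_reconciles_count: usize,
}

impl ShardState {
    // Called once per reconcile_all pass; returns true if the shard should be
    // treated as stuck (still reconciled, but ignored for optimization gating).
    fn observe_pass(&mut self, needs_reconcile: bool) -> bool {
        if needs_reconcile {
            // Count every launched reconcile, success or failure alike:
            // "flapping" now means repeated reconciliation of any kind.
            self.consecutive_reconciles_count =
                self.consecutive_reconciles_count.saturating_add(1);
            self.consecutive_reconciles_count >= MAX_CONSECUTIVE_RECONCILES
        } else {
            // A pass that finds the shard clean resets the streak.
            self.consecutive_reconciles_count = 0;
            false
        }
    }
}

fn main() {
    let mut shard = ShardState::default();
    // Nine consecutive dirty passes: not yet stuck.
    for _ in 0..9 {
        assert!(!shard.observe_pass(true));
    }
    // The tenth consecutive launch crosses the threshold.
    assert!(shard.observe_pass(true));
    // A single clean pass clears the streak.
    assert!(!shard.observe_pass(false));
    assert_eq!(shard.consecutive_reconciles_count, 0);
}
```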
```diff
@@ -76,8 +76,8 @@ pub(crate) struct StorageControllerMetricGroup {
     /// How many shards would like to reconcile but were blocked by concurrency limits
     pub(crate) storage_controller_pending_reconciles: measured::Gauge,
 
-    /// How many shards are keep-failing and will be ignored when considering to run optimizations
-    pub(crate) storage_controller_keep_failing_reconciles: measured::Gauge,
+    /// How many shards are stuck and will be ignored when considering to run optimizations
+    pub(crate) storage_controller_stuck_reconciles: measured::Gauge,
 
     /// HTTP request status counters for handled requests
     pub(crate) storage_controller_http_request_status:
```
```diff
@@ -232,9 +232,9 @@ pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
 pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256;
 pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32;
 
-// Number of consecutive reconciliation errors, occured for one shard,
+// Number of consecutive reconciliations that have occurred for one shard,
 // after which the shard is ignored when considering to run optimizations.
-const MAX_CONSECUTIVE_RECONCILIATION_ERRORS: usize = 5;
+const MAX_CONSECUTIVE_RECONCILES: usize = 10;
 
 // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately.
 // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly
```
```diff
@@ -735,31 +735,31 @@ struct TenantMutationLocations(BTreeMap<TenantShardId, ShardMutationLocations>);
 
 struct ReconcileAllResult {
     spawned_reconciles: usize,
-    keep_failing_reconciles: usize,
+    stuck_reconciles: usize,
    has_delayed_reconciles: bool,
 }
 
 impl ReconcileAllResult {
     fn new(
         spawned_reconciles: usize,
-        keep_failing_reconciles: usize,
+        stuck_reconciles: usize,
         has_delayed_reconciles: bool,
     ) -> Self {
         assert!(
-            spawned_reconciles >= keep_failing_reconciles,
-            "It is impossible to have more keep-failing reconciles than spawned reconciles"
+            spawned_reconciles >= stuck_reconciles,
+            "It is impossible to have less spawned reconciles than stuck reconciles"
         );
         Self {
             spawned_reconciles,
-            keep_failing_reconciles,
+            stuck_reconciles,
             has_delayed_reconciles,
         }
     }
 
     /// We can run optimizations only if we don't have any delayed reconciles and
-    /// all spawned reconciles are also keep-failing reconciles.
+    /// all spawned reconciles are also stuck reconciles.
     fn can_run_optimizations(&self) -> bool {
-        !self.has_delayed_reconciles && self.spawned_reconciles == self.keep_failing_reconciles
+        !self.has_delayed_reconciles && self.spawned_reconciles == self.stuck_reconciles
     }
 }
```
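The gating predicate above is small enough to check exhaustively. A standalone sketch, copying only `can_run_optimizations` and the three fields it reads (the `case` helper is invented for this example, and the constructor assert is omitted):

```rust
// Standalone copy of the gating logic from ReconcileAllResult, for illustration.
struct ReconcileAllResult {
    spawned_reconciles: usize,
    stuck_reconciles: usize,
    has_delayed_reconciles: bool,
}

impl ReconcileAllResult {
    fn can_run_optimizations(&self) -> bool {
        // Optimizations run only when nothing is queued behind the concurrency
        // limit and every spawned reconcile belongs to an already-stuck shard.
        !self.has_delayed_reconciles && self.spawned_reconciles == self.stuck_reconciles
    }
}

fn main() {
    let case = |spawned, stuck, delayed| ReconcileAllResult {
        spawned_reconciles: spawned,
        stuck_reconciles: stuck,
        has_delayed_reconciles: delayed,
    };
    // Quiescent system: nothing spawned, so optimizations may run.
    assert!(case(0, 0, false).can_run_optimizations());
    // Healthy churn: some reconciles are not stuck, so hold off.
    assert!(!case(3, 1, false).can_run_optimizations());
    // Only stuck shards are churning: ignore them and optimize anyway.
    assert!(case(2, 2, false).can_run_optimizations());
    // Delayed reconciles always block optimizations.
    assert!(!case(0, 0, true).can_run_optimizations());
}
```

Before this PR the same shape held with `keep_failing_reconciles`; the rename widens what counts toward the right-hand side of the equality.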
```diff
@@ -1503,7 +1503,6 @@ impl Service {
 
         match result.result {
             Ok(()) => {
-                tenant.consecutive_errors_count = 0;
                 tenant.apply_observed_deltas(deltas);
                 tenant.waiter.advance(result.sequence);
             }
@@ -1522,8 +1521,6 @@
                     }
                 }
 
-                tenant.consecutive_errors_count = tenant.consecutive_errors_count.saturating_add(1);
-
                 // Ordering: populate last_error before advancing error_seq,
                 // so that waiters will see the correct error after waiting.
                 tenant.set_last_error(result.sequence, e);
@@ -1535,6 +1532,8 @@
             }
         }
 
+        tenant.consecutive_reconciles_count = tenant.consecutive_reconciles_count.saturating_add(1);
+
         // If we just finished detaching all shards for a tenant, it might be time to drop it from memory.
         if tenant.policy == PlacementPolicy::Detached {
             // We may only drop a tenant from memory while holding the exclusive lock on the tenant ID: this protects us
```
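Note the structural effect of the three hunks above: the per-outcome `consecutive_errors_count` bookkeeping disappears from both match arms, and a single increment lands after the `match`, so every completed reconcile counts. A simplified sketch of that shape, with hypothetical reduced types (the real handler also advances sequence waiters and applies observed deltas):

```rust
// Reduced sketch of the process_result shape after this change. The types are
// stand-ins: only consecutive_reconciles_count and its placement are the point.
#[derive(Debug)]
struct ReconcileError(String);

#[derive(Default)]
struct TenantState {
    consecutive_reconciles_count: usize,
    last_error: Option<ReconcileError>,
}

impl TenantState {
    fn process_result(&mut self, result: Result<(), ReconcileError>) {
        match result {
            Ok(()) => {
                // Success no longer resets a counter here; only a reconcile_all
                // pass that finds the shard clean does that.
            }
            Err(e) => {
                self.last_error = Some(e);
            }
        }
        // Runs for both outcomes: success and failure both extend the streak.
        self.consecutive_reconciles_count =
            self.consecutive_reconciles_count.saturating_add(1);
    }
}

fn main() {
    let mut tenant = TenantState::default();
    tenant.process_result(Ok(()));
    tenant.process_result(Err(ReconcileError("node offline".into())));
    // Both completions incremented the streak counter.
    assert_eq!(tenant.consecutive_reconciles_count, 2);
    assert!(tenant.last_error.is_some());
}
```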
```diff
@@ -8640,7 +8639,7 @@
         // This function is an efficient place to update lazy statistics, since we are walking
         // all tenants.
         let mut pending_reconciles = 0;
-        let mut keep_failing_reconciles = 0;
+        let mut stuck_reconciles = 0;
         let mut az_violations = 0;
 
         // If we find any tenants to drop from memory, stash them to offload after
```
```diff
@@ -8676,30 +8675,32 @@
 
             // Eventual consistency: if an earlier reconcile job failed, and the shard is still
             // dirty, spawn another one
-            let consecutive_errors_count = shard.consecutive_errors_count;
             if self
                 .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal)
                 .is_some()
             {
                 spawned_reconciles += 1;
 
-                // Count shards that are keep-failing. We still want to reconcile them
-                // to avoid a situation where a shard is stuck.
-                // But we don't want to consider them when deciding to run optimizations.
-                if consecutive_errors_count >= MAX_CONSECUTIVE_RECONCILIATION_ERRORS {
+                if shard.consecutive_reconciles_count >= MAX_CONSECUTIVE_RECONCILES {
+                    // Count shards that are stuck, but we still want to reconcile them.
+                    // We don't want to consider them when deciding to run optimizations.
                     tracing::warn!(
                         tenant_id=%shard.tenant_shard_id.tenant_id,
                         shard_id=%shard.tenant_shard_id.shard_slug(),
-                        "Shard reconciliation is keep-failing: {} errors",
-                        consecutive_errors_count
+                        "Shard reconciliation is stuck: {} consecutive launches",
+                        shard.consecutive_reconciles_count
                     );
-                    keep_failing_reconciles += 1;
+                    stuck_reconciles += 1;
                 }
-            } else if shard.delayed_reconcile {
-                // Shard wanted to reconcile but for some reason couldn't.
-                pending_reconciles += 1;
+            } else {
+                if shard.delayed_reconcile {
+                    // Shard wanted to reconcile but for some reason couldn't.
+                    pending_reconciles += 1;
+                }
+
+                // Reset the counter when we don't need to launch a reconcile.
+                shard.consecutive_reconciles_count = 0;
             }
 
             // If this tenant is detached, try dropping it from memory. This is usually done
             // proactively in [`Self::process_results`], but we do it here to handle the edge
             // case where a reconcile completes while someone else is holding an op lock for the tenant.
```
```diff
@@ -8735,14 +8736,10 @@
 
         metrics::METRICS_REGISTRY
             .metrics_group
-            .storage_controller_keep_failing_reconciles
-            .set(keep_failing_reconciles as i64);
+            .storage_controller_stuck_reconciles
+            .set(stuck_reconciles as i64);
 
-        ReconcileAllResult::new(
-            spawned_reconciles,
-            keep_failing_reconciles,
-            has_delayed_reconciles,
-        )
+        ReconcileAllResult::new(spawned_reconciles, stuck_reconciles, has_delayed_reconciles)
     }
 
     /// `optimize` in this context means identifying shards which have valid scheduled locations, but
```
```diff
@@ -131,14 +131,16 @@ pub(crate) struct TenantShard {
     #[serde(serialize_with = "read_last_error")]
     pub(crate) last_error: std::sync::Arc<std::sync::Mutex<Option<Arc<ReconcileError>>>>,
 
-    /// Number of consecutive reconciliation errors that have occurred for this shard.
+    /// Number of consecutive [`crate::service::Service::reconcile_all`] iterations that have
+    /// scheduled a reconciliation for this shard.
     ///
-    /// When this count reaches MAX_CONSECUTIVE_RECONCILIATION_ERRORS, the tenant shard
-    /// will be countered as keep-failing in `reconcile_all` calculations. This will lead to
-    /// allowing optimizations to run even with some failing shards.
+    /// If this reaches `MAX_CONSECUTIVE_RECONCILES`, the shard is considered "stuck" and will be
+    /// ignored when deciding whether optimizations can run. This includes both successful and failed
+    /// reconciliations.
     ///
-    /// The counter is reset to 0 after a successful reconciliation.
-    pub(crate) consecutive_errors_count: usize,
+    /// Incremented in [`crate::service::Service::process_result`], and reset to 0 when
+    /// [`crate::service::Service::reconcile_all`] determines no reconciliation is needed for this shard.
+    pub(crate) consecutive_reconciles_count: usize,
 
     /// If we have a pending compute notification that for some reason we weren't able to send,
     /// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes
```
```diff
@@ -603,7 +605,7 @@ impl TenantShard {
             waiter: Arc::new(SeqWait::new(Sequence(0))),
             error_waiter: Arc::new(SeqWait::new(Sequence(0))),
             last_error: Arc::default(),
-            consecutive_errors_count: 0,
+            consecutive_reconciles_count: 0,
             pending_compute_notification: false,
             scheduling_policy: ShardSchedulingPolicy::default(),
             preferred_node: None,
```
```diff
@@ -1908,7 +1910,7 @@ impl TenantShard {
             waiter: Arc::new(SeqWait::new(Sequence::initial())),
             error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
             last_error: Arc::default(),
-            consecutive_errors_count: 0,
+            consecutive_reconciles_count: 0,
             pending_compute_notification: false,
             delayed_reconcile: false,
             scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
```
```diff
@@ -996,7 +996,7 @@ def test_storage_controller_compute_hook_retry(
 
 
 @run_only_on_default_postgres("postgres behavior is not relevant")
-def test_storage_controller_compute_hook_keep_failing(
+def test_storage_controller_compute_hook_stuck_reconciles(
     httpserver: HTTPServer,
     neon_env_builder: NeonEnvBuilder,
     httpserver_listen_address: ListenAddress,
```
```diff
@@ -1046,7 +1046,7 @@ def test_storage_controller_compute_hook_keep_failing(
     env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
     env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
     env.storage_controller.allowed_errors.append(".*Keeping extra secondaries.*")
-    env.storage_controller.allowed_errors.append(".*Shard reconciliation is keep-failing.*")
+    env.storage_controller.allowed_errors.append(".*Shard reconciliation is stuck.*")
     env.storage_controller.node_configure(banned_tenant_ps.id, {"availability": "Offline"})
 
     # Migrate all allowed tenant shards to the first alive pageserver
```
```diff
@@ -1061,7 +1061,7 @@ def test_storage_controller_compute_hook_keep_failing(
 
     # Make some reconcile_all calls to trigger optimizations
     # RECONCILE_COUNT must be greater than storcon's MAX_CONSECUTIVE_RECONCILIATION_ERRORS
-    RECONCILE_COUNT = 12
+    RECONCILE_COUNT = 20
     for i in range(RECONCILE_COUNT):
         try:
             n = env.storage_controller.reconcile_all()
```
```diff
@@ -1074,6 +1074,8 @@ def test_storage_controller_compute_hook_keep_failing(
         assert banned_descr["shards"][0]["is_pending_compute_notification"] is True
         time.sleep(2)
 
+    env.storage_controller.assert_log_contains(".*Shard reconciliation is stuck.*")
+
     # Check that the allowed tenant shards are optimized due to affinity rules
     locations = alive_pageservers[0].http_client().tenant_list_locations()["tenant_shards"]
     not_optimized_shard_count = 0
```