diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 0c923e742e..9c34b34044 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -76,8 +76,8 @@ pub(crate) struct StorageControllerMetricGroup { /// How many shards would like to reconcile but were blocked by concurrency limits pub(crate) storage_controller_pending_reconciles: measured::Gauge, - /// How many shards are keep-failing and will be ignored when considering to run optimizations - pub(crate) storage_controller_keep_failing_reconciles: measured::Gauge, + /// How many shards are stuck and will be ignored when considering to run optimizations + pub(crate) storage_controller_stuck_reconciles: measured::Gauge, /// HTTP request status counters for handled requests pub(crate) storage_controller_http_request_status: diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index b315b88fcc..ec3b419437 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -232,9 +232,9 @@ pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256; pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32; -// Number of consecutive reconciliation errors, occured for one shard, +// Number of consecutive reconciliations that have occurred for one shard, // after which the shard is ignored when considering to run optimizations. -const MAX_CONSECUTIVE_RECONCILIATION_ERRORS: usize = 5; +const MAX_CONSECUTIVE_RECONCILES: usize = 10; // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly @@ -735,31 +735,31 @@ struct TenantMutationLocations(BTreeMap); struct ReconcileAllResult { spawned_reconciles: usize, - keep_failing_reconciles: usize, + stuck_reconciles: usize, has_delayed_reconciles: bool, } impl ReconcileAllResult { fn new( spawned_reconciles: usize, - keep_failing_reconciles: usize, + stuck_reconciles: usize, has_delayed_reconciles: bool, ) -> Self { assert!( - spawned_reconciles >= keep_failing_reconciles, - "It is impossible to have more keep-failing reconciles than spawned reconciles" + spawned_reconciles >= stuck_reconciles, + "It is impossible to have less spawned reconciles than stuck reconciles" ); Self { spawned_reconciles, - keep_failing_reconciles, + stuck_reconciles, has_delayed_reconciles, } } /// We can run optimizations only if we don't have any delayed reconciles and - /// all spawned reconciles are also keep-failing reconciles. + /// all spawned reconciles are also stuck reconciles. fn can_run_optimizations(&self) -> bool { - !self.has_delayed_reconciles && self.spawned_reconciles == self.keep_failing_reconciles + !self.has_delayed_reconciles && self.spawned_reconciles == self.stuck_reconciles } } @@ -1503,7 +1503,6 @@ impl Service { match result.result { Ok(()) => { - tenant.consecutive_errors_count = 0; tenant.apply_observed_deltas(deltas); tenant.waiter.advance(result.sequence); } @@ -1522,8 +1521,6 @@ impl Service { } } - tenant.consecutive_errors_count = tenant.consecutive_errors_count.saturating_add(1); - // Ordering: populate last_error before advancing error_seq, // so that waiters will see the correct error after waiting. tenant.set_last_error(result.sequence, e); @@ -1535,6 +1532,8 @@ impl Service { } } + tenant.consecutive_reconciles_count = tenant.consecutive_reconciles_count.saturating_add(1); + // If we just finished detaching all shards for a tenant, it might be time to drop it from memory. if tenant.policy == PlacementPolicy::Detached { // We may only drop a tenant from memory while holding the exclusive lock on the tenant ID: this protects us @@ -8640,7 +8639,7 @@ impl Service { // This function is an efficient place to update lazy statistics, since we are walking // all tenants. let mut pending_reconciles = 0; - let mut keep_failing_reconciles = 0; + let mut stuck_reconciles = 0; let mut az_violations = 0; // If we find any tenants to drop from memory, stash them to offload after @@ -8676,30 +8675,32 @@ impl Service { // Eventual consistency: if an earlier reconcile job failed, and the shard is still // dirty, spawn another one - let consecutive_errors_count = shard.consecutive_errors_count; if self .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal) .is_some() { spawned_reconciles += 1; - // Count shards that are keep-failing. We still want to reconcile them - // to avoid a situation where a shard is stuck. - // But we don't want to consider them when deciding to run optimizations. - if consecutive_errors_count >= MAX_CONSECUTIVE_RECONCILIATION_ERRORS { + if shard.consecutive_reconciles_count >= MAX_CONSECUTIVE_RECONCILES { + // Count shards that are stuck, butwe still want to reconcile them. + // We don't want to consider them when deciding to run optimizations. tracing::warn!( tenant_id=%shard.tenant_shard_id.tenant_id, shard_id=%shard.tenant_shard_id.shard_slug(), - "Shard reconciliation is keep-failing: {} errors", - consecutive_errors_count + "Shard reconciliation is stuck: {} consecutive launches", + shard.consecutive_reconciles_count ); - keep_failing_reconciles += 1; + stuck_reconciles += 1; + } + } else { + if shard.delayed_reconcile { + // Shard wanted to reconcile but for some reason couldn't. + pending_reconciles += 1; } - } else if shard.delayed_reconcile { - // Shard wanted to reconcile but for some reason couldn't. - pending_reconciles += 1; - } + // Reset the counter when we don't need to launch a reconcile. + shard.consecutive_reconciles_count = 0; + } // If this tenant is detached, try dropping it from memory. This is usually done // proactively in [`Self::process_results`], but we do it here to handle the edge // case where a reconcile completes while someone else is holding an op lock for the tenant. @@ -8735,14 +8736,10 @@ impl Service { metrics::METRICS_REGISTRY .metrics_group - .storage_controller_keep_failing_reconciles - .set(keep_failing_reconciles as i64); + .storage_controller_stuck_reconciles + .set(stuck_reconciles as i64); - ReconcileAllResult::new( - spawned_reconciles, - keep_failing_reconciles, - has_delayed_reconciles, - ) + ReconcileAllResult::new(spawned_reconciles, stuck_reconciles, has_delayed_reconciles) } /// `optimize` in this context means identifying shards which have valid scheduled locations, but diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 99079c57b0..05de155963 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -131,14 +131,16 @@ pub(crate) struct TenantShard { #[serde(serialize_with = "read_last_error")] pub(crate) last_error: std::sync::Arc>>>, - /// Number of consecutive reconciliation errors that have occurred for this shard. + /// Amount of consecutive [`crate::service::Service::reconcile_all`] iterations that have been + /// scheduled a reconciliation for this shard. /// - /// When this count reaches MAX_CONSECUTIVE_RECONCILIATION_ERRORS, the tenant shard - /// will be countered as keep-failing in `reconcile_all` calculations. This will lead to - /// allowing optimizations to run even with some failing shards. + /// If this reaches `MAX_CONSECUTIVE_RECONCILES`, the shard is considered "stuck" and will be + /// ignored when deciding whether optimizations can run. This includes both successful and failed + /// reconciliations. /// - /// The counter is reset to 0 after a successful reconciliation. - pub(crate) consecutive_errors_count: usize, + /// Incremented in [`crate::service::Service::process_result`], and reset to 0 when + /// [`crate::service::Service::reconcile_all`] determines no reconciliation is needed for this shard. + pub(crate) consecutive_reconciles_count: usize, /// If we have a pending compute notification that for some reason we weren't able to send, /// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes @@ -603,7 +605,7 @@ impl TenantShard { waiter: Arc::new(SeqWait::new(Sequence(0))), error_waiter: Arc::new(SeqWait::new(Sequence(0))), last_error: Arc::default(), - consecutive_errors_count: 0, + consecutive_reconciles_count: 0, pending_compute_notification: false, scheduling_policy: ShardSchedulingPolicy::default(), preferred_node: None, @@ -1908,7 +1910,7 @@ impl TenantShard { waiter: Arc::new(SeqWait::new(Sequence::initial())), error_waiter: Arc::new(SeqWait::new(Sequence::initial())), last_error: Arc::default(), - consecutive_errors_count: 0, + consecutive_reconciles_count: 0, pending_compute_notification: false, delayed_reconcile: false, scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index d1e9bbd7dc..fbdb14b6bb 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -996,7 +996,7 @@ def test_storage_controller_compute_hook_retry( @run_only_on_default_postgres("postgres behavior is not relevant") -def test_storage_controller_compute_hook_keep_failing( +def test_storage_controller_compute_hook_stuck_reconciles( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, httpserver_listen_address: ListenAddress, @@ -1046,7 +1046,7 @@ def test_storage_controller_compute_hook_keep_failing( env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) env.storage_controller.allowed_errors.append(".*Keeping extra secondaries.*") - env.storage_controller.allowed_errors.append(".*Shard reconciliation is keep-failing.*") + env.storage_controller.allowed_errors.append(".*Shard reconciliation is stuck.*") env.storage_controller.node_configure(banned_tenant_ps.id, {"availability": "Offline"}) # Migrate all allowed tenant shards to the first alive pageserver @@ -1061,7 +1061,7 @@ def test_storage_controller_compute_hook_keep_failing( # Make some reconcile_all calls to trigger optimizations # RECONCILE_COUNT must be greater than storcon's MAX_CONSECUTIVE_RECONCILIATION_ERRORS - RECONCILE_COUNT = 12 + RECONCILE_COUNT = 20 for i in range(RECONCILE_COUNT): try: n = env.storage_controller.reconcile_all() @@ -1074,6 +1074,8 @@ def test_storage_controller_compute_hook_keep_failing( assert banned_descr["shards"][0]["is_pending_compute_notification"] is True time.sleep(2) + env.storage_controller.assert_log_contains(".*Shard reconciliation is stuck.*") + # Check that the allowed tenant shards are optimized due to affinity rules locations = alive_pageservers[0].http_client().tenant_list_locations()["tenant_shards"] not_optimized_shard_count = 0