storcon: Ignore stuck reconciles when considering optimizations (#12589)

## Problem

The `keep_failing_reconciles` counter was introduced in #12391, but
there is a special case:

> if a reconciliation loop claims to have succeeded, but maybe_reconcile
still thinks the tenant is in need of reconciliation, then that's a
probable bug and we should activate a similar backoff to prevent
flapping.

This PR redefines "flapping" to include not just repeated failures, but
also consecutive reconciliations of any kind (success or failure).

## Summary of Changes

- Replace `keep_failing_reconciles` with a new `stuck_reconciles` metric
- Replace `MAX_CONSECUTIVE_RECONCILIATION_ERRORS` with
`MAX_CONSECUTIVE_RECONCILES`, and increasing that from 5 to 10
- Increment the consecutive reconciles counter for all reconciles, not
just failures
- Reset the counter in `reconcile_all` when no reconcile is needed for a
shard
- Improve and fix the related test

---------

Co-authored-by: Aleksandr Sarantsev <aleksandr.sarantsev@databricks.com>
This commit is contained in:
Aleksandr Sarantsev
2025-07-17 18:52:57 +04:00
committed by GitHub
parent 8862e7c4bf
commit f0c0733a64
4 changed files with 46 additions and 45 deletions

View File

@@ -76,8 +76,8 @@ pub(crate) struct StorageControllerMetricGroup {
/// How many shards would like to reconcile but were blocked by concurrency limits
pub(crate) storage_controller_pending_reconciles: measured::Gauge,
/// How many shards are keep-failing and will be ignored when considering to run optimizations
pub(crate) storage_controller_keep_failing_reconciles: measured::Gauge,
/// How many shards are stuck and will be ignored when considering to run optimizations
pub(crate) storage_controller_stuck_reconciles: measured::Gauge,
/// HTTP request status counters for handled requests
pub(crate) storage_controller_http_request_status:

View File

@@ -232,9 +232,9 @@ pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256;
pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32;
// Number of consecutive reconciliation errors, occured for one shard,
// Number of consecutive reconciliations that have occurred for one shard,
// after which the shard is ignored when considering to run optimizations.
const MAX_CONSECUTIVE_RECONCILIATION_ERRORS: usize = 5;
const MAX_CONSECUTIVE_RECONCILES: usize = 10;
// Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately.
// This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly
@@ -735,31 +735,31 @@ struct TenantMutationLocations(BTreeMap<TenantShardId, ShardMutationLocations>);
struct ReconcileAllResult {
spawned_reconciles: usize,
keep_failing_reconciles: usize,
stuck_reconciles: usize,
has_delayed_reconciles: bool,
}
impl ReconcileAllResult {
fn new(
spawned_reconciles: usize,
keep_failing_reconciles: usize,
stuck_reconciles: usize,
has_delayed_reconciles: bool,
) -> Self {
assert!(
spawned_reconciles >= keep_failing_reconciles,
"It is impossible to have more keep-failing reconciles than spawned reconciles"
spawned_reconciles >= stuck_reconciles,
"It is impossible to have less spawned reconciles than stuck reconciles"
);
Self {
spawned_reconciles,
keep_failing_reconciles,
stuck_reconciles,
has_delayed_reconciles,
}
}
/// We can run optimizations only if we don't have any delayed reconciles and
/// all spawned reconciles are also keep-failing reconciles.
/// all spawned reconciles are also stuck reconciles.
fn can_run_optimizations(&self) -> bool {
!self.has_delayed_reconciles && self.spawned_reconciles == self.keep_failing_reconciles
!self.has_delayed_reconciles && self.spawned_reconciles == self.stuck_reconciles
}
}
@@ -1503,7 +1503,6 @@ impl Service {
match result.result {
Ok(()) => {
tenant.consecutive_errors_count = 0;
tenant.apply_observed_deltas(deltas);
tenant.waiter.advance(result.sequence);
}
@@ -1522,8 +1521,6 @@ impl Service {
}
}
tenant.consecutive_errors_count = tenant.consecutive_errors_count.saturating_add(1);
// Ordering: populate last_error before advancing error_seq,
// so that waiters will see the correct error after waiting.
tenant.set_last_error(result.sequence, e);
@@ -1535,6 +1532,8 @@ impl Service {
}
}
tenant.consecutive_reconciles_count = tenant.consecutive_reconciles_count.saturating_add(1);
// If we just finished detaching all shards for a tenant, it might be time to drop it from memory.
if tenant.policy == PlacementPolicy::Detached {
// We may only drop a tenant from memory while holding the exclusive lock on the tenant ID: this protects us
@@ -8640,7 +8639,7 @@ impl Service {
// This function is an efficient place to update lazy statistics, since we are walking
// all tenants.
let mut pending_reconciles = 0;
let mut keep_failing_reconciles = 0;
let mut stuck_reconciles = 0;
let mut az_violations = 0;
// If we find any tenants to drop from memory, stash them to offload after
@@ -8676,30 +8675,32 @@ impl Service {
// Eventual consistency: if an earlier reconcile job failed, and the shard is still
// dirty, spawn another one
let consecutive_errors_count = shard.consecutive_errors_count;
if self
.maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal)
.is_some()
{
spawned_reconciles += 1;
// Count shards that are keep-failing. We still want to reconcile them
// to avoid a situation where a shard is stuck.
// But we don't want to consider them when deciding to run optimizations.
if consecutive_errors_count >= MAX_CONSECUTIVE_RECONCILIATION_ERRORS {
if shard.consecutive_reconciles_count >= MAX_CONSECUTIVE_RECONCILES {
// Count shards that are stuck, butwe still want to reconcile them.
// We don't want to consider them when deciding to run optimizations.
tracing::warn!(
tenant_id=%shard.tenant_shard_id.tenant_id,
shard_id=%shard.tenant_shard_id.shard_slug(),
"Shard reconciliation is keep-failing: {} errors",
consecutive_errors_count
"Shard reconciliation is stuck: {} consecutive launches",
shard.consecutive_reconciles_count
);
keep_failing_reconciles += 1;
stuck_reconciles += 1;
}
} else {
if shard.delayed_reconcile {
// Shard wanted to reconcile but for some reason couldn't.
pending_reconciles += 1;
}
} else if shard.delayed_reconcile {
// Shard wanted to reconcile but for some reason couldn't.
pending_reconciles += 1;
}
// Reset the counter when we don't need to launch a reconcile.
shard.consecutive_reconciles_count = 0;
}
// If this tenant is detached, try dropping it from memory. This is usually done
// proactively in [`Self::process_results`], but we do it here to handle the edge
// case where a reconcile completes while someone else is holding an op lock for the tenant.
@@ -8735,14 +8736,10 @@ impl Service {
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_keep_failing_reconciles
.set(keep_failing_reconciles as i64);
.storage_controller_stuck_reconciles
.set(stuck_reconciles as i64);
ReconcileAllResult::new(
spawned_reconciles,
keep_failing_reconciles,
has_delayed_reconciles,
)
ReconcileAllResult::new(spawned_reconciles, stuck_reconciles, has_delayed_reconciles)
}
/// `optimize` in this context means identifying shards which have valid scheduled locations, but

View File

@@ -131,14 +131,16 @@ pub(crate) struct TenantShard {
#[serde(serialize_with = "read_last_error")]
pub(crate) last_error: std::sync::Arc<std::sync::Mutex<Option<Arc<ReconcileError>>>>,
/// Number of consecutive reconciliation errors that have occurred for this shard.
/// Amount of consecutive [`crate::service::Service::reconcile_all`] iterations that have been
/// scheduled a reconciliation for this shard.
///
/// When this count reaches MAX_CONSECUTIVE_RECONCILIATION_ERRORS, the tenant shard
/// will be countered as keep-failing in `reconcile_all` calculations. This will lead to
/// allowing optimizations to run even with some failing shards.
/// If this reaches `MAX_CONSECUTIVE_RECONCILES`, the shard is considered "stuck" and will be
/// ignored when deciding whether optimizations can run. This includes both successful and failed
/// reconciliations.
///
/// The counter is reset to 0 after a successful reconciliation.
pub(crate) consecutive_errors_count: usize,
/// Incremented in [`crate::service::Service::process_result`], and reset to 0 when
/// [`crate::service::Service::reconcile_all`] determines no reconciliation is needed for this shard.
pub(crate) consecutive_reconciles_count: usize,
/// If we have a pending compute notification that for some reason we weren't able to send,
/// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes
@@ -603,7 +605,7 @@ impl TenantShard {
waiter: Arc::new(SeqWait::new(Sequence(0))),
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
last_error: Arc::default(),
consecutive_errors_count: 0,
consecutive_reconciles_count: 0,
pending_compute_notification: false,
scheduling_policy: ShardSchedulingPolicy::default(),
preferred_node: None,
@@ -1908,7 +1910,7 @@ impl TenantShard {
waiter: Arc::new(SeqWait::new(Sequence::initial())),
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
last_error: Arc::default(),
consecutive_errors_count: 0,
consecutive_reconciles_count: 0,
pending_compute_notification: false,
delayed_reconcile: false,
scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),