Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-04 12:02:55 +00:00
storcon: Ignore stuck reconciles when considering optimizations (#12589)
## Problem

The `keep_failing_reconciles` counter was introduced in #12391, but there is a special case:

> if a reconciliation loop claims to have succeeded, but maybe_reconcile still thinks the tenant is in need of reconciliation, then that's a probable bug and we should activate a similar backoff to prevent flapping.

This PR redefines "flapping" to include not just repeated failures, but consecutive reconciliations of any kind, success or failure.

## Summary of Changes

- Replace `keep_failing_reconciles` with a new `stuck_reconciles` metric
- Replace `MAX_CONSECUTIVE_RECONCILIATION_ERRORS` with `MAX_CONSECUTIVE_RECONCILES`, raising the threshold from 5 to 10
- Increment the consecutive-reconciles counter for all reconciles, not just failures
- Reset the counter in `reconcile_all` when no reconcile is needed for a shard
- Improve and fix the related test

---------

Co-authored-by: Aleksandr Sarantsev <aleksandr.sarantsev@databricks.com>
commit f0c0733a64 (parent 8862e7c4bf), committed by GitHub
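As context for the diff below, here is a minimal, self-contained sketch of the policy this commit adopts. Only `MAX_CONSECUTIVE_RECONCILES` and the counter name come from the PR; `ShardState` and `observe_pass` are hypothetical stand-ins, not the controller's actual types.

```rust
// Illustrative sketch of the stuck-shard policy introduced by this PR.
// `MAX_CONSECUTIVE_RECONCILES` and `consecutive_reconciles_count` mirror the
// real names; `ShardState`/`observe_pass` are invented for this example.
const MAX_CONSECUTIVE_RECONCILES: usize = 10;

#[derive(Default)]
struct ShardState {
    // Consecutive reconcile_all passes that launched a reconcile for this shard.
    consecutive_reconciles_count: usize,
}

impl ShardState {
    // Called once per reconcile_all pass; returns true if the shard should be
    // treated as stuck (still reconciled, but ignored for optimization gating).
    fn observe_pass(&mut self, needs_reconcile: bool) -> bool {
        if needs_reconcile {
            // Count every launched reconcile, success or failure alike:
            // "flapping" now means repeated reconciliation of any kind.
            self.consecutive_reconciles_count =
                self.consecutive_reconciles_count.saturating_add(1);
            self.consecutive_reconciles_count >= MAX_CONSECUTIVE_RECONCILES
        } else {
            // A pass that finds the shard clean resets the streak.
            self.consecutive_reconciles_count = 0;
            false
        }
    }
}

fn main() {
    let mut shard = ShardState::default();
    // Nine consecutive dirty passes: not yet stuck.
    for _ in 0..9 {
        assert!(!shard.observe_pass(true));
    }
    // The tenth consecutive launch crosses the threshold.
    assert!(shard.observe_pass(true));
    // A single clean pass clears the streak.
    assert!(!shard.observe_pass(false));
    assert_eq!(shard.consecutive_reconciles_count, 0);
}
```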
```diff
@@ -76,8 +76,8 @@ pub(crate) struct StorageControllerMetricGroup {
     /// How many shards would like to reconcile but were blocked by concurrency limits
     pub(crate) storage_controller_pending_reconciles: measured::Gauge,
 
-    /// How many shards are keep-failing and will be ignored when considering to run optimizations
-    pub(crate) storage_controller_keep_failing_reconciles: measured::Gauge,
+    /// How many shards are stuck and will be ignored when considering to run optimizations
+    pub(crate) storage_controller_stuck_reconciles: measured::Gauge,
 
     /// HTTP request status counters for handled requests
     pub(crate) storage_controller_http_request_status:
```
```diff
@@ -232,9 +232,9 @@ pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
 pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256;
 pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32;
 
-// Number of consecutive reconciliation errors, occured for one shard,
+// Number of consecutive reconciliations that have occurred for one shard,
 // after which the shard is ignored when considering to run optimizations.
-const MAX_CONSECUTIVE_RECONCILIATION_ERRORS: usize = 5;
+const MAX_CONSECUTIVE_RECONCILES: usize = 10;
 
 // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately.
 // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly
```
```diff
@@ -735,31 +735,31 @@ struct TenantMutationLocations(BTreeMap<TenantShardId, ShardMutationLocations>);
 
 struct ReconcileAllResult {
     spawned_reconciles: usize,
-    keep_failing_reconciles: usize,
+    stuck_reconciles: usize,
    has_delayed_reconciles: bool,
 }
 
 impl ReconcileAllResult {
     fn new(
         spawned_reconciles: usize,
-        keep_failing_reconciles: usize,
+        stuck_reconciles: usize,
         has_delayed_reconciles: bool,
     ) -> Self {
         assert!(
-            spawned_reconciles >= keep_failing_reconciles,
-            "It is impossible to have more keep-failing reconciles than spawned reconciles"
+            spawned_reconciles >= stuck_reconciles,
+            "It is impossible to have less spawned reconciles than stuck reconciles"
         );
         Self {
             spawned_reconciles,
-            keep_failing_reconciles,
+            stuck_reconciles,
             has_delayed_reconciles,
         }
     }
 
     /// We can run optimizations only if we don't have any delayed reconciles and
-    /// all spawned reconciles are also keep-failing reconciles.
+    /// all spawned reconciles are also stuck reconciles.
     fn can_run_optimizations(&self) -> bool {
-        !self.has_delayed_reconciles && self.spawned_reconciles == self.keep_failing_reconciles
+        !self.has_delayed_reconciles && self.spawned_reconciles == self.stuck_reconciles
     }
 }
```
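The gating predicate above is small enough to check exhaustively. A standalone sketch, copying only `can_run_optimizations` and the three fields it reads (the `case` helper is invented for this example, and the constructor assert is omitted):

```rust
// Standalone copy of the gating logic from ReconcileAllResult, for illustration.
struct ReconcileAllResult {
    spawned_reconciles: usize,
    stuck_reconciles: usize,
    has_delayed_reconciles: bool,
}

impl ReconcileAllResult {
    fn can_run_optimizations(&self) -> bool {
        // Optimizations run only when nothing is queued behind the concurrency
        // limit and every spawned reconcile belongs to an already-stuck shard.
        !self.has_delayed_reconciles && self.spawned_reconciles == self.stuck_reconciles
    }
}

fn main() {
    let case = |spawned, stuck, delayed| ReconcileAllResult {
        spawned_reconciles: spawned,
        stuck_reconciles: stuck,
        has_delayed_reconciles: delayed,
    };
    // Quiescent system: nothing spawned, so optimizations may run.
    assert!(case(0, 0, false).can_run_optimizations());
    // Healthy churn: some reconciles are not stuck, so hold off.
    assert!(!case(3, 1, false).can_run_optimizations());
    // Only stuck shards are churning: ignore them and optimize anyway.
    assert!(case(2, 2, false).can_run_optimizations());
    // Delayed reconciles always block optimizations.
    assert!(!case(0, 0, true).can_run_optimizations());
}
```

Before this PR the same shape held with `keep_failing_reconciles`; the rename widens what counts toward the right-hand side of the equality.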
```diff
@@ -1503,7 +1503,6 @@ impl Service {
 
         match result.result {
             Ok(()) => {
-                tenant.consecutive_errors_count = 0;
                 tenant.apply_observed_deltas(deltas);
                 tenant.waiter.advance(result.sequence);
             }
@@ -1522,8 +1521,6 @@
                     }
                 }
 
-                tenant.consecutive_errors_count = tenant.consecutive_errors_count.saturating_add(1);
-
                 // Ordering: populate last_error before advancing error_seq,
                 // so that waiters will see the correct error after waiting.
                 tenant.set_last_error(result.sequence, e);
@@ -1535,6 +1532,8 @@
             }
         }
 
+        tenant.consecutive_reconciles_count = tenant.consecutive_reconciles_count.saturating_add(1);
+
         // If we just finished detaching all shards for a tenant, it might be time to drop it from memory.
         if tenant.policy == PlacementPolicy::Detached {
             // We may only drop a tenant from memory while holding the exclusive lock on the tenant ID: this protects us
```
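Note the structural effect of the three hunks above: the per-outcome `consecutive_errors_count` bookkeeping disappears from both match arms, and a single increment lands after the `match`, so every completed reconcile counts. A simplified sketch of that shape, with hypothetical reduced types (the real handler also advances sequence waiters and applies observed deltas):

```rust
// Reduced sketch of the process_result shape after this change. The types are
// stand-ins: only consecutive_reconciles_count and its placement are the point.
#[derive(Debug)]
struct ReconcileError(String);

#[derive(Default)]
struct TenantState {
    consecutive_reconciles_count: usize,
    last_error: Option<ReconcileError>,
}

impl TenantState {
    fn process_result(&mut self, result: Result<(), ReconcileError>) {
        match result {
            Ok(()) => {
                // Success no longer resets a counter here; only a reconcile_all
                // pass that finds the shard clean does that.
            }
            Err(e) => {
                self.last_error = Some(e);
            }
        }
        // Runs for both outcomes: success and failure both extend the streak.
        self.consecutive_reconciles_count =
            self.consecutive_reconciles_count.saturating_add(1);
    }
}

fn main() {
    let mut tenant = TenantState::default();
    tenant.process_result(Ok(()));
    tenant.process_result(Err(ReconcileError("node offline".into())));
    // Both completions incremented the streak counter.
    assert_eq!(tenant.consecutive_reconciles_count, 2);
    assert!(tenant.last_error.is_some());
}
```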
```diff
@@ -8640,7 +8639,7 @@
         // This function is an efficient place to update lazy statistics, since we are walking
         // all tenants.
         let mut pending_reconciles = 0;
-        let mut keep_failing_reconciles = 0;
+        let mut stuck_reconciles = 0;
         let mut az_violations = 0;
 
         // If we find any tenants to drop from memory, stash them to offload after
```
```diff
@@ -8676,30 +8675,32 @@
 
             // Eventual consistency: if an earlier reconcile job failed, and the shard is still
             // dirty, spawn another one
-            let consecutive_errors_count = shard.consecutive_errors_count;
             if self
                 .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal)
                 .is_some()
             {
                 spawned_reconciles += 1;
 
-                // Count shards that are keep-failing. We still want to reconcile them
-                // to avoid a situation where a shard is stuck.
-                // But we don't want to consider them when deciding to run optimizations.
-                if consecutive_errors_count >= MAX_CONSECUTIVE_RECONCILIATION_ERRORS {
+                if shard.consecutive_reconciles_count >= MAX_CONSECUTIVE_RECONCILES {
+                    // Count shards that are stuck, but we still want to reconcile them.
+                    // We don't want to consider them when deciding to run optimizations.
                     tracing::warn!(
                         tenant_id=%shard.tenant_shard_id.tenant_id,
                         shard_id=%shard.tenant_shard_id.shard_slug(),
-                        "Shard reconciliation is keep-failing: {} errors",
-                        consecutive_errors_count
+                        "Shard reconciliation is stuck: {} consecutive launches",
+                        shard.consecutive_reconciles_count
                     );
-                    keep_failing_reconciles += 1;
+                    stuck_reconciles += 1;
                 }
-            } else if shard.delayed_reconcile {
-                // Shard wanted to reconcile but for some reason couldn't.
-                pending_reconciles += 1;
+            } else {
+                if shard.delayed_reconcile {
+                    // Shard wanted to reconcile but for some reason couldn't.
+                    pending_reconciles += 1;
+                }
+
+                // Reset the counter when we don't need to launch a reconcile.
+                shard.consecutive_reconciles_count = 0;
             }
 
             // If this tenant is detached, try dropping it from memory. This is usually done
             // proactively in [`Self::process_results`], but we do it here to handle the edge
             // case where a reconcile completes while someone else is holding an op lock for the tenant.
```
```diff
@@ -8735,14 +8736,10 @@
 
         metrics::METRICS_REGISTRY
             .metrics_group
-            .storage_controller_keep_failing_reconciles
-            .set(keep_failing_reconciles as i64);
+            .storage_controller_stuck_reconciles
+            .set(stuck_reconciles as i64);
 
-        ReconcileAllResult::new(
-            spawned_reconciles,
-            keep_failing_reconciles,
-            has_delayed_reconciles,
-        )
+        ReconcileAllResult::new(spawned_reconciles, stuck_reconciles, has_delayed_reconciles)
     }
 
     /// `optimize` in this context means identifying shards which have valid scheduled locations, but
```
```diff
@@ -131,14 +131,16 @@ pub(crate) struct TenantShard {
     #[serde(serialize_with = "read_last_error")]
     pub(crate) last_error: std::sync::Arc<std::sync::Mutex<Option<Arc<ReconcileError>>>>,
 
-    /// Number of consecutive reconciliation errors that have occurred for this shard.
+    /// Number of consecutive [`crate::service::Service::reconcile_all`] iterations that have
+    /// scheduled a reconciliation for this shard.
     ///
-    /// When this count reaches MAX_CONSECUTIVE_RECONCILIATION_ERRORS, the tenant shard
-    /// will be countered as keep-failing in `reconcile_all` calculations. This will lead to
-    /// allowing optimizations to run even with some failing shards.
+    /// If this reaches `MAX_CONSECUTIVE_RECONCILES`, the shard is considered "stuck" and will be
+    /// ignored when deciding whether optimizations can run. This includes both successful and failed
+    /// reconciliations.
     ///
-    /// The counter is reset to 0 after a successful reconciliation.
-    pub(crate) consecutive_errors_count: usize,
+    /// Incremented in [`crate::service::Service::process_result`], and reset to 0 when
+    /// [`crate::service::Service::reconcile_all`] determines no reconciliation is needed for this shard.
+    pub(crate) consecutive_reconciles_count: usize,
 
     /// If we have a pending compute notification that for some reason we weren't able to send,
     /// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes
```
```diff
@@ -603,7 +605,7 @@ impl TenantShard {
             waiter: Arc::new(SeqWait::new(Sequence(0))),
             error_waiter: Arc::new(SeqWait::new(Sequence(0))),
             last_error: Arc::default(),
-            consecutive_errors_count: 0,
+            consecutive_reconciles_count: 0,
             pending_compute_notification: false,
             scheduling_policy: ShardSchedulingPolicy::default(),
             preferred_node: None,
```
```diff
@@ -1908,7 +1910,7 @@ impl TenantShard {
             waiter: Arc::new(SeqWait::new(Sequence::initial())),
             error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
             last_error: Arc::default(),
-            consecutive_errors_count: 0,
+            consecutive_reconciles_count: 0,
             pending_compute_notification: false,
             delayed_reconcile: false,
             scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
```
```diff
@@ -996,7 +996,7 @@ def test_storage_controller_compute_hook_retry(
 
 
 @run_only_on_default_postgres("postgres behavior is not relevant")
-def test_storage_controller_compute_hook_keep_failing(
+def test_storage_controller_compute_hook_stuck_reconciles(
     httpserver: HTTPServer,
     neon_env_builder: NeonEnvBuilder,
     httpserver_listen_address: ListenAddress,
```
```diff
@@ -1046,7 +1046,7 @@ def test_storage_controller_compute_hook_keep_failing(
     env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
     env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
     env.storage_controller.allowed_errors.append(".*Keeping extra secondaries.*")
-    env.storage_controller.allowed_errors.append(".*Shard reconciliation is keep-failing.*")
+    env.storage_controller.allowed_errors.append(".*Shard reconciliation is stuck.*")
     env.storage_controller.node_configure(banned_tenant_ps.id, {"availability": "Offline"})
 
     # Migrate all allowed tenant shards to the first alive pageserver
```
```diff
@@ -1061,7 +1061,7 @@ def test_storage_controller_compute_hook_keep_failing(
 
     # Make some reconcile_all calls to trigger optimizations
     # RECONCILE_COUNT must be greater than storcon's MAX_CONSECUTIVE_RECONCILIATION_ERRORS
-    RECONCILE_COUNT = 12
+    RECONCILE_COUNT = 20
     for i in range(RECONCILE_COUNT):
         try:
             n = env.storage_controller.reconcile_all()
```
```diff
@@ -1074,6 +1074,8 @@ def test_storage_controller_compute_hook_keep_failing(
         assert banned_descr["shards"][0]["is_pending_compute_notification"] is True
         time.sleep(2)
 
+    env.storage_controller.assert_log_contains(".*Shard reconciliation is stuck.*")
+
     # Check that the allowed tenant shards are optimized due to affinity rules
     locations = alive_pageservers[0].http_client().tenant_list_locations()["tenant_shards"]
     not_optimized_shard_count = 0
```