From 38a8dcab9f87b3a2fd868ff6f5bbfa4303cabc24 Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Wed, 2 Oct 2024 17:25:11 +0100
Subject: [PATCH] storcon: add metric for long running reconciles (#9207)

## Problem

We don't have an alert for long-running reconciles. Stuck reconciles are
problematic, as a recent incident showed.

## Summary of changes

Add a new metric, `storage_controller_reconcile_long_running_total`, with
labels `{tenant_id, shard_number, sequence}`. The metric is incremented when
a reconcile runs past the configured threshold and removed once the
long-running reconcile finishes. These events should be rare, so we won't
break the bank on cardinality.

Related: https://github.com/neondatabase/neon/issues/9150
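The add-then-remove lifecycle is what keeps the cardinality bounded: only
*currently* long-running reconciles have a live series. The controller uses
the `measured` crate (see `storage_controller/src/metrics.rs` below); purely
for illustration, here is the same lifecycle sketched with the more common
`prometheus` crate — the metric name matches this PR, but the label values
are hypothetical:

```rust
use prometheus::{IntCounterVec, Opts, Registry};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let registry = Registry::new();
    let long_running = IntCounterVec::new(
        Opts::new(
            "storage_controller_reconcile_long_running",
            "Reconciles that exceeded the long-running threshold",
        ),
        &["tenant_id", "shard_number", "sequence"],
    )?;
    registry.register(Box::new(long_running.clone()))?;

    // Hypothetical label values for one stuck reconcile.
    let labels = ["3fc0ba96d4d44b17a2ead1d2e2b4e2a5", "0", "42"];

    // Threshold crossed: publish the series so an alert can fire on it.
    long_running.with_label_values(&labels).inc();

    // Reconcile finished: drop the series again, so cardinality is bounded
    // by the number of reconciles that are long-running right now.
    long_running.remove_label_values(&labels)?;
    Ok(())
}
```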
---
 control_plane/src/local_env.rs          |   4 +
 control_plane/src/storage_controller.rs |   7 +
 storage_controller/src/main.rs          |  11 +-
 storage_controller/src/metrics.rs       |  15 +++
 storage_controller/src/service.rs       |   5 +
 storage_controller/src/tenant_shard.rs  | 127 +++++++++++++-----
 .../regress/test_storage_controller.py  |  35 +++++
 7 files changed, 168 insertions(+), 36 deletions(-)

diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index d616154af6..9dc2a0c36b 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -168,6 +168,9 @@ pub struct NeonStorageControllerConf {
     #[serde(with = "humantime_serde")]
     pub heartbeat_interval: Duration,
+
+    #[serde(with = "humantime_serde")]
+    pub long_reconcile_threshold: Option<Duration>,
 }
 
 impl NeonStorageControllerConf {
@@ -190,6 +193,7 @@ impl Default for NeonStorageControllerConf {
             split_threshold: None,
             max_secondary_lag_bytes: None,
             heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
+            long_reconcile_threshold: None,
         }
     }
 }
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index 36e5e04c86..6d07c43af0 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -517,6 +517,13 @@ impl StorageController {
             args.push(format!("--max-secondary-lag-bytes={lag}"))
         }
 
+        if let Some(threshold) = self.config.long_reconcile_threshold {
+            args.push(format!(
+                "--long-reconcile-threshold={}",
+                humantime::Duration::from(threshold)
+            ))
+        }
+
         args.push(format!(
             "--neon-local-repo-dir={}",
             self.env.base_data_dir.display()
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 00e90f4467..62cb0e9e5d 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -11,8 +11,8 @@ use storage_controller::metrics::preinitialize_metrics;
 use storage_controller::persistence::Persistence;
 use storage_controller::service::chaos_injector::ChaosInjector;
 use storage_controller::service::{
-    Config, Service, HEARTBEAT_INTERVAL_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT,
-    MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
+    Config, Service, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT,
+    MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
 };
 use tokio::signal::unix::SignalKind;
 use tokio_util::sync::CancellationToken;
@@ -108,6 +108,9 @@ struct Cli {
     // Period with which to send heartbeats to registered nodes
     #[arg(long)]
     heartbeat_interval: Option<humantime::Duration>,
+
+    #[arg(long)]
+    long_reconcile_threshold: Option<humantime::Duration>,
 }
 
 enum StrictMode {
@@ -293,6 +296,10 @@ async fn async_main() -> anyhow::Result<()> {
             .heartbeat_interval
             .map(humantime::Duration::into)
             .unwrap_or(HEARTBEAT_INTERVAL_DEFAULT),
+        long_reconcile_threshold: args
+            .long_reconcile_threshold
+            .map(humantime::Duration::into)
+            .unwrap_or(LONG_RECONCILE_THRESHOLD_DEFAULT),
         address_for_peers: args.address_for_peers,
         start_as_candidate: args.start_as_candidate,
         http_service_port: args.listen.port() as i32,
diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs
index 5cfcfb4b1f..5989aeba91 100644
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -87,6 +87,10 @@ pub(crate) struct StorageControllerMetricGroup {
         measured::HistogramVec,
 
     pub(crate) storage_controller_leadership_status: measured::GaugeVec,
+
+    /// Reconciles that exceeded the long-running threshold; removed on completion
+    pub(crate) storage_controller_reconcile_long_running:
+        measured::CounterVec<ReconcileLongRunningLabelGroupSet>,
 }
 
 impl StorageControllerMetrics {
@@ -168,6 +172,17 @@ pub(crate) struct LeadershipStatusGroup {
     pub(crate) status: LeadershipStatus,
 }
 
+#[derive(measured::LabelGroup, Clone)]
+#[label(set = ReconcileLongRunningLabelGroupSet)]
+pub(crate) struct ReconcileLongRunningLabelGroup<'a> {
+    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    pub(crate) tenant_id: &'a str,
+    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    pub(crate) shard_number: &'a str,
+    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    pub(crate) sequence: &'a str,
+}
+
 #[derive(FixedCardinalityLabel, Clone, Copy)]
 pub(crate) enum ReconcileOutcome {
     #[label(rename = "ok")]
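The label group above takes `&str` values that `measured` interns through
`lasso::ThreadedRodeo`, so tenant IDs, shard numbers and sequence numbers do
not need to be known up front the way fixed-cardinality labels do. A minimal
sketch of what the interner provides (the tenant ID here is made up):

```rust
use lasso::ThreadedRodeo;

fn main() {
    // measured's dynamic labels lean on an interner like this one: each
    // distinct string gets a small stable key, and repeat values reuse it.
    let rodeo = ThreadedRodeo::default();

    let first = rodeo.get_or_intern("3fc0ba96d4d44b17a2ead1d2e2b4e2a5");
    let again = rodeo.get_or_intern("3fc0ba96d4d44b17a2ead1d2e2b4e2a5");

    // The same tenant id maps to the same key, i.e. the same label value
    // and therefore the same time series.
    assert_eq!(first, again);
    println!("interned {} distinct strings", rodeo.len());
}
```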
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 851db97310..bd5759422c 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -128,6 +128,9 @@ pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300);
 /// How often to send heartbeats to registered nodes?
 pub const HEARTBEAT_INTERVAL_DEFAULT: Duration = Duration::from_secs(5);
 
+/// How long is too long for a reconciliation?
+pub const LONG_RECONCILE_THRESHOLD_DEFAULT: Duration = Duration::from_secs(120);
+
 #[derive(Clone, strum_macros::Display)]
 enum TenantOperations {
     Create,
@@ -348,6 +351,8 @@ pub struct Config {
     pub start_as_candidate: bool,
     pub http_service_port: i32,
+
+    pub long_reconcile_threshold: Duration,
 }
 
 impl From for ApiError {
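The threshold reaches this default through the `--long-reconcile-threshold`
CLI flag, which clap parses as a `humantime::Duration` (see `main.rs` above),
so operators can write values like `90s` or `2m` rather than raw integers. A
small self-contained sketch of the parse-or-default logic; the flag value is
illustrative:

```rust
use std::time::Duration;

/// Mirrors LONG_RECONCILE_THRESHOLD_DEFAULT above.
const DEFAULT_THRESHOLD: Duration = Duration::from_secs(120);

fn main() {
    // Simulates `--long-reconcile-threshold=90s`; `None` means flag absent.
    let flag: Option<&str> = Some("90s");

    let threshold: Duration = flag
        .map(|s| {
            s.parse::<humantime::Duration>()
                .expect("invalid duration")
                .into()
        })
        .unwrap_or(DEFAULT_THRESHOLD);

    assert_eq!(threshold, Duration::from_secs(90));
    println!("long_reconcile_threshold = {threshold:?}");
}
```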
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index afc89eae00..953c73119b 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -5,7 +5,9 @@ use std::{
 };
 
 use crate::{
-    metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
+    metrics::{
+        self, ReconcileCompleteLabelGroup, ReconcileLongRunningLabelGroup, ReconcileOutcome,
+    },
     persistence::TenantShardPersistence,
     reconciler::{ReconcileUnits, ReconcilerConfig},
     scheduler::{
@@ -14,6 +16,7 @@ use crate::{
     },
     service::ReconcileResultRequest,
 };
+use futures::future::{self, Either};
 use pageserver_api::controller_api::{
     AvailabilityZone, NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy,
 };
@@ -1083,6 +1086,47 @@ impl TenantShard {
         }
     }
 
+    async fn reconcile(
+        sequence: Sequence,
+        mut reconciler: Reconciler,
+        must_notify: bool,
+    ) -> ReconcileResult {
+        // Attempt to make observed state match intent state
+        let result = reconciler.reconcile().await;
+
+        // If we know we had a pending compute notification from some previous action, send a notification irrespective
+        // of whether the above reconcile() did any work
+        if result.is_ok() && must_notify {
+            // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
+            reconciler.compute_notify().await.ok();
+        }
+
+        // Update result counter
+        let outcome_label = match &result {
+            Ok(_) => ReconcileOutcome::Success,
+            Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel,
+            Err(_) => ReconcileOutcome::Error,
+        };
+
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_reconcile_complete
+            .inc(ReconcileCompleteLabelGroup {
+                status: outcome_label,
+            });
+
+        // Constructing result implicitly drops Reconciler, freeing any ReconcileUnits before the Service might
+        // try and schedule more work in response to our result.
+        ReconcileResult {
+            sequence,
+            result,
+            tenant_shard_id: reconciler.tenant_shard_id,
+            generation: reconciler.generation,
+            observed: reconciler.observed,
+            pending_compute_notification: reconciler.compute_notify_failure,
+        }
+    }
+
     #[allow(clippy::too_many_arguments)]
     #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
     pub(crate) fn spawn_reconciler(
@@ -1122,7 +1166,7 @@ impl TenantShard {
         let reconciler_cancel = cancel.child_token();
 
         let reconciler_intent = TargetState::from_intent(pageservers, &self.intent);
-        let mut reconciler = Reconciler {
+        let reconciler = Reconciler {
             tenant_shard_id: self.tenant_shard_id,
             shard: self.shard,
             placement_policy: self.policy.clone(),
@@ -1142,6 +1186,7 @@ impl TenantShard {
         };
 
         let reconcile_seq = self.sequence;
+        let long_reconcile_threshold = service_config.long_reconcile_threshold;
         tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence);
         let must_notify = self.pending_compute_notification;
@@ -1178,41 +1223,55 @@ impl TenantShard {
                     return;
                 }
 
-                // Attempt to make observed state match intent state
-                let result = reconciler.reconcile().await;
+                let (tenant_id_label, shard_number_label, sequence_label) = {
+                    (
+                        reconciler.tenant_shard_id.tenant_id.to_string(),
+                        reconciler.tenant_shard_id.shard_number.0.to_string(),
+                        reconcile_seq.to_string(),
+                    )
+                };
 
-                // If we know we had a pending compute notification from some previous action, send a notification irrespective
-                // of whether the above reconcile() did any work
-                if result.is_ok() && must_notify {
-                    // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
-                    reconciler.compute_notify().await.ok();
+                let label_group = ReconcileLongRunningLabelGroup {
+                    tenant_id: &tenant_id_label,
+                    shard_number: &shard_number_label,
+                    sequence: &sequence_label,
+                };
+
+                let reconcile_fut = Self::reconcile(reconcile_seq, reconciler, must_notify);
+                let long_reconcile_fut = {
+                    let label_group = label_group.clone();
+                    async move {
+                        tokio::time::sleep(long_reconcile_threshold).await;
+
+                        tracing::warn!("Reconcile passed the long running threshold of {long_reconcile_threshold:?}");
+
+                        metrics::METRICS_REGISTRY
+                            .metrics_group
+                            .storage_controller_reconcile_long_running
+                            .inc(label_group);
+                    }
+                };
+
+                let reconcile_fut = std::pin::pin!(reconcile_fut);
+                let long_reconcile_fut = std::pin::pin!(long_reconcile_fut);
+
+                let (was_long, result) =
+                    match future::select(reconcile_fut, long_reconcile_fut).await {
+                        Either::Left((reconcile_result, _)) => (false, reconcile_result),
+                        Either::Right((_, reconcile_fut)) => (true, reconcile_fut.await),
+                    };
+
+                if was_long {
+                    let id = metrics::METRICS_REGISTRY
+                        .metrics_group
+                        .storage_controller_reconcile_long_running
+                        .with_labels(label_group);
+                    metrics::METRICS_REGISTRY
+                        .metrics_group
+                        .storage_controller_reconcile_long_running
+                        .remove_metric(id);
                 }
 
-                // Update result counter
-                let outcome_label = match &result {
-                    Ok(_) => ReconcileOutcome::Success,
-                    Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel,
-                    Err(_) => ReconcileOutcome::Error,
-                };
-
-                metrics::METRICS_REGISTRY
-                    .metrics_group
-                    .storage_controller_reconcile_complete
-                    .inc(ReconcileCompleteLabelGroup {
-                        status: outcome_label,
-                    });
-
-                // Constructing result implicitly drops Reconciler, freeing any ReconcileUnits before the Service might
-                // try and schedule more work in response to our result.
-                let result = ReconcileResult {
-                    sequence: reconcile_seq,
-                    result,
-                    tenant_shard_id: reconciler.tenant_shard_id,
-                    generation: reconciler.generation,
-                    observed: reconciler.observed,
-                    pending_compute_notification: reconciler.compute_notify_failure,
-                };
-
                 result_tx
                     .send(ReconcileResultRequest::ReconcileResult(result))
                     .ok();
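The heart of the change is racing the reconcile future against a watchdog
sleep. A useful property of `futures::future::select` is that the loser of
the race is handed back un-dropped, so when the watchdog fires first the
reconcile is simply resumed rather than cancelled. A self-contained sketch of
the pattern, with a hypothetical stand-in for `Reconciler::reconcile()` and
shortened durations:

```rust
use std::time::Duration;

use futures::future::{self, Either};

// Stand-in for Reconciler::reconcile(); takes longer than the threshold.
async fn slow_reconcile() -> &'static str {
    tokio::time::sleep(Duration::from_millis(300)).await;
    "reconcile done"
}

#[tokio::main]
async fn main() {
    let threshold = Duration::from_millis(100);

    // Both futures must be pinned: future::select requires Unpin.
    let reconcile_fut = std::pin::pin!(slow_reconcile());
    let watchdog = std::pin::pin!(tokio::time::sleep(threshold));

    let (was_long, result) = match future::select(reconcile_fut, watchdog).await {
        // Reconcile won the race: nothing to report.
        Either::Left((result, _)) => (false, result),
        // Watchdog fired first: this is the point where the long-running
        // metric is incremented; select() returns the unfinished reconcile
        // future, so we keep awaiting it to get the real result.
        Either::Right((_, reconcile_fut)) => {
            println!("reconcile passed the long running threshold");
            (true, reconcile_fut.await)
        }
    };

    println!("was_long={was_long}, result={result}");
}
```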
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 23a9ef58df..11c743e8a0 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2641,6 +2641,12 @@ def test_storage_controller_proxy_during_migration(
     """
     neon_env_builder.num_pageservers = 2
     neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+
+    neon_env_builder.storage_controller_config = {
+        # Publish long reconcile metric early
+        "long_reconcile_threshold": "5s",
+    }
+
     env = neon_env_builder.init_configs()
     env.start()
 
@@ -2648,6 +2654,12 @@ def test_storage_controller_proxy_during_migration(
     timeline_id = env.initial_timeline
     env.neon_cli.create_tenant(tenant_id, timeline_id)
 
+    # The test stalls a reconcile on purpose to check if the long running
+    # reconcile alert fires.
+    env.storage_controller.allowed_errors.extend(
+        [".*Reconcile passed the long running threshold.*"]
+    )
+
     # Activate a failpoint that will cause live migration to get stuck _after_ the generation has been issued
     # to the new pageserver: this should result in requests routed to the new pageserver.
     env.storage_controller.configure_failpoints((migration_failpoint.value, "pause"))
 
@@ -2655,6 +2667,24 @@ def test_storage_controller_proxy_during_migration(
     origin_pageserver = env.get_tenant_pageserver(tenant_id)
     dest_ps_id = [p.id for p in env.pageservers if p.id != origin_pageserver.id][0]
 
+    def long_migration_metric_published():
+        assert (
+            env.storage_controller.get_metric_value(
+                "storage_controller_reconcile_long_running_total",
+                filter={"tenant_id": str(tenant_id), "shard_number": "0"},
+            )
+            == 1
+        )
+
+    def assert_long_migration_metric_not_published():
+        assert (
+            env.storage_controller.get_metric_value(
+                "storage_controller_reconcile_long_running_total",
+                filter={"tenant_id": str(tenant_id), "shard_number": "0"},
+            )
+            is None
+        )
+
     try:
         with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
             migrate_fut = executor.submit(
@@ -2685,9 +2715,14 @@ def test_storage_controller_proxy_during_migration(
             # We expect request to land on the origin
             assert tenant_info["generation"] == 1
 
+            wait_until(10, 1, long_migration_metric_published)
+
             # Eventually migration completes
            env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
             migrate_fut.result()
+
+            assert_long_migration_metric_not_published()
+
     except:
         # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown
         env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))