diff --git a/Cargo.lock b/Cargo.lock index f62026696e..f0dbdff3ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1874,6 +1874,12 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + [[package]] name = "digest" version = "0.10.7" @@ -3331,6 +3337,17 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "json-structural-diff" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e878e36a8a44c158505c2c818abdc1350413ad83dcb774a0459f6a7ef2b65cbf" +dependencies = [ + "difflib", + "regex", + "serde_json", +] + [[package]] name = "jsonwebtoken" version = "9.2.0" @@ -6443,6 +6460,7 @@ dependencies = [ "humantime", "hyper 0.14.30", "itertools 0.10.5", + "json-structural-diff", "lasso", "measured", "metrics", diff --git a/Cargo.toml b/Cargo.toml index 7228623c6b..21310ce6ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -210,6 +210,7 @@ rustls-native-certs = "0.8" x509-parser = "0.16" whoami = "1.5.1" zerocopy = { version = "0.7", features = ["derive"] } +json-structural-diff = { version = "0.2.0" } ## TODO replace this with tracing env_logger = "0.10" diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 28d130d9e0..2bf89b7bfa 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -335,13 +335,21 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'checkpoint_distance' as an integer")?, - checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()), + checkpoint_timeout: settings + .remove("checkpoint_timeout") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'checkpoint_timeout' as duration")?, compaction_target_size: settings .remove("compaction_target_size") .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_target_size' as an integer")?, - compaction_period: settings.remove("compaction_period").map(|x| x.to_string()), + compaction_period: settings + .remove("compaction_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'compaction_period' as duration")?, compaction_threshold: settings .remove("compaction_threshold") .map(|x| x.parse::()) @@ -387,7 +395,10 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_horizon' as an integer")?, - gc_period: settings.remove("gc_period").map(|x| x.to_string()), + gc_period: settings.remove("gc_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'gc_period' as duration")?, image_creation_threshold: settings .remove("image_creation_threshold") .map(|x| x.parse::()) @@ -403,13 +414,20 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'image_creation_preempt_threshold' as integer")?, - pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), + pitr_interval: settings.remove("pitr_interval") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'pitr_interval' as duration")?, walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'walreceiver_connect_timeout' as duration")?, lagging_wal_timeout: settings .remove("lagging_wal_timeout") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lagging_wal_timeout' as duration")?, max_lsn_wal_lag: settings .remove("max_lsn_wal_lag") .map(|x| x.parse::()) @@ -427,8 +445,14 @@ impl PageServerNode { .context("Failed to parse 'min_resident_size_override' as integer")?, evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") - .map(|x| x.to_string()), - heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'evictions_low_residence_duration_metric_threshold' as duration")?, + heatmap_period: settings + .remove("heatmap_period") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'heatmap_period' as duration")?, lazy_slru_download: settings .remove("lazy_slru_download") .map(|x| x.parse::()) @@ -439,10 +463,15 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, - lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()), + lsn_lease_length: settings.remove("lsn_lease_length") + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lsn_lease_length' as duration")?, lsn_lease_length_for_ts: settings .remove("lsn_lease_length_for_ts") - .map(|x| x.to_string()), + .map(humantime::parse_duration) + .transpose() + .context("Failed to parse 'lsn_lease_length_for_ts' as duration")?, timeline_offloading: settings .remove("timeline_offloading") .map(|x| x.parse::()) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 953ade83ad..40b86e4110 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -959,7 +959,7 @@ async fn main() -> anyhow::Result<()> { threshold: threshold.into(), }, )), - heatmap_period: Some("300s".to_string()), + heatmap_period: Some(Duration::from_secs(300)), ..Default::default() }, }) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index dd7bea2916..1164048229 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -526,9 +526,13 @@ pub struct TenantConfigPatch { #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)] pub struct TenantConfig { pub checkpoint_distance: Option, - pub checkpoint_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub checkpoint_timeout: Option, pub compaction_target_size: Option, - pub compaction_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub compaction_period: Option, pub compaction_threshold: Option, pub compaction_upper_limit: Option, // defer parsing compaction_algorithm, like eviction_policy @@ -539,22 +543,38 @@ pub struct TenantConfig { pub l0_flush_stall_threshold: Option, pub l0_flush_wait_upload: Option, pub gc_horizon: Option, - pub gc_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub gc_period: Option, pub image_creation_threshold: Option, - pub pitr_interval: Option, - pub walreceiver_connect_timeout: Option, - pub lagging_wal_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub pitr_interval: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub walreceiver_connect_timeout: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, pub eviction_policy: Option, pub min_resident_size_override: Option, - pub evictions_low_residence_duration_metric_threshold: Option, - pub heatmap_period: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub evictions_low_residence_duration_metric_threshold: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub heatmap_period: Option, pub lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, pub image_creation_preempt_threshold: Option, - pub lsn_lease_length: Option, - pub lsn_lease_length_for_ts: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lsn_lease_length: Option, + #[serde(default)] + #[serde(with = "humantime_serde")] + pub lsn_lease_length_for_ts: Option, pub timeline_offloading: Option, pub wal_receiver_protocol_override: Option, pub rel_size_v2_enabled: Option, @@ -564,7 +584,10 @@ pub struct TenantConfig { } impl TenantConfig { - pub fn apply_patch(self, patch: TenantConfigPatch) -> TenantConfig { + pub fn apply_patch( + self, + patch: TenantConfigPatch, + ) -> Result { let Self { mut checkpoint_distance, mut checkpoint_timeout, @@ -604,11 +627,17 @@ impl TenantConfig { } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); - patch.checkpoint_timeout.apply(&mut checkpoint_timeout); + patch + .checkpoint_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut checkpoint_timeout); patch .compaction_target_size .apply(&mut compaction_target_size); - patch.compaction_period.apply(&mut compaction_period); + patch + .compaction_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut compaction_period); patch.compaction_threshold.apply(&mut compaction_threshold); patch .compaction_upper_limit @@ -626,15 +655,25 @@ impl TenantConfig { .apply(&mut l0_flush_stall_threshold); patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload); patch.gc_horizon.apply(&mut gc_horizon); - patch.gc_period.apply(&mut gc_period); + patch + .gc_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut gc_period); patch .image_creation_threshold .apply(&mut image_creation_threshold); - patch.pitr_interval.apply(&mut pitr_interval); + patch + .pitr_interval + .map(|v| humantime::parse_duration(&v))? + .apply(&mut pitr_interval); patch .walreceiver_connect_timeout + .map(|v| humantime::parse_duration(&v))? .apply(&mut walreceiver_connect_timeout); - patch.lagging_wal_timeout.apply(&mut lagging_wal_timeout); + patch + .lagging_wal_timeout + .map(|v| humantime::parse_duration(&v))? + .apply(&mut lagging_wal_timeout); patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag); patch.eviction_policy.apply(&mut eviction_policy); patch @@ -642,8 +681,12 @@ impl TenantConfig { .apply(&mut min_resident_size_override); patch .evictions_low_residence_duration_metric_threshold + .map(|v| humantime::parse_duration(&v))? .apply(&mut evictions_low_residence_duration_metric_threshold); - patch.heatmap_period.apply(&mut heatmap_period); + patch + .heatmap_period + .map(|v| humantime::parse_duration(&v))? + .apply(&mut heatmap_period); patch.lazy_slru_download.apply(&mut lazy_slru_download); patch .timeline_get_throttle @@ -654,9 +697,13 @@ impl TenantConfig { patch .image_creation_preempt_threshold .apply(&mut image_creation_preempt_threshold); - patch.lsn_lease_length.apply(&mut lsn_lease_length); + patch + .lsn_lease_length + .map(|v| humantime::parse_duration(&v))? + .apply(&mut lsn_lease_length); patch .lsn_lease_length_for_ts + .map(|v| humantime::parse_duration(&v))? .apply(&mut lsn_lease_length_for_ts); patch.timeline_offloading.apply(&mut timeline_offloading); patch @@ -673,7 +720,7 @@ impl TenantConfig { .gc_compaction_ratio_percent .apply(&mut gc_compaction_ratio_percent); - Self { + Ok(Self { checkpoint_distance, checkpoint_timeout, compaction_target_size, @@ -709,7 +756,7 @@ impl TenantConfig { gc_compaction_enabled, gc_compaction_initial_threshold_kb, gc_compaction_ratio_percent, - } + }) } } @@ -2503,7 +2550,7 @@ mod tests { ..base.clone() }; - let patched = base.apply_patch(decoded.config); + let patched = base.apply_patch(decoded.config).unwrap(); assert_eq!(patched, expected); } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index c6bcfdf2fb..ab4c4c935d 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -693,16 +693,15 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { /// This is a conversion from our internal tenant config object to the one used /// in external APIs. impl From for models::TenantConfig { + // TODO(vlad): These are now the same, but they have different serialization logic. + // Can we merge them? fn from(value: TenantConfOpt) -> Self { - fn humantime(d: Duration) -> String { - format!("{}s", d.as_secs()) - } Self { checkpoint_distance: value.checkpoint_distance, - checkpoint_timeout: value.checkpoint_timeout.map(humantime), + checkpoint_timeout: value.checkpoint_timeout, compaction_algorithm: value.compaction_algorithm, compaction_target_size: value.compaction_target_size, - compaction_period: value.compaction_period.map(humantime), + compaction_period: value.compaction_period, compaction_threshold: value.compaction_threshold, compaction_upper_limit: value.compaction_upper_limit, compaction_l0_first: value.compaction_l0_first, @@ -711,24 +710,23 @@ impl From for models::TenantConfig { l0_flush_stall_threshold: value.l0_flush_stall_threshold, l0_flush_wait_upload: value.l0_flush_wait_upload, gc_horizon: value.gc_horizon, - gc_period: value.gc_period.map(humantime), + gc_period: value.gc_period, image_creation_threshold: value.image_creation_threshold, - pitr_interval: value.pitr_interval.map(humantime), - walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime), - lagging_wal_timeout: value.lagging_wal_timeout.map(humantime), + pitr_interval: value.pitr_interval, + walreceiver_connect_timeout: value.walreceiver_connect_timeout, + lagging_wal_timeout: value.lagging_wal_timeout, max_lsn_wal_lag: value.max_lsn_wal_lag, eviction_policy: value.eviction_policy, min_resident_size_override: value.min_resident_size_override, evictions_low_residence_duration_metric_threshold: value - .evictions_low_residence_duration_metric_threshold - .map(humantime), - heatmap_period: value.heatmap_period.map(humantime), + .evictions_low_residence_duration_metric_threshold, + heatmap_period: value.heatmap_period, lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle, image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, image_creation_preempt_threshold: value.image_creation_preempt_threshold, - lsn_lease_length: value.lsn_lease_length.map(humantime), - lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), + lsn_lease_length: value.lsn_lease_length, + lsn_lease_length_for_ts: value.lsn_lease_length_for_ts, timeline_offloading: value.timeline_offloading, wal_receiver_protocol_override: value.wal_receiver_protocol_override, rel_size_v2_enabled: value.rel_size_v2_enabled, @@ -760,29 +758,10 @@ mod tests { assert_eq!(small_conf, serde_json::from_str(&json_form).unwrap()); } - #[test] - fn test_try_from_models_tenant_config_err() { - let tenant_config = models::TenantConfig { - lagging_wal_timeout: Some("5a".to_string()), - ..TenantConfig::default() - }; - - let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config); - - assert!( - tenant_conf_opt.is_err(), - "Suceeded to convert TenantConfig to TenantConfOpt" - ); - - let expected_error_str = - "lagging_wal_timeout: invalid value: string \"5a\", expected a duration"; - assert_eq!(tenant_conf_opt.unwrap_err().to_string(), expected_error_str); - } - #[test] fn test_try_from_models_tenant_config_success() { let tenant_config = models::TenantConfig { - lagging_wal_timeout: Some("5s".to_string()), + lagging_wal_timeout: Some(Duration::from_secs(5)), ..TenantConfig::default() }; diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 73dc1a5c10..08c80bc141 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -24,6 +24,7 @@ hex.workspace = true hyper0.workspace = true humantime.workspace = true itertools.workspace = true +json-structural-diff.workspace = true lasso.workspace = true once_cell.workspace = true pageserver_api.workspace = true diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 48f0804926..ba3946fa3e 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,6 +1,7 @@ use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::{compute_hook, service}; +use json_structural_diff::JsonDiff; use pageserver_api::controller_api::{AvailabilityZone, MigrationConfig, PlacementPolicy}; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, TenantWaitLsnRequest, @@ -24,7 +25,7 @@ use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; use crate::tenant_shard::{IntentState, ObservedState, ObservedStateDelta, ObservedStateLocation}; -const DEFAULT_HEATMAP_PERIOD: &str = "60s"; +const DEFAULT_HEATMAP_PERIOD: Duration = Duration::from_secs(60); /// Object with the lifetime of the background reconcile task that is created /// for tenants which have a difference between their intent and observed states. @@ -880,7 +881,27 @@ impl Reconciler { self.generation = Some(generation); wanted_conf.generation = generation.into(); } - tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + + let diff = match observed { + Some(ObservedStateLocation { + conf: Some(observed), + }) => { + let diff = JsonDiff::diff( + &serde_json::to_value(observed.clone()).unwrap(), + &serde_json::to_value(wanted_conf.clone()).unwrap(), + false, + ); + + if let Some(json_diff) = diff.diff { + serde_json::to_string(&json_diff).unwrap_or("diff err".to_string()) + } else { + "unknown".to_string() + } + } + _ => "full".to_string(), + }; + + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update: {diff}"); // Because `node` comes from a ref to &self, clone it before calling into a &mut self // function: this could be avoided by refactoring the state mutated by location_config into @@ -1180,7 +1201,7 @@ fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig let mut config = config.clone(); if has_secondaries { if config.heatmap_period.is_none() { - config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string()); + config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD); } } else { config.heatmap_period = None; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 25a1cb4252..01ace40b84 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2921,7 +2921,9 @@ impl Service { first }; - let updated_config = base.apply_patch(patch); + let updated_config = base + .apply_patch(patch) + .map_err(|err| ApiError::BadRequest(anyhow::anyhow!(err)))?; self.set_tenant_config_and_reconcile(tenant_id, updated_config) .await } @@ -6649,11 +6651,12 @@ impl Service { ) -> Option { let reconcile_needed = shard.get_reconcile_needed(nodes); - match reconcile_needed { + let reconcile_reason = match reconcile_needed { ReconcileNeeded::No => return None, ReconcileNeeded::WaitExisting(waiter) => return Some(waiter), - ReconcileNeeded::Yes => { + ReconcileNeeded::Yes(reason) => { // Fall through to try and acquire units for spawning reconciler + reason } }; @@ -6692,6 +6695,7 @@ impl Service { }; shard.spawn_reconciler( + reconcile_reason, &self.result_tx, nodes, &self.compute_hook, diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 219c0dffe7..56a36dc2df 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -481,7 +481,14 @@ pub(crate) enum ReconcileNeeded { /// spawned: wait for the existing reconciler rather than spawning a new one. WaitExisting(ReconcilerWaiter), /// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`] - Yes, + Yes(ReconcileReason), +} + +#[derive(Debug)] +pub(crate) enum ReconcileReason { + ActiveNodesDirty, + UnknownLocation, + PendingComputeNotification, } /// Pending modification to the observed state of a tenant shard. @@ -1341,12 +1348,18 @@ impl TenantShard { let active_nodes_dirty = self.dirty(pageservers); - // Even if there is no pageserver work to be done, if we have a pending notification to computes, - // wake up a reconciler to send it. - let do_reconcile = - active_nodes_dirty || dirty_observed || self.pending_compute_notification; + let reconcile_needed = match ( + active_nodes_dirty, + dirty_observed, + self.pending_compute_notification, + ) { + (true, _, _) => ReconcileNeeded::Yes(ReconcileReason::ActiveNodesDirty), + (_, true, _) => ReconcileNeeded::Yes(ReconcileReason::UnknownLocation), + (_, _, true) => ReconcileNeeded::Yes(ReconcileReason::PendingComputeNotification), + _ => ReconcileNeeded::No, + }; - if !do_reconcile { + if matches!(reconcile_needed, ReconcileNeeded::No) { tracing::debug!("Not dirty, no reconciliation needed."); return ReconcileNeeded::No; } @@ -1389,7 +1402,7 @@ impl TenantShard { } } - ReconcileNeeded::Yes + reconcile_needed } /// Ensure the sequence number is set to a value where waiting for this value will make us wait @@ -1479,6 +1492,7 @@ impl TenantShard { #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn spawn_reconciler( &mut self, + reason: ReconcileReason, result_tx: &tokio::sync::mpsc::UnboundedSender, pageservers: &Arc>, compute_hook: &Arc, @@ -1538,7 +1552,7 @@ impl TenantShard { let reconcile_seq = self.sequence; let long_reconcile_threshold = service_config.long_reconcile_threshold; - tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence); + tracing::info!(seq=%reconcile_seq, "Spawning Reconciler ({reason:?})"); let must_notify = self.pending_compute_notification; let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq, tenant_id=%reconciler.tenant_shard_id.tenant_id, diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 7e895422d2..d18cbb3393 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3817,3 +3817,43 @@ def test_update_node_on_registration(neon_env_builder: NeonEnvBuilder): nodes = env.storage_controller.node_list() assert len(nodes) == 1 assert nodes[0]["listen_https_port"] is None + + +def test_storage_controller_location_conf_equivalence(neon_env_builder: NeonEnvBuilder): + """ + Validate that a storage controller restart with no shards in a transient state + performs zero reconciliations at start-up. Implicitly, this means that the location + configs returned by the pageserver are identical to the persisted state in the + storage controller database. + """ + neon_env_builder.num_pageservers = 1 + neon_env_builder.storage_controller_config = { + "start_as_candidate": False, + } + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + env.storage_controller.tenant_create( + tenant_id, shard_count=2, tenant_config={"pitr_interval": "1h2m3s"} + ) + + env.storage_controller.reconcile_until_idle() + + reconciles_before_restart = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + + assert reconciles_before_restart != 0 + + env.storage_controller.stop() + env.storage_controller.start() + + env.storage_controller.reconcile_until_idle() + + reconciles_after_restart = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + + assert reconciles_after_restart == 0