storcon: use Duration for duration's in the storage controller tenant config (#10928)

## Problem

The storage controller treats durations in the tenant config as strings.
These are loaded from the db.
The pageserver maps these durations to a seconds only format and we
always get a mismatch compared
to what's in the db.

## Summary of changes

Treat durations as durations inside the storage controller and not as
strings.
Nothing changes in the cross service API's themselves or the way things
are stored in the db.

I also added some logging which I would have made the investigation a
10min job:
1. Reason for why the reconciliation was spawned
2. Location config diff between the observed and wanted states
This commit is contained in:
Vlad Lazar
2025-02-21 15:45:00 +00:00
committed by GitHub
parent 5e3c234edc
commit 3e82addd64
11 changed files with 234 additions and 80 deletions

View File

@@ -1,6 +1,7 @@
use crate::pageserver_client::PageserverClient;
use crate::persistence::Persistence;
use crate::{compute_hook, service};
use json_structural_diff::JsonDiff;
use pageserver_api::controller_api::{AvailabilityZone, MigrationConfig, PlacementPolicy};
use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, TenantWaitLsnRequest,
@@ -24,7 +25,7 @@ use crate::compute_hook::{ComputeHook, NotifyError};
use crate::node::Node;
use crate::tenant_shard::{IntentState, ObservedState, ObservedStateDelta, ObservedStateLocation};
const DEFAULT_HEATMAP_PERIOD: &str = "60s";
const DEFAULT_HEATMAP_PERIOD: Duration = Duration::from_secs(60);
/// Object with the lifetime of the background reconcile task that is created
/// for tenants which have a difference between their intent and observed states.
@@ -880,7 +881,27 @@ impl Reconciler {
self.generation = Some(generation);
wanted_conf.generation = generation.into();
}
tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
let diff = match observed {
Some(ObservedStateLocation {
conf: Some(observed),
}) => {
let diff = JsonDiff::diff(
&serde_json::to_value(observed.clone()).unwrap(),
&serde_json::to_value(wanted_conf.clone()).unwrap(),
false,
);
if let Some(json_diff) = diff.diff {
serde_json::to_string(&json_diff).unwrap_or("diff err".to_string())
} else {
"unknown".to_string()
}
}
_ => "full".to_string(),
};
tracing::info!(node_id=%node.get_id(), "Observed configuration requires update: {diff}");
// Because `node` comes from a ref to &self, clone it before calling into a &mut self
// function: this could be avoided by refactoring the state mutated by location_config into
@@ -1180,7 +1201,7 @@ fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig
let mut config = config.clone();
if has_secondaries {
if config.heatmap_period.is_none() {
config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string());
config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD);
}
} else {
config.heatmap_period = None;

View File

@@ -2926,7 +2926,9 @@ impl Service {
first
};
let updated_config = base.apply_patch(patch);
let updated_config = base
.apply_patch(patch)
.map_err(|err| ApiError::BadRequest(anyhow::anyhow!(err)))?;
self.set_tenant_config_and_reconcile(tenant_id, updated_config)
.await
}
@@ -6654,11 +6656,12 @@ impl Service {
) -> Option<ReconcilerWaiter> {
let reconcile_needed = shard.get_reconcile_needed(nodes);
match reconcile_needed {
let reconcile_reason = match reconcile_needed {
ReconcileNeeded::No => return None,
ReconcileNeeded::WaitExisting(waiter) => return Some(waiter),
ReconcileNeeded::Yes => {
ReconcileNeeded::Yes(reason) => {
// Fall through to try and acquire units for spawning reconciler
reason
}
};
@@ -6697,6 +6700,7 @@ impl Service {
};
shard.spawn_reconciler(
reconcile_reason,
&self.result_tx,
nodes,
&self.compute_hook,

View File

@@ -481,7 +481,14 @@ pub(crate) enum ReconcileNeeded {
/// spawned: wait for the existing reconciler rather than spawning a new one.
WaitExisting(ReconcilerWaiter),
/// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`]
Yes,
Yes(ReconcileReason),
}
#[derive(Debug)]
pub(crate) enum ReconcileReason {
ActiveNodesDirty,
UnknownLocation,
PendingComputeNotification,
}
/// Pending modification to the observed state of a tenant shard.
@@ -1341,12 +1348,18 @@ impl TenantShard {
let active_nodes_dirty = self.dirty(pageservers);
// Even if there is no pageserver work to be done, if we have a pending notification to computes,
// wake up a reconciler to send it.
let do_reconcile =
active_nodes_dirty || dirty_observed || self.pending_compute_notification;
let reconcile_needed = match (
active_nodes_dirty,
dirty_observed,
self.pending_compute_notification,
) {
(true, _, _) => ReconcileNeeded::Yes(ReconcileReason::ActiveNodesDirty),
(_, true, _) => ReconcileNeeded::Yes(ReconcileReason::UnknownLocation),
(_, _, true) => ReconcileNeeded::Yes(ReconcileReason::PendingComputeNotification),
_ => ReconcileNeeded::No,
};
if !do_reconcile {
if matches!(reconcile_needed, ReconcileNeeded::No) {
tracing::debug!("Not dirty, no reconciliation needed.");
return ReconcileNeeded::No;
}
@@ -1389,7 +1402,7 @@ impl TenantShard {
}
}
ReconcileNeeded::Yes
reconcile_needed
}
/// Ensure the sequence number is set to a value where waiting for this value will make us wait
@@ -1479,6 +1492,7 @@ impl TenantShard {
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub(crate) fn spawn_reconciler(
&mut self,
reason: ReconcileReason,
result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResultRequest>,
pageservers: &Arc<HashMap<NodeId, Node>>,
compute_hook: &Arc<ComputeHook>,
@@ -1538,7 +1552,7 @@ impl TenantShard {
let reconcile_seq = self.sequence;
let long_reconcile_threshold = service_config.long_reconcile_threshold;
tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence);
tracing::info!(seq=%reconcile_seq, "Spawning Reconciler ({reason:?})");
let must_notify = self.pending_compute_notification;
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
tenant_id=%reconciler.tenant_shard_id.tenant_id,