mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-19 06:00:38 +00:00
storcon: use Duration for duration's in the storage controller tenant config (#10928)
## Problem The storage controller treats durations in the tenant config as strings. These are loaded from the db. The pageserver maps these durations to a seconds only format and we always get a mismatch compared to what's in the db. ## Summary of changes Treat durations as durations inside the storage controller and not as strings. Nothing changes in the cross service API's themselves or the way things are stored in the db. I also added some logging which I would have made the investigation a 10min job: 1. Reason for why the reconciliation was spawned 2. Location config diff between the observed and wanted states
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
use crate::pageserver_client::PageserverClient;
|
||||
use crate::persistence::Persistence;
|
||||
use crate::{compute_hook, service};
|
||||
use json_structural_diff::JsonDiff;
|
||||
use pageserver_api::controller_api::{AvailabilityZone, MigrationConfig, PlacementPolicy};
|
||||
use pageserver_api::models::{
|
||||
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, TenantWaitLsnRequest,
|
||||
@@ -24,7 +25,7 @@ use crate::compute_hook::{ComputeHook, NotifyError};
|
||||
use crate::node::Node;
|
||||
use crate::tenant_shard::{IntentState, ObservedState, ObservedStateDelta, ObservedStateLocation};
|
||||
|
||||
const DEFAULT_HEATMAP_PERIOD: &str = "60s";
|
||||
const DEFAULT_HEATMAP_PERIOD: Duration = Duration::from_secs(60);
|
||||
|
||||
/// Object with the lifetime of the background reconcile task that is created
|
||||
/// for tenants which have a difference between their intent and observed states.
|
||||
@@ -880,7 +881,27 @@ impl Reconciler {
|
||||
self.generation = Some(generation);
|
||||
wanted_conf.generation = generation.into();
|
||||
}
|
||||
tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
|
||||
|
||||
let diff = match observed {
|
||||
Some(ObservedStateLocation {
|
||||
conf: Some(observed),
|
||||
}) => {
|
||||
let diff = JsonDiff::diff(
|
||||
&serde_json::to_value(observed.clone()).unwrap(),
|
||||
&serde_json::to_value(wanted_conf.clone()).unwrap(),
|
||||
false,
|
||||
);
|
||||
|
||||
if let Some(json_diff) = diff.diff {
|
||||
serde_json::to_string(&json_diff).unwrap_or("diff err".to_string())
|
||||
} else {
|
||||
"unknown".to_string()
|
||||
}
|
||||
}
|
||||
_ => "full".to_string(),
|
||||
};
|
||||
|
||||
tracing::info!(node_id=%node.get_id(), "Observed configuration requires update: {diff}");
|
||||
|
||||
// Because `node` comes from a ref to &self, clone it before calling into a &mut self
|
||||
// function: this could be avoided by refactoring the state mutated by location_config into
|
||||
@@ -1180,7 +1201,7 @@ fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig
|
||||
let mut config = config.clone();
|
||||
if has_secondaries {
|
||||
if config.heatmap_period.is_none() {
|
||||
config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string());
|
||||
config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD);
|
||||
}
|
||||
} else {
|
||||
config.heatmap_period = None;
|
||||
|
||||
@@ -2926,7 +2926,9 @@ impl Service {
|
||||
first
|
||||
};
|
||||
|
||||
let updated_config = base.apply_patch(patch);
|
||||
let updated_config = base
|
||||
.apply_patch(patch)
|
||||
.map_err(|err| ApiError::BadRequest(anyhow::anyhow!(err)))?;
|
||||
self.set_tenant_config_and_reconcile(tenant_id, updated_config)
|
||||
.await
|
||||
}
|
||||
@@ -6654,11 +6656,12 @@ impl Service {
|
||||
) -> Option<ReconcilerWaiter> {
|
||||
let reconcile_needed = shard.get_reconcile_needed(nodes);
|
||||
|
||||
match reconcile_needed {
|
||||
let reconcile_reason = match reconcile_needed {
|
||||
ReconcileNeeded::No => return None,
|
||||
ReconcileNeeded::WaitExisting(waiter) => return Some(waiter),
|
||||
ReconcileNeeded::Yes => {
|
||||
ReconcileNeeded::Yes(reason) => {
|
||||
// Fall through to try and acquire units for spawning reconciler
|
||||
reason
|
||||
}
|
||||
};
|
||||
|
||||
@@ -6697,6 +6700,7 @@ impl Service {
|
||||
};
|
||||
|
||||
shard.spawn_reconciler(
|
||||
reconcile_reason,
|
||||
&self.result_tx,
|
||||
nodes,
|
||||
&self.compute_hook,
|
||||
|
||||
@@ -481,7 +481,14 @@ pub(crate) enum ReconcileNeeded {
|
||||
/// spawned: wait for the existing reconciler rather than spawning a new one.
|
||||
WaitExisting(ReconcilerWaiter),
|
||||
/// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`]
|
||||
Yes,
|
||||
Yes(ReconcileReason),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum ReconcileReason {
|
||||
ActiveNodesDirty,
|
||||
UnknownLocation,
|
||||
PendingComputeNotification,
|
||||
}
|
||||
|
||||
/// Pending modification to the observed state of a tenant shard.
|
||||
@@ -1341,12 +1348,18 @@ impl TenantShard {
|
||||
|
||||
let active_nodes_dirty = self.dirty(pageservers);
|
||||
|
||||
// Even if there is no pageserver work to be done, if we have a pending notification to computes,
|
||||
// wake up a reconciler to send it.
|
||||
let do_reconcile =
|
||||
active_nodes_dirty || dirty_observed || self.pending_compute_notification;
|
||||
let reconcile_needed = match (
|
||||
active_nodes_dirty,
|
||||
dirty_observed,
|
||||
self.pending_compute_notification,
|
||||
) {
|
||||
(true, _, _) => ReconcileNeeded::Yes(ReconcileReason::ActiveNodesDirty),
|
||||
(_, true, _) => ReconcileNeeded::Yes(ReconcileReason::UnknownLocation),
|
||||
(_, _, true) => ReconcileNeeded::Yes(ReconcileReason::PendingComputeNotification),
|
||||
_ => ReconcileNeeded::No,
|
||||
};
|
||||
|
||||
if !do_reconcile {
|
||||
if matches!(reconcile_needed, ReconcileNeeded::No) {
|
||||
tracing::debug!("Not dirty, no reconciliation needed.");
|
||||
return ReconcileNeeded::No;
|
||||
}
|
||||
@@ -1389,7 +1402,7 @@ impl TenantShard {
|
||||
}
|
||||
}
|
||||
|
||||
ReconcileNeeded::Yes
|
||||
reconcile_needed
|
||||
}
|
||||
|
||||
/// Ensure the sequence number is set to a value where waiting for this value will make us wait
|
||||
@@ -1479,6 +1492,7 @@ impl TenantShard {
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
||||
pub(crate) fn spawn_reconciler(
|
||||
&mut self,
|
||||
reason: ReconcileReason,
|
||||
result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResultRequest>,
|
||||
pageservers: &Arc<HashMap<NodeId, Node>>,
|
||||
compute_hook: &Arc<ComputeHook>,
|
||||
@@ -1538,7 +1552,7 @@ impl TenantShard {
|
||||
let reconcile_seq = self.sequence;
|
||||
let long_reconcile_threshold = service_config.long_reconcile_threshold;
|
||||
|
||||
tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence);
|
||||
tracing::info!(seq=%reconcile_seq, "Spawning Reconciler ({reason:?})");
|
||||
let must_notify = self.pending_compute_notification;
|
||||
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
|
||||
tenant_id=%reconciler.tenant_shard_id.tenant_id,
|
||||
|
||||
Reference in New Issue
Block a user