grand refactor of SmgrOpTimer states

This commit is contained in:
Christian Schwarz
2025-01-10 19:47:27 +01:00
parent ad5120197c
commit d6a2b62cfb
6 changed files with 171 additions and 249 deletions

View File

@@ -1233,6 +1233,8 @@ pub(crate) struct SmgrOpTimerInner {
global_flush_in_progress_micros: IntCounter,
per_timeline_flush_in_progress_micros: IntCounter,
throttling: Arc<tenant_throttling::Pagestream>,
timings: SmgrOpTimerState,
}
@@ -1245,132 +1247,105 @@ enum SmgrOpTimerState {
#[allow(dead_code)]
received_at: Instant,
},
ParsedRoutedThrottledNowBatching {
ParsedRoutedNowThrottling {
throttle_started_at: Instant,
},
ParsedRoutedThrottledNowBatching {
throttle_done_at: Instant,
},
BatchedNowExecuting {
throttle_started_at: Instant,
throttle_done_at: Instant,
execution_started_at: Instant,
},
Flushing,
// NB: when adding observation points, remember to update the Drop impl.
}
pub(crate) struct SmgrOpFlushInProgress {
flush_started_at: Instant,
global_micros: IntCounter,
per_timeline_micros: IntCounter,
}
// NB: when adding observation points, remember to update the Drop impl.
impl SmgrOpTimer {
pub(crate) fn observe_throttle_done(&mut self, throttle: &ThrottleResult) {
let inner = self.0.as_mut().expect("other public methods consume self");
match (&mut inner.timings, throttle) {
(SmgrOpTimerState::Received { received_at: _ }, throttle) => match throttle {
ThrottleResult::NotThrottled { start } => {
inner.timings = SmgrOpTimerState::ParsedRoutedThrottledNowBatching {
throttle_started_at: *start,
throttle_done_at: *start,
};
}
ThrottleResult::Throttled { start, end } => {
inner.timings = SmgrOpTimerState::ParsedRoutedThrottledNowBatching {
throttle_started_at: *start,
throttle_done_at: *end,
};
}
},
(x, _) => panic!("called in unexpected state: {x:?}"),
pub(crate) fn observe_throttle_start(&mut self, at: Instant) {
let Some(inner) = self.0.as_mut() else {
return;
};
let SmgrOpTimerState::Received { received_at: _ } = &mut inner.timings else {
return;
};
inner.throttling.count_accounted_start.inc();
inner.timings = SmgrOpTimerState::ParsedRoutedNowThrottling {
throttle_started_at: at,
};
}
pub(crate) fn observe_throttle_done(&mut self, throttle: ThrottleResult) {
let Some(inner) = self.0.as_mut() else {
return;
};
let SmgrOpTimerState::ParsedRoutedNowThrottling {
throttle_started_at,
} = &mut inner.timings
else {
return;
};
inner.throttling.count_accounted_finish.inc();
match throttle {
ThrottleResult::NotThrottled { end } => {
inner.timings = SmgrOpTimerState::ParsedRoutedThrottledNowBatching {
throttle_done_at: end,
};
}
ThrottleResult::Throttled { end } => {
// update metrics
inner.throttling.count_throttled.inc();
inner
.throttling
.wait_time
.inc_by((end - *throttle_started_at).as_micros().try_into().unwrap());
// state transition
inner.timings = SmgrOpTimerState::ParsedRoutedThrottledNowBatching {
throttle_done_at: end,
};
}
}
}
pub(crate) fn observe_execution_start(&mut self, at: Instant) {
let inner = self.0.as_mut().expect("other public methods consume self");
match &mut inner.timings {
SmgrOpTimerState::ParsedRoutedThrottledNowBatching {
throttle_started_at,
throttle_done_at,
} => {
inner.timings = SmgrOpTimerState::BatchedNowExecuting {
throttle_started_at: *throttle_started_at,
throttle_done_at: *throttle_done_at,
execution_started_at: at,
};
}
x => panic!("called in unexpected state: {x:?}"),
}
}
pub(crate) fn observe_execution_end_flush_start(mut self) -> SmgrOpFlushInProgress {
assert!(
matches!(
self.0.as_ref().unwrap().timings,
SmgrOpTimerState::BatchedNowExecuting { .. }
),
"called in unexpected state: {:?}",
self.0.as_ref().unwrap().timings,
);
let (flush_start, inner) = self
.smgr_op_end()
.expect("this method consume self, and the only other caller is drop handler");
let SmgrOpTimerInner {
global_flush_in_progress_micros,
per_timeline_flush_in_progress_micros,
..
} = inner;
SmgrOpFlushInProgress {
flush_started_at: flush_start,
global_micros: global_flush_in_progress_micros,
per_timeline_micros: per_timeline_flush_in_progress_micros,
}
}
/// Returns `None`` if this method has already been called, `Some` otherwise.
fn smgr_op_end(&mut self) -> Option<(Instant, SmgrOpTimerInner)> {
let inner = self.0.take()?;
let now = Instant::now();
// TODO: use label for unfinished requests instead of Duration::ZERO.
// This is quite rare in practice, only during tenant/pageservers shutdown.
let throttle;
let batch;
let execution;
match inner.timings {
SmgrOpTimerState::Received { received_at: _ } => {
throttle = Duration::ZERO;
batch = Duration::ZERO;
execution = Duration::ZERO;
}
SmgrOpTimerState::ParsedRoutedThrottledNowBatching {
throttle_started_at,
throttle_done_at,
} => {
throttle = throttle_done_at - throttle_started_at;
batch = Duration::ZERO;
execution = Duration::ZERO;
}
SmgrOpTimerState::BatchedNowExecuting {
throttle_started_at,
throttle_done_at,
execution_started_at,
} => {
throttle = throttle_done_at - throttle_started_at;
batch = throttle_done_at - execution_started_at;
execution = now - execution_started_at;
}
}
// update time spent in batching
let Some(inner) = self.0.as_mut() else {
return;
};
let SmgrOpTimerState::ParsedRoutedThrottledNowBatching { throttle_done_at } =
&mut inner.timings
else {
return;
};
// update metrics
let batch = at - *throttle_done_at;
inner.global_batch_wait_time.observe(batch.as_secs_f64());
inner
.per_timeline_batch_wait_time
.observe(batch.as_secs_f64());
// state transition
inner.timings = SmgrOpTimerState::BatchedNowExecuting {
execution_started_at: at,
}
}
// time spent in throttle metric is updated by throttle impl
let _ = throttle;
// update metrics for execution latency
/// For all but the first caller, this is a no-op.
/// The first callers receives Some, subsequent ones None.
pub(crate) fn observe_execution_end_flush_start(
&mut self,
at: Instant,
) -> Option<SmgrOpFlushInProgress> {
// NB: unlike the other observe_* methods, this one take()s.
#[allow(clippy::question_mark)] // maintain similar code pattern.
let Some(mut inner) = self.0.take() else {
return None;
};
let SmgrOpTimerState::BatchedNowExecuting {
execution_started_at,
} = &mut inner.timings
else {
return None;
};
// update metrics
let execution = at - *execution_started_at;
inner
.global_execution_latency_histo
.observe(execution.as_secs_f64());
@@ -1380,13 +1355,45 @@ impl SmgrOpTimer {
per_timeline_execution_latency_histo.observe(execution.as_secs_f64());
}
Some((now, inner))
// state transition
inner.timings = SmgrOpTimerState::Flushing;
// return the flush in progress object which
// will do the remaining metrics updates
let SmgrOpTimerInner {
global_flush_in_progress_micros,
per_timeline_flush_in_progress_micros,
..
} = inner;
Some(SmgrOpFlushInProgress {
flush_started_at: at,
global_micros: global_flush_in_progress_micros,
per_timeline_micros: per_timeline_flush_in_progress_micros,
})
}
}
pub(crate) struct SmgrOpFlushInProgress {
flush_started_at: Instant,
global_micros: IntCounter,
per_timeline_micros: IntCounter,
}
impl Drop for SmgrOpTimer {
fn drop(&mut self) {
self.smgr_op_end();
// In case of early drop, update any of the remaining metrics with
// observations so that (started,finished) counter pairs balance out
// and all counters on the latency path have the the same number of
// observations.
// It's technically lying and it would be better if each metric had
// a separate label or similar for cancelled requests.
// But we don't have that right now and counter pairs balancing
// out is useful when using the metrics in panels and whatnot.
let now = Instant::now();
self.observe_throttle_start(now);
self.observe_throttle_done(ThrottleResult::NotThrottled { end: now });
self.observe_execution_start(now);
self.observe_execution_end_flush_start(now);
}
}
@@ -1397,12 +1404,12 @@ impl SmgrOpFlushInProgress {
{
let mut fut = std::pin::pin!(fut);
let now = Instant::now();
// Whenever observe_guard gets called, or dropped,
// it adds the time elapsed since its last call to metrics.
// Last call is tracked in `now`.
let mut observe_guard = scopeguard::guard(
|| {
let now = Instant::now();
let elapsed = now - self.flush_started_at;
self.global_micros
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
@@ -1445,7 +1452,6 @@ pub enum SmgrQueryType {
GetSlruSegment,
}
#[derive(Debug)]
pub(crate) struct SmgrQueryTimePerTimeline {
global_started: [IntCounter; SmgrQueryType::COUNT],
global_latency: [Histogram; SmgrQueryType::COUNT],
@@ -1457,6 +1463,7 @@ pub(crate) struct SmgrQueryTimePerTimeline {
per_timeline_flush_in_progress_micros: IntCounter,
global_batch_wait_time: Histogram,
per_timeline_batch_wait_time: Histogram,
throttling: Arc<tenant_throttling::Pagestream>,
}
static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
@@ -1662,7 +1669,11 @@ static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy<Histogram> = Lazy::new(||
});
impl SmgrQueryTimePerTimeline {
pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
pub(crate) fn new(
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
pagestream_throttle_metrics: Arc<tenant_throttling::Pagestream>,
) -> Self {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_slug = format!("{}", tenant_shard_id.shard_slug());
let timeline_id = timeline_id.to_string();
@@ -1723,6 +1734,7 @@ impl SmgrQueryTimePerTimeline {
per_timeline_flush_in_progress_micros,
global_batch_wait_time,
per_timeline_batch_wait_time,
throttling: pagestream_throttle_metrics,
}
}
pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer {
@@ -1738,88 +1750,24 @@ impl SmgrQueryTimePerTimeline {
SmgrOpTimer(Some(SmgrOpTimerInner {
global_execution_latency_histo: self.global_latency[op as usize].clone(),
per_timeline_execution_latency_histo: per_timeline_latency_histo,
timings: SmgrOpTimerState::Received { received_at },
global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
per_timeline_flush_in_progress_micros: self
.per_timeline_flush_in_progress_micros
.clone(),
global_batch_wait_time: self.global_batch_wait_time.clone(),
per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(),
throttling: self.throttling.clone(),
timings: SmgrOpTimerState::Received { received_at },
}))
}
/// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer
pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
self.global_batch_size.observe(batch_size as f64);
self.per_timeline_batch_size.observe(batch_size as f64);
}
}
#[cfg(test)]
mod smgr_query_time_tests {
use std::time::Instant;
use pageserver_api::shard::TenantShardId;
use strum::IntoEnumIterator;
use utils::id::{TenantId, TimelineId};
// Regression test, we used hard-coded string constants before using an enum.
#[test]
fn op_label_name() {
use super::SmgrQueryType::*;
let expect: [(super::SmgrQueryType, &'static str); 5] = [
(GetRelExists, "get_rel_exists"),
(GetRelSize, "get_rel_size"),
(GetPageAtLsn, "get_page_at_lsn"),
(GetDbSize, "get_db_size"),
(GetSlruSegment, "get_slru_segment"),
];
for (op, expect) in expect {
let actual: &'static str = op.into();
assert_eq!(actual, expect);
}
}
#[test]
fn basic() {
let ops: Vec<_> = super::SmgrQueryType::iter().collect();
for op in &ops {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let metrics = super::SmgrQueryTimePerTimeline::new(
&TenantShardId::unsharded(tenant_id),
&timeline_id,
);
let get_counts = || {
let global: u64 = ops
.iter()
.map(|op| metrics.global_latency[*op as usize].get_sample_count())
.sum();
(
global,
metrics.per_timeline_getpage_latency.get_sample_count(),
)
};
let (pre_global, pre_per_tenant_timeline) = get_counts();
assert_eq!(pre_per_tenant_timeline, 0);
let timer = metrics.start_smgr_op(*op, Instant::now());
drop(timer);
let (post_global, post_per_tenant_timeline) = get_counts();
if matches!(op, super::SmgrQueryType::GetPageAtLsn) {
// getpage ops are tracked per-timeline, others aren't
assert_eq!(post_per_tenant_timeline, 1);
} else {
assert_eq!(post_per_tenant_timeline, 0);
}
assert!(post_global > pre_global);
}
}
}
// keep in sync with control plane Go code so that we can validate
// compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
@@ -3606,9 +3554,7 @@ pub(crate) mod tenant_throttling {
use once_cell::sync::Lazy;
use utils::shard::TenantShardId;
use crate::tenant::{self};
struct GlobalAndPerTenantIntCounter {
pub(crate) struct GlobalAndPerTenantIntCounter {
global: IntCounter,
per_tenant: IntCounter,
}
@@ -3626,10 +3572,10 @@ pub(crate) mod tenant_throttling {
}
pub(crate) struct Metrics<const KIND: usize> {
count_accounted_start: GlobalAndPerTenantIntCounter,
count_accounted_finish: GlobalAndPerTenantIntCounter,
wait_time: GlobalAndPerTenantIntCounter,
count_throttled: GlobalAndPerTenantIntCounter,
pub(super) count_accounted_start: GlobalAndPerTenantIntCounter,
pub(super) count_accounted_finish: GlobalAndPerTenantIntCounter,
pub(super) wait_time: GlobalAndPerTenantIntCounter,
pub(super) count_throttled: GlobalAndPerTenantIntCounter,
}
static COUNT_ACCOUNTED_START: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
@@ -3764,26 +3710,6 @@ pub(crate) mod tenant_throttling {
}
}
}
impl<const KIND: usize> tenant::throttle::Metric for Metrics<KIND> {
#[inline(always)]
fn accounting_start(&self) {
self.count_accounted_start.inc();
}
#[inline(always)]
fn accounting_finish(&self) {
self.count_accounted_finish.inc();
}
#[inline(always)]
fn observe_throttling(
&self,
tenant::throttle::Observation { wait_time }: &tenant::throttle::Observation,
) {
let val = u64::try_from(wait_time.as_micros()).unwrap();
self.wait_time.inc_by(val);
self.count_throttled.inc();
}
}
}
pub(crate) mod disk_usage_based_eviction {

View File

@@ -688,11 +688,13 @@ impl PageServerHandler {
// so that the _started counters are incremented before we do
// any serious waiting, e.g., for throttle, batching, or actual request handling.
let mut timer = shard.query_metrics.start_smgr_op(op, received_at);
let now = Instant::now();
timer.observe_throttle_start(now);
let throttled = tokio::select! {
res = shard.pagestream_throttle.throttle(1) => res,
res = shard.pagestream_throttle.throttle(1, now) => res,
_ = shard.cancel.cancelled() => return Err(QueryError::Shutdown),
};
timer.observe_throttle_done(&throttled);
timer.observe_throttle_done(throttled);
Ok(timer)
}
@@ -1092,7 +1094,11 @@ impl PageServerHandler {
// The timer's underlying metric is used for a storage-internal latency SLO and
// we don't want to include latency in it that we can't control.
// And as pointed out above, in this case, we don't control the time that flush will take.
let flushing_timer = timer.map(|timer| timer.observe_execution_end_flush_start());
let flushing_timer = timer.map(|mut timer| {
timer
.observe_execution_end_flush_start(Instant::now())
.expect("we are the first caller")
});
// what we want to do
let flush_fut = pgb_writer.flush();

View File

@@ -369,8 +369,9 @@ pub struct Tenant {
/// Throttle applied at the top of [`Timeline::get`].
/// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
pub(crate) pagestream_throttle:
Arc<throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
pub(crate) pagestream_throttle: Arc<throttle::Throttle>,
pub(crate) pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
/// An ongoing timeline detach concurrency limiter.
///
@@ -1691,6 +1692,7 @@ impl Tenant {
TimelineResources {
remote_client,
pagestream_throttle: self.pagestream_throttle.clone(),
pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
l0_flush_global_state: self.l0_flush_global_state.clone(),
},
LoadTimelineCause::Attach,
@@ -4084,6 +4086,9 @@ impl Tenant {
Ok(timeline)
}
/// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object
/// to ensure proper cleanup of background tasks and metrics.
//
// Allow too_many_arguments because a constructor's argument list naturally grows with the
// number of attributes in the struct: breaking these out into a builder wouldn't be helpful.
#[allow(clippy::too_many_arguments)]
@@ -4192,8 +4197,10 @@ impl Tenant {
gate: Gate::default(),
pagestream_throttle: Arc::new(throttle::Throttle::new(
Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf),
crate::metrics::tenant_throttling::Metrics::new(&tenant_shard_id),
)),
pagestream_throttle_metrics: Arc::new(
crate::metrics::tenant_throttling::Pagestream::new(&tenant_shard_id),
),
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
ongoing_timeline_detach: std::sync::Mutex::default(),
gc_block: Default::default(),
@@ -5100,6 +5107,7 @@ impl Tenant {
TimelineResources {
remote_client: self.build_timeline_remote_client(timeline_id),
pagestream_throttle: self.pagestream_throttle.clone(),
pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
l0_flush_global_state: self.l0_flush_global_state.clone(),
}
}

View File

@@ -3,7 +3,7 @@ use std::{
atomic::{AtomicU64, Ordering},
Arc,
},
time::{Duration, Instant},
time::Instant,
};
use arc_swap::ArcSwap;
@@ -16,9 +16,8 @@ use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter};
/// To share a throttle among multiple entities, wrap it in an [`Arc`].
///
/// The intial use case for this is tenant-wide throttling of getpage@lsn requests.
pub struct Throttle<M: Metric> {
pub struct Throttle {
inner: ArcSwap<Inner>,
metric: M,
/// will be turned into [`Stats::count_accounted_start`]
count_accounted_start: AtomicU64,
/// will be turned into [`Stats::count_accounted_finish`]
@@ -36,15 +35,6 @@ pub struct Inner {
pub type Config = pageserver_api::models::ThrottleConfig;
pub struct Observation {
pub wait_time: Duration,
}
pub trait Metric {
fn accounting_start(&self);
fn accounting_finish(&self);
fn observe_throttling(&self, observation: &Observation);
}
/// See [`Throttle::reset_stats`].
pub struct Stats {
/// Number of requests that started [`Throttle::throttle`] calls.
@@ -59,18 +49,14 @@ pub struct Stats {
}
pub enum ThrottleResult {
NotThrottled { start: Instant },
Throttled { start: Instant, end: Instant },
NotThrottled { end: Instant },
Throttled { end: Instant },
}
impl<M> Throttle<M>
where
M: Metric,
{
pub fn new(config: Config, metric: M) -> Self {
impl Throttle {
pub fn new(config: Config) -> Self {
Self {
inner: ArcSwap::new(Arc::new(Self::new_inner(config))),
metric,
count_accounted_start: AtomicU64::new(0),
count_accounted_finish: AtomicU64::new(0),
count_throttled: AtomicU64::new(0),
@@ -127,32 +113,27 @@ where
self.inner.load().rate_limiter.steady_rps()
}
pub async fn throttle(&self, key_count: usize) -> ThrottleResult {
/// `start` must be [`Instant::now`] or earlier.
pub async fn throttle(&self, key_count: usize, start: Instant) -> ThrottleResult {
let inner = self.inner.load_full(); // clones the `Inner` Arc
let start = std::time::Instant::now();
if !inner.enabled {
return ThrottleResult::NotThrottled { start };
return ThrottleResult::NotThrottled { end: start };
}
self.metric.accounting_start();
self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
let did_throttle = inner.rate_limiter.acquire(key_count).await;
self.count_accounted_finish.fetch_add(1, Ordering::Relaxed);
self.metric.accounting_finish();
if did_throttle {
self.count_throttled.fetch_add(1, Ordering::Relaxed);
let now = Instant::now();
let wait_time = now - start;
let end = Instant::now();
let wait_time = end - start;
self.sum_throttled_usecs
.fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
let observation = Observation { wait_time };
self.metric.observe_throttling(&observation);
ThrottleResult::Throttled { start, end: now }
ThrottleResult::Throttled { end }
} else {
ThrottleResult::NotThrottled { start }
ThrottleResult::NotThrottled { end: start }
}
}
}

View File

@@ -208,8 +208,8 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
/// The outward-facing resources required to build a Timeline
pub struct TimelineResources {
pub remote_client: RemoteTimelineClient,
pub pagestream_throttle:
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
pub pagestream_throttle: Arc<crate::tenant::throttle::Throttle>,
pub pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
}
@@ -412,8 +412,7 @@ pub struct Timeline {
gc_lock: tokio::sync::Mutex<()>,
/// Cloned from [`super::Tenant::pagestream_throttle`] on construction.
pub(crate) pagestream_throttle:
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
pub(crate) pagestream_throttle: Arc<crate::tenant::throttle::Throttle>,
/// Size estimator for aux file v2
pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,
@@ -2310,6 +2309,7 @@ impl Timeline {
query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new(
&tenant_shard_id,
&timeline_id,
resources.pagestream_throttle_metrics,
),
directory_metrics: array::from_fn(|_| AtomicU64::new(0)),

View File

@@ -299,6 +299,7 @@ impl DeleteTimelineFlow {
TimelineResources {
remote_client,
pagestream_throttle: tenant.pagestream_throttle.clone(),
pagestream_throttle_metrics: tenant.pagestream_throttle_metrics.clone(),
l0_flush_global_state: tenant.l0_flush_global_state.clone(),
},
// Important. We dont pass ancestor above because it can be missing.