mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-15 09:22:55 +00:00
part of https://github.com/neondatabase/neon/issues/5899 Problem ------- Before this PR, the time spent waiting on the throttle was charged towards the higher-level page_service metrics, i.e., `pageserver_smgr_query_seconds`. The metrics are the foundation of internal SLIs / SLOs. A throttled tenant would cause the SLI to degrade / SLO alerts to fire. Changes ------- - don't charge time spent in throttle towards the page_service metrics - record time spent in throttle in RequestContext and subtract it from the elapsed time - this works because the page_service path doesn't create child context, so, all the throttle time is recorded in the parent - it's quite brittle and will break if we ever decide to spawn child tasks that need child RequestContexts, which would have separate instances of the `micros_spent_throttled` counter. - however, let's punt that to a more general refactoring of RequestContext - add a test case that ensures that - throttling happens for getpage requests; this aspect of the test passed before this PR - throttling delays aren't charged towards the page_service metrics; this aspect of the test only passes with this PR - drive-by: make the throttle log message `info!`, it's an expected condition Performance ----------- I took the same measurements as in #6706 , no meaningful change in CPU overhead. Future Work ----------- This PR enables us to experiment with the throttle for select tenants without affecting the SLI metrics / triggering SLO alerts. Before declaring this feature done, we need more work to happen, specifically: - decide on whether we want to retain the flexibility of throttling any `Timeline::get` call, filtered by TaskKind - versus: separate throttles for each page_service endpoint, potentially with separate config options - the trouble here is that this decision implies changes to the TenantConfig, so, if we start using the current config style now, then decide to switch to a different config, it'll be a breaking change Nice-to-haves but probably not worth the time right now: - Equivalent tests to ensure the throttle applies to all other page_service handlers.
102 lines
2.8 KiB
Rust
102 lines
2.8 KiB
Rust
use std::{
|
|
sync::atomic::{AtomicU32, Ordering},
|
|
time::Duration,
|
|
};
|
|
|
|
#[derive(Debug)]
|
|
pub struct CounterU32 {
|
|
inner: AtomicU32,
|
|
}
|
|
impl Default for CounterU32 {
|
|
fn default() -> Self {
|
|
Self {
|
|
inner: AtomicU32::new(u32::MAX),
|
|
}
|
|
}
|
|
}
|
|
impl CounterU32 {
|
|
pub fn open(&self) -> Result<(), &'static str> {
|
|
match self
|
|
.inner
|
|
.compare_exchange(u32::MAX, 0, Ordering::Relaxed, Ordering::Relaxed)
|
|
{
|
|
Ok(_) => Ok(()),
|
|
Err(_) => Err("open() called on clsoed state"),
|
|
}
|
|
}
|
|
pub fn close(&self) -> Result<u32, &'static str> {
|
|
match self.inner.swap(u32::MAX, Ordering::Relaxed) {
|
|
u32::MAX => Err("close() called on closed state"),
|
|
x => Ok(x),
|
|
}
|
|
}
|
|
|
|
pub fn add(&self, count: u32) -> Result<(), &'static str> {
|
|
if count == 0 {
|
|
return Ok(());
|
|
}
|
|
let mut had_err = None;
|
|
self.inner
|
|
.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| match cur {
|
|
u32::MAX => {
|
|
had_err = Some("add() called on closed state");
|
|
None
|
|
}
|
|
x => {
|
|
let (new, overflowed) = x.overflowing_add(count);
|
|
if new == u32::MAX || overflowed {
|
|
had_err = Some("add() overflowed the counter");
|
|
None
|
|
} else {
|
|
Some(new)
|
|
}
|
|
}
|
|
})
|
|
.map_err(|_| had_err.expect("we set it whenever the function returns None"))
|
|
.map(|_| ())
|
|
}
|
|
}
|
|
|
|
#[derive(Default, Debug)]
|
|
pub struct MicroSecondsCounterU32 {
|
|
inner: CounterU32,
|
|
}
|
|
|
|
impl MicroSecondsCounterU32 {
|
|
pub fn open(&self) -> Result<(), &'static str> {
|
|
self.inner.open()
|
|
}
|
|
pub fn add(&self, duration: Duration) -> Result<(), &'static str> {
|
|
match duration.as_micros().try_into() {
|
|
Ok(x) => self.inner.add(x),
|
|
Err(_) => Err("add(): duration conversion error"),
|
|
}
|
|
}
|
|
pub fn close_and_checked_sub_from(&self, from: Duration) -> Result<Duration, &'static str> {
|
|
let val = self.inner.close()?;
|
|
let val = Duration::from_micros(val as u64);
|
|
let subbed = match from.checked_sub(val) {
|
|
Some(v) => v,
|
|
None => return Err("Duration::checked_sub"),
|
|
};
|
|
Ok(subbed)
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_basic() {
|
|
let counter = MicroSecondsCounterU32::default();
|
|
counter.open().unwrap();
|
|
counter.add(Duration::from_micros(23)).unwrap();
|
|
let res = counter
|
|
.close_and_checked_sub_from(Duration::from_micros(42))
|
|
.unwrap();
|
|
assert_eq!(res, Duration::from_micros(42 - 23));
|
|
}
|
|
}
|