Compare commits

...

2 Commits

Author SHA1 Message Date
John Spray
ed3e3b6f61 pageserver: enable setting a target disk range 2023-10-25 14:39:12 +01:00
John Spray
098ef0956b pageserver: publish disk eviction status 2023-10-25 14:35:32 +01:00
3 changed files with 122 additions and 28 deletions

View File

@@ -1479,6 +1479,8 @@ threshold = "20m"
Some(DiskUsageEvictionTaskConfig {
max_usage_pct: Percent::new(80).unwrap(),
min_avail_bytes: 0,
target_avail_bytes: None,
target_usage_pct: None,
period: Duration::from_secs(10),
#[cfg(feature = "testing")]
mock_statvfs: None,

View File

@@ -67,16 +67,40 @@ use crate::{
pub struct DiskUsageEvictionTaskConfig {
pub max_usage_pct: Percent,
pub min_avail_bytes: u64,
// Control how far we will go when evicting: when usage exceeds max_usage_pct or min_avail_bytes,
// we will keep evicting layers until we reach the target. The resulting disk usage should look
// like a sawtooth bouncing between the upper max/min line and the lower target line.
#[serde(default)]
pub target_usage_pct: Option<Percent>,
#[serde(default)]
pub target_avail_bytes: Option<u64>,
#[serde(with = "humantime_serde")]
pub period: Duration,
#[cfg(feature = "testing")]
pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
}
#[derive(Default)]
enum Status {
/// We are within disk limits, and not currently doing any eviction
#[default]
Idle,
/// Disk limits have been exceeded: we will evict soon
UnderPressure,
/// We are currently doing an eviction pass.
Evicting,
}
#[derive(Default)]
pub struct State {
/// Exclude http requests and background task from running at the same time.
mutex: tokio::sync::Mutex<()>,
/// Publish the current status of eviction work, for visibility to other subsystems
/// that modify their behavior if disk pressure is high or if eviction is going on.
status: std::sync::RwLock<Status>,
}
pub fn launch_disk_usage_global_eviction_task(
@@ -176,7 +200,9 @@ async fn disk_usage_eviction_task(
}
pub trait Usage: Clone + Copy + std::fmt::Debug {
fn has_pressure(&self) -> bool;
fn pressure(&self) -> f64;
fn over_pressure(&self) -> bool;
fn no_pressure(&self) -> bool;
fn add_available_bytes(&mut self, bytes: u64);
}
@@ -189,13 +215,19 @@ async fn disk_usage_eviction_task_iteration(
) -> anyhow::Result<()> {
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
.context("get filesystem-level disk usage before evictions")?;
if usage_pre.over_pressure() {
*state.status.write().unwrap() = Status::Evicting;
}
let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
match res {
Ok(outcome) => {
debug!(?outcome, "disk_usage_eviction_iteration finished");
match outcome {
let new_status = match outcome {
IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
// nothing to do, select statement below will handle things
Status::Idle
}
IterationOutcome::Finished(outcome) => {
// Verify with statvfs whether we made any real progress
@@ -205,21 +237,30 @@ async fn disk_usage_eviction_task_iteration(
debug!(?after, "disk usage");
if after.has_pressure() {
if after.over_pressure() {
// Don't bother doing an out-of-order iteration here now.
// In practice, the task period is set to a value in the tens-of-seconds range,
// which will cause another iteration to happen soon enough.
// TODO: deltas between the three different usages would be helpful,
// consider MiB, GiB, TiB
warn!(?outcome, ?after, "disk usage still high");
Status::UnderPressure
} else {
info!(?outcome, ?after, "disk usage pressure relieved");
Status::Idle
}
}
}
};
*state.status.write().unwrap() = new_status;
}
Err(e) => {
error!("disk_usage_eviction_iteration failed: {:#}", e);
*state.status.write().unwrap() = if usage_pre.over_pressure() {
Status::UnderPressure
} else {
Status::Idle
};
}
}
@@ -285,8 +326,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
debug!(?usage_pre, "disk usage");
if !usage_pre.has_pressure() {
if !usage_pre.over_pressure() {
return Ok(IterationOutcome::NoPressure);
} else {
*state.status.write().unwrap() = Status::Evicting;
}
warn!(
@@ -334,7 +377,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
let mut warned = None;
let mut usage_planned = usage_pre;
for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
if !usage_planned.has_pressure() {
if usage_planned.no_pressure() {
debug!(
no_candidates_evicted = i,
"took enough candidates for pressure to be relieved"
@@ -644,22 +687,57 @@ mod filesystem_level_usage {
}
impl super::Usage for Usage<'_> {
fn has_pressure(&self) -> bool {
let usage_pct =
(100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64;
/// Does the pressure exceed 1.0, i.e. has the disk usage exceeded upper bounds?
///
/// This is the condition for starting eviction.
fn over_pressure(&self) -> bool {
self.pressure() >= 1.0
}
let pressures = [
(
"min_avail_bytes",
self.avail_bytes < self.config.min_avail_bytes,
),
(
"max_usage_pct",
usage_pct >= self.config.max_usage_pct.get() as u64,
),
];
/// Is the pressure <0, ie.. has disk usage gone below the target bound?
///
/// This is the condition for dropping out of eviction.
fn no_pressure(&self) -> bool {
self.pressure() <= 0.0
}
pressures.into_iter().any(|(_, has_pressure)| has_pressure)
fn pressure(&self) -> f64 {
let max_usage = std::cmp::min(
self.total_bytes - self.config.min_avail_bytes,
(self.total_bytes as f64 * (self.config.max_usage_pct.get() as f64 / 100.0)) as u64,
);
let mut target_usage = max_usage;
if let Some(target_avail_bytes) = self.config.target_avail_bytes {
target_usage = std::cmp::min(target_usage, self.total_bytes - target_avail_bytes);
}
if let Some(target_usage_pct) = self.config.target_usage_pct {
target_usage = std::cmp::min(
target_usage,
(self.total_bytes as f64 * (target_usage_pct.get() as f64 / 100.0)) as u64,
);
};
let usage = self.total_bytes - self.avail_bytes;
eprintln!(
"pressure: {} {}, current {}",
target_usage, max_usage, usage
);
if target_usage == max_usage {
// We are configured with a zero sized range: treat anything at+beyond limit as pressure 1.0, else 0.0
if usage >= max_usage {
1.0
} else {
0.0
}
} else if usage <= target_usage {
// No pressure.
0.0
} else {
// We are above target: pressure is the ratio of how much we exceed target to the size of the gap
let range_size = (max_usage - target_usage) as f64;
(usage - target_usage) as f64 / range_size
}
}
fn add_available_bytes(&mut self, bytes: u64) {
@@ -713,6 +791,8 @@ mod filesystem_level_usage {
config: &DiskUsageEvictionTaskConfig {
max_usage_pct: Percent::new(85).unwrap(),
min_avail_bytes: 0,
target_avail_bytes: None,
target_usage_pct: None,
period: Duration::MAX,
#[cfg(feature = "testing")]
mock_statvfs: None,
@@ -721,24 +801,24 @@ mod filesystem_level_usage {
avail_bytes: 0,
};
assert!(usage.has_pressure(), "expected pressure at 100%");
assert!(usage.over_pressure(), "expected pressure at 100%");
usage.add_available_bytes(14_000);
assert!(usage.has_pressure(), "expected pressure at 86%");
assert!(usage.over_pressure(), "expected pressure at 86%");
usage.add_available_bytes(999);
assert!(usage.has_pressure(), "expected pressure at 85.001%");
assert!(usage.over_pressure(), "expected pressure at 85.001%");
usage.add_available_bytes(1);
assert!(usage.has_pressure(), "expected pressure at precisely 85%");
assert!(usage.over_pressure(), "expected pressure at precisely 85%");
usage.add_available_bytes(1);
assert!(!usage.has_pressure(), "no pressure at 84.999%");
assert!(!usage.over_pressure(), "no pressure at 84.999%");
usage.add_available_bytes(999);
assert!(!usage.has_pressure(), "no pressure at 84%");
assert!(!usage.over_pressure(), "no pressure at 84%");
usage.add_available_bytes(16_000);
assert!(!usage.has_pressure());
assert!(!usage.over_pressure());
}
}

View File

@@ -1452,10 +1452,22 @@ async fn disk_usage_eviction_run(
}
impl crate::disk_usage_eviction_task::Usage for Usage {
fn has_pressure(&self) -> bool {
fn over_pressure(&self) -> bool {
self.config.evict_bytes > self.freed_bytes
}
fn no_pressure(&self) -> bool {
!self.over_pressure()
}
fn pressure(&self) -> f64 {
if self.over_pressure() {
1.0
} else {
0.0
}
}
fn add_available_bytes(&mut self, bytes: u64) {
self.freed_bytes += bytes;
}