// Mirror of https://github.com/neondatabase/neon.git (synced 2026-05-15 12:10:37 +00:00).
//! This module implements the pageserver-global disk-usage-based layer eviction task.
//!
//! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background
//! loop that evicts layers in response to a shortage of available bytes
//! in the $repo/tenants directory's filesystem.
//!
//! The loop runs periodically at a configurable `period`.
//!
//! Each loop iteration uses `statvfs` to determine filesystem-level space usage.
//! It compares the returned usage data against two different types of thresholds.
//! The iteration tries to evict layers until app-internal accounting says we should be below the thresholds.
//! We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration.
//! We're good if that second statvfs shows that we're _actually_ below the configured thresholds.
//! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further.
//!
//! There are two thresholds:
//! `max_usage_pct` is the relative available space, expressed in percent of the total filesystem space.
//! If the actual usage is higher, the threshold is exceeded.
//! `min_avail_bytes` is the absolute available space in bytes.
//! If the actual usage is lower, the threshold is exceeded.
//!
//! The iteration evicts layers in LRU fashion.
//! It tries first with a reservation of up to `tenant_min_resident_size` bytes of the most recent layers per tenant.
//! The layers not part of the per-tenant reservation are evicted least-recently-used first until we're below all thresholds.
//! If the per-tenant-reservation strategy doesn't work out, it falls back to global LRU.
|
use std::{
|
|
collections::HashMap,
|
|
ops::ControlFlow,
|
|
sync::{Arc, Mutex},
|
|
time::Duration,
|
|
};
|
|
|
|
use anyhow::Context;
|
|
use nix::dir::Dir;
|
|
use remote_storage::GenericRemoteStorage;
|
|
use serde::{Deserialize, Serialize};
|
|
use sync_wrapper::SyncWrapper;
|
|
use tokio::time::Instant;
|
|
use tokio_util::sync::CancellationToken;
|
|
use tracing::{debug, error, info, instrument, warn, Instrument};
|
|
use utils::{approx_accurate::ApproxAccurate, id::TenantId, serde_percent::Percent};
|
|
|
|
use crate::{
|
|
config::PageServerConf,
|
|
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
|
tenant::{self, LocalLayerInfoForDiskUsageEviction, Timeline},
|
|
};
|
|
|
|
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DiskUsageEvictionTaskConfig {
    /// Maximum tolerated filesystem usage, in percent of total filesystem size.
    /// If actual usage is higher, the disk is considered under pressure.
    pub max_usage_pct: Percent,
    /// Minimum number of bytes that must stay available on the filesystem.
    /// If actual available bytes are lower, the disk is considered under pressure.
    pub min_avail_bytes: u64,
    /// How often the eviction loop runs; deserialized from humantime strings (e.g. "10s").
    #[serde(with = "humantime_serde")]
    pub period: Duration,
}
|
|
|
|
/// Launch the pageserver-global disk-usage-based eviction background task.
///
/// If `disk_usage_based_eviction` is not present in the pageserver config, logs
/// a note and returns `Ok(())` without spawning anything.
///
/// # Errors
///
/// Fails if the tenants directory cannot be opened. The directory fd is opened
/// once here and moved into the task, so later `fstatvfs` calls keep working
/// without re-resolving the path each iteration.
pub fn launch_disk_usage_global_eviction_task(
    conf: &'static PageServerConf,
    storage: GenericRemoteStorage,
) -> anyhow::Result<()> {
    let Some(task_config) = &conf.disk_usage_based_eviction else {
        info!("disk usage based eviction task not configured");
        return Ok(());
    };

    // Open the tenants directory up front so that a failure surfaces here,
    // at launch time, rather than inside the background loop.
    let tenants_dir_fd = {
        let tenants_path = conf.tenants_path();
        nix::dir::Dir::open(
            &tenants_path,
            nix::fcntl::OFlag::O_DIRECTORY,
            nix::sys::stat::Mode::empty(),
        )
        .with_context(|| format!("open tenants_path {tenants_path:?}"))?
    };

    info!("launching disk usage based eviction task");

    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::DiskUsageEviction,
        // NOTE(review): the two `None`s are presumably tenant/timeline association;
        // this task is pageserver-global, so neither applies — confirm against
        // `task_mgr::spawn`'s signature.
        None,
        None,
        "disk usage based eviction",
        false,
        async move {
            disk_usage_eviction_task(
                task_config,
                storage,
                tenants_dir_fd,
                task_mgr::shutdown_token(),
            )
            .await;
            info!("disk usage based eviction task finishing");
            Ok(())
        },
    );

    Ok(())
}
|
|
|
|
/// Body of the background task: after a randomized initial delay, run
/// `disk_usage_eviction_task_iteration` once per `period` until cancelled.
#[instrument(skip_all)]
async fn disk_usage_eviction_task(
    task_config: &DiskUsageEvictionTaskConfig,
    storage: GenericRemoteStorage,
    tenants_dir_fd: Dir,
    cancel: CancellationToken,
) {
    // nix::dir::Dir is Send but not Sync.
    // One would think that that is sufficient, but rustc complains that the &tenants_dir_fd
    // that we pass to disk_usage_eviction_iteration below will outlive the .await;
    // The reason is that the &tenants_dir_fd is not sync because of stdlib-enforced axiom
    //   T: Sync <=> &T: Send
    // The solution is to use SyncWrapper, which, by owning the tenants_dir_fd, can impl Sync.
    let mut tenants_dir_fd = SyncWrapper::new(tenants_dir_fd);

    // Randomize the start time so iterations don't run in lockstep after a mass restart.
    use crate::tenant::tasks::random_init_delay;
    {
        if random_init_delay(task_config.period, &cancel)
            .await
            .is_err()
        {
            info!("shutting down");
            return;
        }
    }

    let mut iteration_no = 0;
    loop {
        iteration_no += 1;
        let start = Instant::now();

        async {
            let res = disk_usage_eviction_task_iteration(
                task_config,
                &storage,
                &mut tenants_dir_fd,
                &cancel,
            )
            .await;

            match res {
                Ok(()) => {}
                Err(e) => {
                    // these stat failures are expected to be very rare
                    warn!("iteration failed, unexpected error: {e:#}");
                }
            }
        }
        .instrument(tracing::info_span!("iteration", iteration_no))
        .await;

        // Sleep until one `period` after the iteration *started*, so the time spent
        // inside the iteration counts against the period rather than adding to it.
        let sleep_until = start + task_config.period;
        tokio::select! {
            _ = tokio::time::sleep_until(sleep_until) => {},
            _ = cancel.cancelled() => {
                info!("shutting down");
                break
            }
        }
    }
}
|
|
|
|
/// Abstraction over a disk-usage snapshot that the eviction logic plans against.
///
/// The production implementation is `filesystem_level_usage::Usage` (statvfs-backed,
/// see the module at the bottom of this file); keeping the iteration logic generic
/// over `U: Usage` lets it run against synthetic usage values as well.
pub trait Usage: Clone + Copy + std::fmt::Debug {
    /// Returns true while at least one configured threshold is still exceeded.
    fn has_pressure(&self) -> bool;
    /// Account for `bytes` becoming available, e.g. after (planned) eviction of a layer file.
    fn add_available_bytes(&mut self, bytes: u64);
}
|
|
|
|
/// One iteration of the eviction loop:
/// statvfs before, run the eviction logic, statvfs after to cross-check the
/// internal accounting against the real filesystem state.
///
/// # Errors
///
/// Returns `Err` only for statvfs failures; errors from the eviction logic
/// itself are logged and swallowed so the periodic loop keeps running.
async fn disk_usage_eviction_task_iteration(
    task_config: &DiskUsageEvictionTaskConfig,
    storage: &GenericRemoteStorage,
    tenants_dir_fd: &mut SyncWrapper<Dir>,
    cancel: &CancellationToken,
) -> anyhow::Result<()> {
    let usage_pre = filesystem_level_usage::get(tenants_dir_fd, task_config)
        .context("get filesystem-level disk usage before evictions")?;
    let res = disk_usage_eviction_task_iteration_impl(storage, usage_pre, cancel).await;
    match res {
        Ok(outcome) => {
            debug!(?outcome, "disk_usage_eviction_iteration finished");
            match outcome {
                IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
                    // nothing to do, select statement below will handle things
                }
                IterationOutcome::Finished(outcome) => {
                    // Verify with statvfs whether we made any real progress
                    let after = filesystem_level_usage::get(tenants_dir_fd, task_config)
                        // It's quite unlikely to hit the error here. Keep the code simple and bail out.
                        .context("get filesystem-level disk usage after evictions")?;

                    debug!(?after, "disk usage");

                    if after.has_pressure() {
                        // Don't bother doing an out-of-order iteration here now.
                        // In practice, the task period is set to a value in the tens-of-seconds range,
                        // which will cause another iteration to happen soon enough.
                        // TODO: deltas between the three different usages would be helpful,
                        // consider MiB, GiB, TiB
                        warn!(?outcome, ?after, "disk usage still high");
                    } else {
                        info!(?outcome, ?after, "disk usage pressure relieved");
                    }
                }
            }
        }
        Err(e) => {
            error!("disk_usage_eviction_iteration failed: {:#}", e);
        }
    }

    Ok(())
}
|
|
|
|
/// Result of one call to `disk_usage_eviction_task_iteration_impl`.
#[derive(Debug, Serialize)]
#[allow(clippy::large_enum_variant)]
pub enum IterationOutcome<U> {
    /// The pre-iteration usage snapshot showed no threshold exceeded; nothing was evicted.
    NoPressure,
    /// The cancellation token fired at some point during the iteration.
    Cancelled,
    /// Eviction ran to completion; details in the payload.
    Finished(IterationOutcomeFinished<U>),
}
|
|
|
|
// The `#[allow(dead_code)]` is to suppress warnings about only the Debug impl reading these fields.
// We use the Debug impl for logging, so, it's all right.
#[allow(dead_code)]
#[derive(Debug, Serialize)]
pub struct IterationOutcomeFinished<U> {
    /// The actual usage observed before we started the iteration.
    before: U,
    /// The expected value for `after`, according to internal accounting, after phase 1.
    planned: PlannedUsage<U>,
    /// The outcome of phase 2, where we actually do the evictions.
    ///
    /// If all layers that phase 1 planned to evict _can_ actually get evicted, this will
    /// be the same as `planned`.
    assumed: AssumedUsage<U>,
}
|
|
|
|
// The `#[allow(dead_code)]` is to suppress warnings about only the Debug impl reading these fields.
// We use the Debug impl for logging, so, it's all right.
#[derive(Debug, Serialize)]
#[allow(dead_code)]
struct AssumedUsage<U> {
    /// The expected value for `after`, after phase 2.
    projected_after: U,
    /// The layers we failed to evict during phase 2.
    failed: LayerCount,
}
|
|
|
|
// The `#[allow(dead_code)]` is to suppress warnings about only the Debug impl reading these fields.
// We use the Debug impl for logging, so, it's all right.
#[allow(dead_code)]
#[derive(Debug, Serialize)]
struct PlannedUsage<U> {
    /// Projected usage after evicting per-tenant LRU candidates while keeping each
    /// tenant's `min_resident_size` reservation resident.
    respecting_tenant_min_resident_size: U,
    /// Projected usage for the global-LRU fallback; `None` if the fallback was not needed.
    fallback_to_global_lru: Option<U>,
}
|
|
|
|
/// Aggregate count and total on-disk size of a set of layer files
/// (used to report layers that failed to evict).
#[allow(dead_code)]
#[derive(Debug, Default, Serialize)]
struct LayerCount {
    // Sum of the layers' file sizes, in bytes.
    file_sizes: u64,
    // Number of layers.
    count: usize,
}
|
|
|
|
/// Core eviction logic: given a usage snapshot `usage_pre`, plan which layers
/// to evict (phase 1) and then evict them batched per timeline (phase 2).
///
/// Phase 1 first collects per-tenant LRU candidates while respecting each
/// tenant's `min_resident_size` reservation; if that plan cannot relieve the
/// pressure, it re-plans using global LRU over all layers of all tenants.
///
/// # Errors
///
/// Fails if another invocation is already running, or if the tenant list
/// cannot be obtained.
#[allow(clippy::needless_late_init)]
pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    storage: &GenericRemoteStorage,
    usage_pre: U,
    cancel: &CancellationToken,
) -> anyhow::Result<IterationOutcome<U>> {
    // Only one iteration may execute at a time; a concurrent caller gets an
    // error immediately (try_lock) instead of queueing behind the running one.
    static MUTEX: once_cell::sync::Lazy<tokio::sync::Mutex<()>> =
        once_cell::sync::Lazy::new(|| tokio::sync::Mutex::new(()));

    let _g = MUTEX
        .try_lock()
        .map_err(|_| anyhow::anyhow!("iteration is already executing"))?;

    // planned post-eviction usage
    let mut usage_planned_min_resident_size_respecting = usage_pre;
    let mut usage_planned_global_lru = None;
    // achieved post-eviction usage according to internal accounting
    let mut usage_assumed = usage_pre;
    // actual usage read after batched evictions

    debug!(?usage_pre, "disk usage");

    if !usage_pre.has_pressure() {
        return Ok(IterationOutcome::NoPressure);
    }

    warn!(
        ?usage_pre,
        "running disk usage based eviction due to pressure"
    );

    let mut lru_candidates: Vec<(_, LocalLayerInfoForDiskUsageEviction)> = Vec::new();

    // get a snapshot of the list of tenants
    let tenants = tenant::mgr::list_tenants()
        .await
        .context("get list of tenants")?;

    // Gather candidates per tenant, leaving each tenant's min_resident_size
    // worth of most-recently-used layers out of the candidate set.
    {
        let mut tmp = Vec::new();
        for (tenant_id, _state) in &tenants {
            let flow = extend_lru_candidates(
                Mode::RespectTenantMinResidentSize,
                *tenant_id,
                &mut lru_candidates,
                &mut tmp,
                cancel,
            )
            .await;

            if let ControlFlow::Break(()) = flow {
                return Ok(IterationOutcome::Cancelled);
            }

            assert!(tmp.is_empty(), "tmp has to be fully drained each iteration");
        }
    }

    if cancel.is_cancelled() {
        return Ok(IterationOutcome::Cancelled);
    }

    // phase1: select victims to relieve pressure
    //
    // Least recently used first; take candidates until internal accounting
    // says the thresholds would no longer be exceeded.
    lru_candidates.sort_unstable_by_key(|(_, layer)| layer.last_activity_ts);
    let mut batched: HashMap<_, Vec<LocalLayerInfoForDiskUsageEviction>> = HashMap::new();
    for (i, (timeline, layer)) in lru_candidates.into_iter().enumerate() {
        if !usage_planned_min_resident_size_respecting.has_pressure() {
            debug!(
                no_candidates_evicted = i,
                "took enough candidates for pressure to be relieved"
            );
            break;
        }

        usage_planned_min_resident_size_respecting.add_available_bytes(layer.file_size());

        batched
            .entry(TimelineKey(timeline.clone()))
            .or_default()
            .push(layer);
    }
    // If we can't relieve pressure while respecting tenant_min_resident_size, fall back to global LRU.
    if usage_planned_min_resident_size_respecting.has_pressure() {
        // NB: tests depend on parts of this log message
        warn!(?usage_pre, ?usage_planned_min_resident_size_respecting, "tenant_min_resident_size-respecting LRU would not relieve pressure, falling back to global LRU");
        batched.clear();
        let mut usage_planned = usage_pre;
        let mut global_lru_candidates = Vec::new();
        let mut tmp = Vec::new();
        for (tenant_id, _state) in &tenants {
            let flow = extend_lru_candidates(
                Mode::GlobalLru,
                *tenant_id,
                &mut global_lru_candidates,
                &mut tmp,
                cancel,
            )
            .await;

            if let ControlFlow::Break(()) = flow {
                return Ok(IterationOutcome::Cancelled);
            }

            assert!(tmp.is_empty(), "tmp has to be fully drained each iteration");
        }
        global_lru_candidates.sort_unstable_by_key(|(_, layer)| layer.last_activity_ts);
        // NOTE(review): unlike the phase-1 loop above, this loop does not stop once
        // `usage_planned` reports no more pressure — it plans eviction of *all*
        // candidates. Confirm whether that is intended or an early `break` is missing.
        for (timeline, layer) in global_lru_candidates {
            usage_planned.add_available_bytes(layer.file_size());
            batched
                .entry(TimelineKey(timeline.clone()))
                .or_default()
                .push(layer);
            if cancel.is_cancelled() {
                return Ok(IterationOutcome::Cancelled);
            }
        }
        usage_planned_global_lru = Some(usage_planned);
    }
    let usage_planned = PlannedUsage {
        respecting_tenant_min_resident_size: usage_planned_min_resident_size_respecting,
        fallback_to_global_lru: usage_planned_global_lru,
    };

    debug!(?usage_planned, "usage planned");

    // phase2: evict victims batched by timeline
    let mut batch = Vec::new();
    let mut evictions_failed = LayerCount::default();
    for (timeline, layers) in batched {
        let tenant_id = timeline.tenant_id;
        let timeline_id = timeline.timeline_id;

        // Reuse the batch buffer across timelines instead of reallocating.
        batch.clear();
        batch.extend(layers.iter().map(|x| &x.layer).cloned());
        let batch_size = batch.len();

        debug!(%timeline_id, "evicting batch for timeline");

        async {
            let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;

            match results {
                Err(e) => {
                    warn!("failed to evict batch: {:#}", e);
                }
                Ok(results) => {
                    assert_eq!(results.len(), layers.len());
                    for (result, layer) in results.into_iter().zip(layers.iter()) {
                        match result {
                            Some(Ok(true)) => {
                                // Layer was evicted: its bytes count as freed.
                                usage_assumed.add_available_bytes(layer.file_size());
                            }
                            Some(Ok(false)) => {
                                // this is:
                                // - Replacement::{NotFound, Unexpected}
                                // - it cannot be is_remote_layer, filtered already
                                evictions_failed.file_sizes += layer.file_size();
                                evictions_failed.count += 1;
                            }
                            None => {
                                // A `None` result implies the eviction was skipped
                                // due to cancellation; stop processing this batch.
                                assert!(cancel.is_cancelled());
                                return;
                            }
                            Some(Err(e)) => {
                                // we really shouldn't be getting this, precondition failure
                                error!("failed to evict layer: {:#}", e);
                            }
                        }
                    }
                }
            }
        }
        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
        .await;

        if cancel.is_cancelled() {
            return Ok(IterationOutcome::Cancelled);
        }
    }

    Ok(IterationOutcome::Finished(IterationOutcomeFinished {
        before: usage_pre,
        planned: usage_planned,
        assumed: AssumedUsage {
            projected_after: usage_assumed,
            failed: evictions_failed,
        },
    }))
}
|
|
|
|
/// Different modes of gathering tenant's least recently used layers
/// (see `extend_lru_candidates`).
#[derive(Debug)]
enum Mode {
    /// Add all but the most recently used `min_resident_size` worth of layers to the candidates
    /// list.
    ///
    /// `min_resident_size` defaults to maximum layer file size of the tenant. This ensures that
    /// the tenant will always have one layer resident. If we cannot compute `min_resident_size`
    /// accurately because metadata is missing we use hardcoded constant. `min_resident_size` can
    /// be overridden per tenant for important tenants.
    RespectTenantMinResidentSize,
    /// Consider all layer files from all tenants in LRU order, with no per-tenant reservation.
    ///
    /// This is done if the `min_resident_size` respecting does not relieve pressure.
    GlobalLru,
}
|
|
|
|
/// Gather one tenant's eviction candidates (resident layer files of active
/// timelines) into `lru_candidates`, according to `mode`.
///
/// `scratch` is caller-owned scratch space, reused across tenants to avoid
/// repeated allocation; it is left empty when this function returns (the
/// `drain(..)` guarantees that even on early exit).
///
/// Returns `ControlFlow::Break(())` if `cancel` fired; a failed tenant lookup
/// is logged at debug level and yields `Continue` with no candidates added.
#[instrument(skip_all, fields(?mode, %tenant_id))]
async fn extend_lru_candidates(
    mode: Mode,
    tenant_id: TenantId,
    lru_candidates: &mut Vec<(Arc<Timeline>, LocalLayerInfoForDiskUsageEviction)>,
    scratch: &mut Vec<(Arc<Timeline>, LocalLayerInfoForDiskUsageEviction)>,
    cancel: &CancellationToken,
) -> ControlFlow<()> {
    debug!("begin");

    let tenant = match tenant::mgr::get_tenant(tenant_id, true).await {
        Ok(tenant) => tenant,
        Err(e) => {
            // this can happen if tenant has lifecycle transition after we fetched it
            debug!("failed to get tenant: {e:#}");
            return ControlFlow::Continue(());
        }
    };

    if cancel.is_cancelled() {
        return ControlFlow::Break(());
    }

    // Collect resident layers of all active timelines into `scratch`, tracking the
    // largest layer file size seen (used below as the default min_resident_size).
    let mut max_layer_size = ApproxAccurate::default();
    for tl in tenant.list_timelines() {
        if !tl.is_active() {
            continue;
        }
        let info = tl.get_local_layers_for_disk_usage_eviction();
        debug!(timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
        scratch.extend(
            info.resident_layers
                .into_iter()
                .map(|layer_infos| (tl.clone(), layer_infos)),
        );
        max_layer_size = max_layer_size.max(info.max_layer_size.accurate());

        if cancel.is_cancelled() {
            return ControlFlow::Break(());
        }
    }

    let min_resident_size = match mode {
        Mode::GlobalLru => {
            // Global LRU: every layer is a candidate, no per-tenant reservation.
            lru_candidates.append(scratch);
            return ControlFlow::Continue(());
        }
        Mode::RespectTenantMinResidentSize => match tenant.get_min_resident_size_override() {
            Some(size) => size,
            None => {
                match max_layer_size.accurate() {
                    Some(size) => size,
                    None => {
                        // NOTE(review): fallback constant (~332 MB) used when no layer
                        // metadata is available; presumably an observed production max
                        // layer file size — confirm its provenance.
                        let prod_max_layer_file_size = 332_880_000;
                        // rate-limit warning in case above comment is wrong and we're missing `LayerMetadata` for many layers
                        static LAST_WARNED: Mutex<Option<Instant>> = Mutex::new(None);
                        let mut last_warned = LAST_WARNED.lock().unwrap();
                        if last_warned
                            .map(|v| v.elapsed() > Duration::from_secs(60))
                            .unwrap_or(true)
                        {
                            warn!(value=prod_max_layer_file_size, "some layers don't have LayerMetadata to calculate max_layer_file_size, using default value");
                            *last_warned = Some(Instant::now());
                        }
                        prod_max_layer_file_size
                    }
                }
            }
        },
    };

    // Least recently used first, so we peel off the oldest layers below.
    scratch.sort_unstable_by_key(|(_, layer_info)| layer_info.last_activity_ts);

    // Take oldest layers as candidates, but stop once the resident bytes that
    // would remain are at or below the tenant's min_resident_size reservation.
    let mut current: u64 = scratch.iter().map(|(_, layer)| layer.file_size()).sum();
    for (tl, layer) in scratch.drain(..) {
        if cancel.is_cancelled() {
            return ControlFlow::Break(());
        }
        if current <= min_resident_size {
            break;
        }
        current -= layer.file_size();
        debug!(?layer, "adding layer to lru_candidates");
        lru_candidates.push((tl, layer));
    }

    ControlFlow::Continue(())
}
|
|
|
|
/// `HashMap` key for batching evictions per timeline: identity is the `Arc`
/// pointer, i.e. two keys are equal iff they wrap the same `Timeline` allocation
/// (see the `PartialEq`/`Hash` impls below).
struct TimelineKey(Arc<Timeline>);
|
|
|
|
impl PartialEq for TimelineKey {
|
|
fn eq(&self, other: &Self) -> bool {
|
|
Arc::ptr_eq(&self.0, &other.0)
|
|
}
|
|
}
|
|
|
|
// Pointer-identity equality (above) is reflexive, so the full `Eq` contract holds.
impl Eq for TimelineKey {}
|
|
|
|
impl std::hash::Hash for TimelineKey {
    /// Hash by the `Arc` payload address — consistent with the pointer-identity
    /// `PartialEq` above, as `HashMap` requires.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        Arc::as_ptr(&self.0).hash(state);
    }
}
|
|
|
|
impl std::ops::Deref for TimelineKey {
|
|
type Target = Timeline;
|
|
|
|
fn deref(&self) -> &Self::Target {
|
|
self.0.as_ref()
|
|
}
|
|
}
|
|
|
|
/// statvfs-backed implementation of [`super::Usage`].
mod filesystem_level_usage {
    use anyhow::Context;
    use nix::{
        dir::Dir,
        sys::statvfs::{self, Statvfs},
    };
    use sync_wrapper::SyncWrapper;

    use super::DiskUsageEvictionTaskConfig;

    // The `#[allow(dead_code)]` is to suppress warnings about only the Debug impl reading these fields.
    // We use the Debug impl for logging, so, it's all right.
    #[derive(Debug, Clone, Copy)]
    #[allow(dead_code)]
    pub struct Usage<'a> {
        // Thresholds this snapshot is judged against.
        config: &'a DiskUsageEvictionTaskConfig,

        /// Filesystem capacity
        total_bytes: u64,
        /// Free filesystem space
        avail_bytes: u64,
    }

    impl super::Usage for Usage<'_> {
        fn has_pressure(&self) -> bool {
            // Fraction of the filesystem in use, truncated to whole percent.
            // NOTE(review): if total_bytes were 0 this would be NaN -> cast to 0,
            // i.e. "no pct pressure"; presumably statvfs never reports 0 blocks.
            let usage_pct =
                (100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64;

            let pressures = [
                (
                    "min_avail_bytes",
                    self.avail_bytes < self.config.min_avail_bytes,
                ),
                (
                    "max_usage_pct",
                    usage_pct > self.config.max_usage_pct.get() as u64,
                ),
            ];

            // Under pressure if any single threshold is exceeded.
            pressures.into_iter().any(|(_, has_pressure)| has_pressure)
        }

        fn add_available_bytes(&mut self, bytes: u64) {
            self.avail_bytes += bytes;
        }
    }

    /// Take a statvfs snapshot of the filesystem holding the tenants directory
    /// and pair it with the thresholds from `config`.
    ///
    /// # Errors
    ///
    /// Fails if `fstatvfs` on the directory fd fails.
    pub fn get<'a>(
        tenants_dir_fd: &mut SyncWrapper<Dir>,
        config: &'a DiskUsageEvictionTaskConfig,
    ) -> anyhow::Result<Usage<'a>> {
        let stat: Statvfs = statvfs::fstatvfs(tenants_dir_fd.get_mut())
            .context("statvfs failed, presumably directory got unlinked")?;

        // Prefer f_frsize (fragment size) over f_bsize for byte math; see
        // https://unix.stackexchange.com/a/703650
        let blocksize = if stat.fragment_size() > 0 {
            stat.fragment_size()
        } else {
            stat.block_size()
        };

        // use blocks_available (b_avail) since, pageserver runs as unprivileged user
        let avail_bytes = stat.blocks_available() * blocksize;
        let total_bytes = stat.blocks() * blocksize;

        Ok(Usage {
            config,
            total_bytes,
            avail_bytes,
        })
    }
}
|