diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index b537795704..e8d0bb1dc7 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -8,6 +8,14 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 80 + # TODO: learn typical resident-size growth rate [GiB/minute] and configure + # min_avail_bytes such that we have X minutes of headroom. + min_avail_bytes: 0 + # We assume that the worst-case growth rate is small enough that we can + # catch above-threshold conditions by checking every 10s. + period: "10s" tenant_config: eviction_policy: kind: "LayerAccessThreshold" diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index cd8f832af0..4ef51651fc 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -8,6 +8,14 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 80 + # TODO: learn typical resident-size growth rate [GiB/minute] and configure + # min_avail_bytes such that we have X minutes of headroom. + min_avail_bytes: 0 + # We assume that the worst-case growth rate is small enough that we can + # catch above-threshold conditions by checking every 10s. + period: "10s" tenant_config: eviction_policy: kind: "LayerAccessThreshold" diff --git a/libs/utils/src/serde_percent.rs b/libs/utils/src/serde_percent.rs index 386f34460e..63b62b5f1e 100644 --- a/libs/utils/src/serde_percent.rs +++ b/libs/utils/src/serde_percent.rs @@ -1,4 +1,4 @@ -//! A serde::Desierialize type for percentages. +//! A serde::Deserialize type for percentages. //! //! See [`Percent`] for details. diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index f2f318adda..735b7124f1 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -8,7 +8,7 @@ use anyhow::{anyhow, Context}; use clap::{Arg, ArgAction, Command}; use fail::FailScenario; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; -use pageserver::disk_usage_eviction_task::launch_disk_usage_global_eviction_task; +use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use remote_storage::GenericRemoteStorage; use tracing::*; @@ -320,8 +320,18 @@ fn start_pageserver( // Scan the local 'tenants/' directory and start loading the tenants BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?; + // shared state between the disk-usage backed eviction background task and the http endpoint + // that allows triggering disk-usage based eviction manually. note that the http endpoint + // is still accessible even if background task is not configured as long as remote storage has + // been configured. + let disk_usage_eviction_state: Arc = Arc::default(); + if let Some(remote_storage) = &remote_storage { - launch_disk_usage_global_eviction_task(conf, remote_storage.clone())?; + launch_disk_usage_global_eviction_task( + conf, + remote_storage.clone(), + disk_usage_eviction_state.clone(), + )?; } // Start up the service to handle HTTP mgmt API request. We created the @@ -329,9 +339,15 @@ fn start_pageserver( { let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); - let router = http::make_router(conf, launch_ts, http_auth, remote_storage)? - .build() - .map_err(|err| anyhow!(err))?; + let router = http::make_router( + conf, + launch_ts, + http_auth, + remote_storage, + disk_usage_eviction_state, + )? + .build() + .map_err(|err| anyhow!(err))?; let service = utils::http::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)? .serve(service) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 97f248845d..fbad2e2426 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -1,5 +1,7 @@ //! This module implements the pageserver-global disk-usage-based layer eviction task. //! +//! # Mechanics +//! //! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background //! loop that evicts layers in response to a shortage of available bytes //! in the $repo/tenants directory's filesystem. @@ -13,6 +15,8 @@ //! We're good if that second statvfs shows that we're _actually_ below the configured thresholds. //! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further. //! +//! # Eviction Policy +//! //! There are two thresholds: //! `max_usage_pct` is the relative available space, expressed in percent of the total filesystem space. //! If the actual usage is higher, the threshold is exceeded. @@ -69,9 +73,16 @@ pub struct DiskUsageEvictionTaskConfig { pub period: Duration, } +#[derive(Default)] +pub struct State { + /// Exclude http requests and background task from running at the same time. + mutex: tokio::sync::Mutex<()>, +} + pub fn launch_disk_usage_global_eviction_task( conf: &'static PageServerConf, storage: GenericRemoteStorage, + state: Arc, ) -> anyhow::Result<()> { let Some(task_config) = &conf.disk_usage_based_eviction else { info!("disk usage based eviction task not configured"); @@ -99,6 +110,7 @@ pub fn launch_disk_usage_global_eviction_task( false, async move { disk_usage_eviction_task( + &state, task_config, storage, tenants_dir_fd, @@ -115,6 +127,7 @@ pub fn launch_disk_usage_global_eviction_task( #[instrument(skip_all)] async fn disk_usage_eviction_task( + state: &State, task_config: &DiskUsageEvictionTaskConfig, storage: GenericRemoteStorage, tenants_dir_fd: Dir, @@ -146,6 +159,7 @@ async fn disk_usage_eviction_task( async { let res = disk_usage_eviction_task_iteration( + state, task_config, &storage, &mut tenants_dir_fd, @@ -181,6 +195,7 @@ pub trait Usage: Clone + Copy + std::fmt::Debug { } async fn disk_usage_eviction_task_iteration( + state: &State, task_config: &DiskUsageEvictionTaskConfig, storage: &GenericRemoteStorage, tenants_dir_fd: &mut SyncWrapper, @@ -188,7 +203,7 @@ async fn disk_usage_eviction_task_iteration( ) -> anyhow::Result<()> { let usage_pre = filesystem_level_usage::get(tenants_dir_fd, task_config) .context("get filesystem-level disk usage before evictions")?; - let res = disk_usage_eviction_task_iteration_impl(storage, usage_pre, cancel).await; + let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await; match res { Ok(outcome) => { debug!(?outcome, "disk_usage_eviction_iteration finished"); @@ -272,14 +287,13 @@ struct LayerCount { #[allow(clippy::needless_late_init)] pub async fn disk_usage_eviction_task_iteration_impl( + state: &State, storage: &GenericRemoteStorage, usage_pre: U, cancel: &CancellationToken, ) -> anyhow::Result> { - static MUTEX: once_cell::sync::Lazy> = - once_cell::sync::Lazy::new(|| tokio::sync::Mutex::new(())); - - let _g = MUTEX + let _g = state + .mutex .try_lock() .map_err(|_| anyhow::anyhow!("iteration is already executing"))?; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 49be55f9e9..6699f6e5d4 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -18,6 +18,7 @@ use super::models::{ TimelineCreateRequest, TimelineGcRequest, TimelineInfo, }; use crate::context::{DownloadBehavior, RequestContext}; +use crate::disk_usage_eviction_task; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; @@ -48,6 +49,7 @@ struct State { auth: Option>, allowlist_routes: Vec, remote_storage: Option, + disk_usage_eviction_state: Arc, } impl State { @@ -55,6 +57,7 @@ impl State { conf: &'static PageServerConf, auth: Option>, remote_storage: Option, + disk_usage_eviction_state: Arc, ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] .iter() @@ -65,6 +68,7 @@ impl State { auth, allowlist_routes, remote_storage, + disk_usage_eviction_state, }) } } @@ -1124,6 +1128,8 @@ async fn disk_usage_eviction_run(mut r: Request) -> Result, ))) }; + let state = state.disk_usage_eviction_state.clone(); + let cancel = CancellationToken::new(); let child_cancel = cancel.clone(); let _g = cancel.drop_guard(); @@ -1137,6 +1143,7 @@ async fn disk_usage_eviction_run(mut r: Request) -> Result, false, async move { let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( + &state, &storage, usage, &child_cancel, @@ -1168,6 +1175,7 @@ pub fn make_router( launch_ts: &'static LaunchTimestamp, auth: Option>, remote_storage: Option, + disk_usage_eviction_state: Arc, ) -> anyhow::Result> { let spec = include_bytes!("openapi_spec.yml"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); @@ -1212,7 +1220,8 @@ pub fn make_router( Ok(router .data(Arc::new( - State::new(conf, auth, remote_storage).context("Failed to initialize router state")?, + State::new(conf, auth, remote_storage, disk_usage_eviction_state) + .context("Failed to initialize router state")?, )) .get("/v1/status", |r| RequestSpan(status_handler).handle(r)) .put(