From 0a5043fae571d47dfc4fd9e89b9bb5c4d6fd9dd3 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 27 Mar 2023 20:02:41 +0300 Subject: [PATCH 01/16] refactor: less static mutexes --- pageserver/src/bin/pageserver.rs | 27 ++++++++++++++++++---- pageserver/src/disk_usage_eviction_task.rs | 20 ++++++++++++---- pageserver/src/http/routes.rs | 11 ++++++++- 3 files changed, 47 insertions(+), 11 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index f2f318adda..1400ef0e31 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -8,7 +8,9 @@ use anyhow::{anyhow, Context}; use clap::{Arg, ArgAction, Command}; use fail::FailScenario; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; -use pageserver::disk_usage_eviction_task::launch_disk_usage_global_eviction_task; +use pageserver::disk_usage_eviction_task::{ + launch_disk_usage_global_eviction_task, DiskUsageEvictionState, +}; use remote_storage::GenericRemoteStorage; use tracing::*; @@ -320,8 +322,17 @@ fn start_pageserver( // Scan the local 'tenants/' directory and start loading the tenants BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?; + // shared state between the background task and the http endpoint; note that the http endpoint + // is still accessible even if background task is not configured as long as remote storage has + // been configured. + let disk_usage_eviction_state: Arc = Arc::default(); + if let Some(remote_storage) = &remote_storage { - launch_disk_usage_global_eviction_task(conf, remote_storage.clone())?; + launch_disk_usage_global_eviction_task( + conf, + remote_storage.clone(), + disk_usage_eviction_state.clone(), + )?; } // Start up the service to handle HTTP mgmt API request. We created the @@ -329,9 +340,15 @@ fn start_pageserver( { let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); - let router = http::make_router(conf, launch_ts, http_auth, remote_storage)? - .build() - .map_err(|err| anyhow!(err))?; + let router = http::make_router( + conf, + launch_ts, + http_auth, + remote_storage, + disk_usage_eviction_state, + )? + .build() + .map_err(|err| anyhow!(err))?; let service = utils::http::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)? .serve(service) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index a7ab5b8db7..38aeaf275a 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -70,9 +70,16 @@ pub struct DiskUsageEvictionTaskConfig { pub period: Duration, } +#[derive(Default)] +pub struct DiskUsageEvictionState { + /// Exclude http requests and background task from running at the same time. + mutex: tokio::sync::Mutex<()>, +} + pub fn launch_disk_usage_global_eviction_task( conf: &'static PageServerConf, storage: GenericRemoteStorage, + state: Arc, ) -> anyhow::Result<()> { let Some(task_config) = &conf.disk_usage_based_eviction else { info!("disk usage based eviction task not configured"); @@ -100,6 +107,7 @@ pub fn launch_disk_usage_global_eviction_task( false, async move { disk_usage_eviction_task( + &*state, task_config, storage, tenants_dir_fd, @@ -116,6 +124,7 @@ pub fn launch_disk_usage_global_eviction_task( #[instrument(skip_all)] async fn disk_usage_eviction_task( + state: &DiskUsageEvictionState, task_config: &DiskUsageEvictionTaskConfig, storage: GenericRemoteStorage, tenants_dir_fd: Dir, @@ -147,6 +156,7 @@ async fn disk_usage_eviction_task( async { let res = disk_usage_eviction_task_iteration( + state, task_config, &storage, &mut tenants_dir_fd, @@ -182,6 +192,7 @@ pub trait Usage: Clone + Copy + std::fmt::Debug { } async fn disk_usage_eviction_task_iteration( + state: &DiskUsageEvictionState, task_config: &DiskUsageEvictionTaskConfig, storage: &GenericRemoteStorage, tenants_dir_fd: &mut SyncWrapper, @@ -189,7 +200,7 @@ async fn disk_usage_eviction_task_iteration( ) -> anyhow::Result<()> { let usage_pre = filesystem_level_usage::get(tenants_dir_fd, task_config) .context("get filesystem-level disk usage before evictions")?; - let res = disk_usage_eviction_task_iteration_impl(storage, usage_pre, cancel).await; + let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await; match res { Ok(outcome) => { debug!(?outcome, "disk_usage_eviction_iteration finished"); @@ -273,14 +284,13 @@ struct LayerCount { #[allow(clippy::needless_late_init)] pub async fn disk_usage_eviction_task_iteration_impl( + state: &DiskUsageEvictionState, storage: &GenericRemoteStorage, usage_pre: U, cancel: &CancellationToken, ) -> anyhow::Result> { - static MUTEX: once_cell::sync::Lazy> = - once_cell::sync::Lazy::new(|| tokio::sync::Mutex::new(())); - - let _g = MUTEX + let _g = state + .mutex .try_lock() .map_err(|_| anyhow::anyhow!("iteration is already executing"))?; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 49be55f9e9..d0ed97fefe 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -18,6 +18,7 @@ use super::models::{ TimelineCreateRequest, TimelineGcRequest, TimelineInfo, }; use crate::context::{DownloadBehavior, RequestContext}; +use crate::disk_usage_eviction_task::DiskUsageEvictionState; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; @@ -48,6 +49,7 @@ struct State { auth: Option>, allowlist_routes: Vec, remote_storage: Option, + disk_usage_eviction_state: Arc, } impl State { @@ -55,6 +57,7 @@ impl State { conf: &'static PageServerConf, auth: Option>, remote_storage: Option, + disk_usage_eviction_state: Arc, ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] .iter() @@ -65,6 +68,7 @@ impl State { auth, allowlist_routes, remote_storage, + disk_usage_eviction_state, }) } } @@ -1124,6 +1128,8 @@ async fn disk_usage_eviction_run(mut r: Request) -> Result, ))) }; + let state = state.disk_usage_eviction_state.clone(); + let cancel = CancellationToken::new(); let child_cancel = cancel.clone(); let _g = cancel.drop_guard(); @@ -1137,6 +1143,7 @@ async fn disk_usage_eviction_run(mut r: Request) -> Result, false, async move { let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( + &state, &storage, usage, &child_cancel, @@ -1168,6 +1175,7 @@ pub fn make_router( launch_ts: &'static LaunchTimestamp, auth: Option>, remote_storage: Option, + disk_usage_eviction_state: Arc, ) -> anyhow::Result> { let spec = include_bytes!("openapi_spec.yml"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); @@ -1212,7 +1220,8 @@ pub fn make_router( Ok(router .data(Arc::new( - State::new(conf, auth, remote_storage).context("Failed to initialize router state")?, + State::new(conf, auth, remote_storage, disk_usage_eviction_state) + .context("Failed to initialize router state")?, )) .get("/v1/status", |r| RequestSpan(status_handler).handle(r)) .put( From 244185e6e633bf7a694089c99b2077e24f3bdb0a Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 28 Mar 2023 10:31:54 +0300 Subject: [PATCH 02/16] doc: comment changes Co-authored-by: Christian Schwarz --- pageserver/src/bin/pageserver.rs | 3 ++- pageserver/src/disk_usage_eviction_task.rs | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 1400ef0e31..7dcf6465e2 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -322,7 +322,8 @@ fn start_pageserver( // Scan the local 'tenants/' directory and start loading the tenants BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?; - // shared state between the background task and the http endpoint; note that the http endpoint + // shared state between the disk-usage backed eviction background task and the http endpoint + // that allows triggering disk-usage based eviction manually. note that the http endpoint // is still accessible even if background task is not configured as long as remote storage has // been configured. let disk_usage_eviction_state: Arc = Arc::default(); diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 38aeaf275a..e0083c11bb 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -72,7 +72,7 @@ pub struct DiskUsageEvictionTaskConfig { #[derive(Default)] pub struct DiskUsageEvictionState { - /// Exclude http requests and background task from running at the same time. + /// Prevent mgmt API requests and the background loop from running eviction at the same time. mutex: tokio::sync::Mutex<()>, } From b59975504299f82606da1174632a954177d3cd4f Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 28 Mar 2023 10:30:36 +0300 Subject: [PATCH 03/16] refactor: rename DiskUsageEvictionState => State --- pageserver/src/bin/pageserver.rs | 6 ++---- pageserver/src/disk_usage_eviction_task.rs | 12 ++++++------ pageserver/src/http/routes.rs | 8 ++++---- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 7dcf6465e2..735b7124f1 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -8,9 +8,7 @@ use anyhow::{anyhow, Context}; use clap::{Arg, ArgAction, Command}; use fail::FailScenario; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; -use pageserver::disk_usage_eviction_task::{ - launch_disk_usage_global_eviction_task, DiskUsageEvictionState, -}; +use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use remote_storage::GenericRemoteStorage; use tracing::*; @@ -326,7 +324,7 @@ fn start_pageserver( // that allows triggering disk-usage based eviction manually. note that the http endpoint // is still accessible even if background task is not configured as long as remote storage has // been configured. - let disk_usage_eviction_state: Arc = Arc::default(); + let disk_usage_eviction_state: Arc = Arc::default(); if let Some(remote_storage) = &remote_storage { launch_disk_usage_global_eviction_task( diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index e0083c11bb..cbaae72133 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -71,15 +71,15 @@ pub struct DiskUsageEvictionTaskConfig { } #[derive(Default)] -pub struct DiskUsageEvictionState { - /// Prevent mgmt API requests and the background loop from running eviction at the same time. +pub struct State { + /// Exclude http requests and background task from running at the same time. mutex: tokio::sync::Mutex<()>, } pub fn launch_disk_usage_global_eviction_task( conf: &'static PageServerConf, storage: GenericRemoteStorage, - state: Arc, + state: Arc, ) -> anyhow::Result<()> { let Some(task_config) = &conf.disk_usage_based_eviction else { info!("disk usage based eviction task not configured"); @@ -124,7 +124,7 @@ pub fn launch_disk_usage_global_eviction_task( #[instrument(skip_all)] async fn disk_usage_eviction_task( - state: &DiskUsageEvictionState, + state: &State, task_config: &DiskUsageEvictionTaskConfig, storage: GenericRemoteStorage, tenants_dir_fd: Dir, @@ -192,7 +192,7 @@ pub trait Usage: Clone + Copy + std::fmt::Debug { } async fn disk_usage_eviction_task_iteration( - state: &DiskUsageEvictionState, + state: &State, task_config: &DiskUsageEvictionTaskConfig, storage: &GenericRemoteStorage, tenants_dir_fd: &mut SyncWrapper, @@ -284,7 +284,7 @@ struct LayerCount { #[allow(clippy::needless_late_init)] pub async fn disk_usage_eviction_task_iteration_impl( - state: &DiskUsageEvictionState, + state: &State, storage: &GenericRemoteStorage, usage_pre: U, cancel: &CancellationToken, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index d0ed97fefe..6699f6e5d4 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -18,7 +18,7 @@ use super::models::{ TimelineCreateRequest, TimelineGcRequest, TimelineInfo, }; use crate::context::{DownloadBehavior, RequestContext}; -use crate::disk_usage_eviction_task::DiskUsageEvictionState; +use crate::disk_usage_eviction_task; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; @@ -49,7 +49,7 @@ struct State { auth: Option>, allowlist_routes: Vec, remote_storage: Option, - disk_usage_eviction_state: Arc, + disk_usage_eviction_state: Arc, } impl State { @@ -57,7 +57,7 @@ impl State { conf: &'static PageServerConf, auth: Option>, remote_storage: Option, - disk_usage_eviction_state: Arc, + disk_usage_eviction_state: Arc, ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] .iter() @@ -1175,7 +1175,7 @@ pub fn make_router( launch_ts: &'static LaunchTimestamp, auth: Option>, remote_storage: Option, - disk_usage_eviction_state: Arc, + disk_usage_eviction_state: Arc, ) -> anyhow::Result> { let spec = include_bytes!("openapi_spec.yml"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); From 0943dd30eb3593bdc4c98ce53223c65d6529c607 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 28 Mar 2023 10:34:21 +0300 Subject: [PATCH 04/16] chore: clippy --- pageserver/src/disk_usage_eviction_task.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index cbaae72133..da02eb3451 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -107,7 +107,7 @@ pub fn launch_disk_usage_global_eviction_task( false, async move { disk_usage_eviction_task( - &*state, + &state, task_config, storage, tenants_dir_fd, From 17b5c8d1c446547ce08e7c32d6f5393e7de38310 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 28 Mar 2023 11:26:45 +0300 Subject: [PATCH 05/16] refactor: get rid of ApproxAccurate --- libs/utils/src/approx_accurate.rs | 100 --------------------- libs/utils/src/lib.rs | 1 - pageserver/src/disk_usage_eviction_task.rs | 48 +++++----- pageserver/src/tenant/timeline.rs | 8 +- 4 files changed, 31 insertions(+), 126 deletions(-) delete mode 100644 libs/utils/src/approx_accurate.rs diff --git a/libs/utils/src/approx_accurate.rs b/libs/utils/src/approx_accurate.rs deleted file mode 100644 index e7af36eec4..0000000000 --- a/libs/utils/src/approx_accurate.rs +++ /dev/null @@ -1,100 +0,0 @@ -/// Three-state `max` accumulator. -/// -/// If it accumulates over 0 or many `Some(T)` values, it is `Accurate` maximum of those values. -/// If a single `None` value is merged, it becomes `Approximate` variant. -/// -/// Remove when `Layer::file_size` is no longer an `Option`. -#[derive(Default, Debug, Clone, Copy)] -pub enum ApproxAccurate { - Approximate(T), - Accurate(T), - #[default] - Empty, -} - -impl ApproxAccurate { - /// `max(a, b)` where the approximate is inflicted receiving a `None`, or infected onwards. - #[must_use] - pub fn max(self, next: Option) -> ApproxAccurate { - use ApproxAccurate::*; - match (self, next) { - (Accurate(a) | Approximate(a), None) => Approximate(a), - (Empty, None) => Approximate(T::default()), - (Accurate(a), Some(b)) => Accurate(a.max(b)), - (Approximate(a), Some(b)) => Approximate(a.max(b)), - (Empty, Some(b)) => Accurate(b), - } - } - - pub fn is_approximate(&self) -> bool { - matches!(self, ApproxAccurate::Approximate(_)) - } - - pub fn accurate(self) -> Option { - use ApproxAccurate::*; - match self { - Accurate(a) => Some(a), - Empty => Some(T::default()), - Approximate(_) => None, - } - } - - pub fn unwrap_accurate_or(self, default: T) -> T { - use ApproxAccurate::*; - match self { - Accurate(a) => a, - Approximate(_) => default, - // Empty is still accurate, just special case for above `max` - Empty => T::default(), - } - } -} - -#[cfg(test)] -mod tests { - use super::ApproxAccurate; - - #[test] - fn accumulate_only_some() { - let acc = (0..=5) - .into_iter() - .map(Some) - .fold(ApproxAccurate::default(), |acc, next| acc.max(next)); - - assert_eq!(acc.accurate(), Some(5)); - assert!(!acc.is_approximate()); - assert_eq!(acc.unwrap_accurate_or(42), 5); - } - - #[test] - fn accumulate_some_and_none() { - let acc = [Some(0), None, Some(2)] - .into_iter() - .fold(ApproxAccurate::default(), |acc, next| acc.max(next)); - - assert_eq!(acc.accurate(), None); - assert!(acc.is_approximate()); - assert_eq!(acc.unwrap_accurate_or(42), 42); - } - - #[test] - fn accumulate_none_and_some() { - let acc = [None, Some(1), None] - .into_iter() - .fold(ApproxAccurate::default(), |acc, next| acc.max(next)); - - assert_eq!(acc.accurate(), None); - assert!(acc.is_approximate()); - assert_eq!(acc.unwrap_accurate_or(42), 42); - } - - #[test] - fn accumulate_none() { - let acc = ApproxAccurate::::default(); - - // it is accurate empty - assert_eq!(acc.accurate(), Some(0)); - assert!(!acc.is_approximate()); - assert_eq!(acc.unwrap_accurate_or(42), 0); - } -} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 6435b18d4b..8f572f0ec3 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -33,7 +33,6 @@ pub mod pid_file; // Misc pub mod accum; -pub mod approx_accurate; pub mod shutdown; // Utility for binding TcpListeners with proper socket options. diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index da02eb3451..2142124fb1 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -39,12 +39,7 @@ // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl // reading these fields. We use the Debug impl for semi-structured logging, though. -use std::{ - collections::HashMap, - ops::ControlFlow, - sync::{Arc, Mutex}, - time::Duration, -}; +use std::{collections::HashMap, ops::ControlFlow, sync::Arc, time::Duration}; use anyhow::Context; use nix::dir::Dir; @@ -54,7 +49,7 @@ use sync_wrapper::SyncWrapper; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn, Instrument}; -use utils::{approx_accurate::ApproxAccurate, id::TenantId, serde_percent::Percent}; +use utils::{id::TenantId, serde_percent::Percent}; use crate::{ config::PageServerConf, @@ -511,7 +506,7 @@ async fn extend_lru_candidates( return ControlFlow::Break(()); } - let mut max_layer_size = ApproxAccurate::default(); + let mut max_layer_size: Option = None; for tl in tenant.list_timelines() { if !tl.is_active() { continue; @@ -523,7 +518,11 @@ async fn extend_lru_candidates( .into_iter() .map(|layer_infos| (tl.clone(), layer_infos)), ); - max_layer_size = max_layer_size.max(info.max_layer_size.accurate()); + max_layer_size = match (max_layer_size, info.max_layer_size) { + (Some(x), Some(y)) => Some(x.max(y)), + (Some(only), None) | (None, Some(only)) => Some(only), + (None, None) => None, + }; if cancel.is_cancelled() { return ControlFlow::Break(()); @@ -538,21 +537,28 @@ async fn extend_lru_candidates( Mode::RespectTenantMinResidentSize => match tenant.get_min_resident_size_override() { Some(size) => size, None => { - match max_layer_size.accurate() { + match max_layer_size { Some(size) => size, None => { - let prod_max_layer_file_size = 332_880_000; - // rate-limit warning in case above comment is wrong and we're missing `LayerMetadata` for many layers - static LAST_WARNED: Mutex> = Mutex::new(None); - let mut last_warned = LAST_WARNED.lock().unwrap(); - if last_warned - .map(|v| v.elapsed() > Duration::from_secs(60)) - .unwrap_or(true) - { - warn!(value=prod_max_layer_file_size, "some layers don't have LayerMetadata to calculate max_layer_file_size, using default value"); - *last_warned = Some(Instant::now()); + if !scratch.is_empty() { + // soft assert + warn!("BUG: no maximum layer size, but still found layers"); + scratch.clear(); } - prod_max_layer_file_size + return ControlFlow::Continue(()); + + // let prod_max_layer_file_size = 332_880_000; + // // rate-limit warning in case above comment is wrong and we're missing `LayerMetadata` for many layers + // static LAST_WARNED: Mutex> = Mutex::new(None); + // let mut last_warned = LAST_WARNED.lock().unwrap(); + // if last_warned + // .map(|v| v.elapsed() > Duration::from_secs(60)) + // .unwrap_or(true) + // { + // warn!(value=prod_max_layer_file_size, "some layers don't have LayerMetadata to calculate max_layer_file_size, using default value"); + // *last_warned = Some(Instant::now()); + // } + // prod_max_layer_file_size } } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2433567316..91efde8f22 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -56,7 +56,6 @@ use pageserver_api::reltag::RelTag; use postgres_connection::PgConnectionConfig; use postgres_ffi::to_pg_timestamp; use utils::{ - approx_accurate::ApproxAccurate, id::{TenantId, TimelineId}, lsn::{AtomicLsn, Lsn, RecordLsn}, seqwait::SeqWait, @@ -4040,7 +4039,7 @@ impl Timeline { pub struct DiskUsageEvictionInfo { /// Timeline's largest layer (remote or resident) - pub max_layer_size: ApproxAccurate, + pub max_layer_size: Option, /// Timeline's resident layers pub resident_layers: Vec, } @@ -4073,11 +4072,12 @@ impl Timeline { pub(crate) fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { let layers = self.layers.read().unwrap(); - let mut max_layer_size = ApproxAccurate::default(); + let mut max_layer_size: Option = None; let mut resident_layers = Vec::new(); for l in layers.iter_historic_layers() { - max_layer_size = max_layer_size.max(Some(l.file_size())); + let file_size = l.file_size(); + max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); if l.is_remote_layer() { continue; From 6e8d7b449f0adb25cfbf0b68270e8e1f1662a71e Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 28 Mar 2023 11:28:52 +0300 Subject: [PATCH 06/16] refactor: combine nested macthes --- pageserver/src/disk_usage_eviction_task.rs | 49 ++++++++++------------ 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 2142124fb1..381494f6c8 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -534,35 +534,32 @@ async fn extend_lru_candidates( lru_candidates.append(scratch); return ControlFlow::Continue(()); } - Mode::RespectTenantMinResidentSize => match tenant.get_min_resident_size_override() { - Some(size) => size, - None => { - match max_layer_size { - Some(size) => size, - None => { - if !scratch.is_empty() { - // soft assert - warn!("BUG: no maximum layer size, but still found layers"); - scratch.clear(); - } - return ControlFlow::Continue(()); - - // let prod_max_layer_file_size = 332_880_000; - // // rate-limit warning in case above comment is wrong and we're missing `LayerMetadata` for many layers - // static LAST_WARNED: Mutex> = Mutex::new(None); - // let mut last_warned = LAST_WARNED.lock().unwrap(); - // if last_warned - // .map(|v| v.elapsed() > Duration::from_secs(60)) - // .unwrap_or(true) - // { - // warn!(value=prod_max_layer_file_size, "some layers don't have LayerMetadata to calculate max_layer_file_size, using default value"); - // *last_warned = Some(Instant::now()); - // } - // prod_max_layer_file_size + Mode::RespectTenantMinResidentSize => { + match tenant.get_min_resident_size_override().or(max_layer_size) { + Some(size) => size, + None => { + if !scratch.is_empty() { + // soft assert + warn!("BUG: no maximum layer size, but still found layers"); + scratch.clear(); } + return ControlFlow::Continue(()); + + // let prod_max_layer_file_size = 332_880_000; + // // rate-limit warning in case above comment is wrong and we're missing `LayerMetadata` for many layers + // static LAST_WARNED: Mutex> = Mutex::new(None); + // let mut last_warned = LAST_WARNED.lock().unwrap(); + // if last_warned + // .map(|v| v.elapsed() > Duration::from_secs(60)) + // .unwrap_or(true) + // { + // warn!(value=prod_max_layer_file_size, "some layers don't have LayerMetadata to calculate max_layer_file_size, using default value"); + // *last_warned = Some(Instant::now()); + // } + // prod_max_layer_file_size } } - }, + } }; scratch.sort_unstable_by_key(|(_, layer_info)| layer_info.last_activity_ts); From 03ab5df081d7f5f458175cf5a991e46166d9d918 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 28 Mar 2023 11:30:36 +0300 Subject: [PATCH 07/16] chore: remove dead code --- pageserver/src/disk_usage_eviction_task.rs | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 381494f6c8..b4f7e3d50a 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -544,19 +544,6 @@ async fn extend_lru_candidates( scratch.clear(); } return ControlFlow::Continue(()); - - // let prod_max_layer_file_size = 332_880_000; - // // rate-limit warning in case above comment is wrong and we're missing `LayerMetadata` for many layers - // static LAST_WARNED: Mutex> = Mutex::new(None); - // let mut last_warned = LAST_WARNED.lock().unwrap(); - // if last_warned - // .map(|v| v.elapsed() > Duration::from_secs(60)) - // .unwrap_or(true) - // { - // warn!(value=prod_max_layer_file_size, "some layers don't have LayerMetadata to calculate max_layer_file_size, using default value"); - // *last_warned = Some(Instant::now()); - // } - // prod_max_layer_file_size } } } From 75759f709f139f8e74abba31b9746702d5086eaf Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 28 Mar 2023 11:47:12 +0300 Subject: [PATCH 08/16] doc: explain "bug" message, log layers --- pageserver/src/disk_usage_eviction_task.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index b4f7e3d50a..c0ee42066b 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -538,9 +538,14 @@ async fn extend_lru_candidates( match tenant.get_min_resident_size_override().or(max_layer_size) { Some(size) => size, None => { + // the tenant has no layers at all. it's very unlikely but allowed by the + // types. if !scratch.is_empty() { // soft assert - warn!("BUG: no maximum layer size, but still found layers"); + warn!( + layers = scratch.len(), + "BUG: no maximum layer size, but still found layers" + ); scratch.clear(); } return ControlFlow::Continue(()); From 38d306114392264918a4af8d863391879eabcb47 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 28 Mar 2023 11:32:21 +0200 Subject: [PATCH 09/16] add comment on not-need to be 100% accurate about max_layer_size --- pageserver/src/disk_usage_eviction_task.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index c0ee42066b..719e69000f 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -506,6 +506,10 @@ async fn extend_lru_candidates( return ControlFlow::Break(()); } + // If one of the timelines becomes `!is_active()` during the iteration, + // for example because we're shutting down, then `max_layer_size` can be too small. + // That's OK. This code only runs under a disk pressure situation, and being + // a little unfair to tenants during shutdown in such a situation is tolerable. let mut max_layer_size: Option = None; for tl in tenant.list_timelines() { if !tl.is_active() { From 70c837a4b2c606ff36cf371b72195d3004039396 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 28 Mar 2023 12:30:07 +0300 Subject: [PATCH 10/16] refactor: simplify max_layer_size as u64 --- pageserver/src/disk_usage_eviction_task.rs | 29 ++++------------------ 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 719e69000f..6ae41c71d0 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -510,7 +510,7 @@ async fn extend_lru_candidates( // for example because we're shutting down, then `max_layer_size` can be too small. // That's OK. This code only runs under a disk pressure situation, and being // a little unfair to tenants during shutdown in such a situation is tolerable. - let mut max_layer_size: Option = None; + let mut max_layer_size = 0; for tl in tenant.list_timelines() { if !tl.is_active() { continue; @@ -522,11 +522,7 @@ async fn extend_lru_candidates( .into_iter() .map(|layer_infos| (tl.clone(), layer_infos)), ); - max_layer_size = match (max_layer_size, info.max_layer_size) { - (Some(x), Some(y)) => Some(x.max(y)), - (Some(only), None) | (None, Some(only)) => Some(only), - (None, None) => None, - }; + max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0)); if cancel.is_cancelled() { return ControlFlow::Break(()); @@ -538,24 +534,9 @@ async fn extend_lru_candidates( lru_candidates.append(scratch); return ControlFlow::Continue(()); } - Mode::RespectTenantMinResidentSize => { - match tenant.get_min_resident_size_override().or(max_layer_size) { - Some(size) => size, - None => { - // the tenant has no layers at all. it's very unlikely but allowed by the - // types. - if !scratch.is_empty() { - // soft assert - warn!( - layers = scratch.len(), - "BUG: no maximum layer size, but still found layers" - ); - scratch.clear(); - } - return ControlFlow::Continue(()); - } - } - } + Mode::RespectTenantMinResidentSize => tenant + .get_min_resident_size_override() + .unwrap_or(max_layer_size), }; scratch.sort_unstable_by_key(|(_, layer_info)| layer_info.last_activity_ts); From 54cc1d5064e1ab9cb80a9f65d656a6dc944e6a34 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 28 Mar 2023 11:34:44 +0200 Subject: [PATCH 11/16] doc: sub-headings for mechanics & policy in module comment --- pageserver/src/disk_usage_eviction_task.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 6ae41c71d0..7b8b5ca95c 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -1,5 +1,7 @@ //! This module implements the pageserver-global disk-usage-based layer eviction task. //! +//! # Mechanics +//! //! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background //! loop that evicts layers in response to a shortage of available bytes //! in the $repo/tenants directory's filesystem. @@ -13,6 +15,8 @@ //! We're good if that second statvfs shows that we're _actually_ below the configured thresholds. //! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further. //! +//! # Eviction Policy +//! //! There are two thresholds: //! `max_usage_pct` is the relative available space, expressed in percent of the total filesystem space. //! If the actual usage is higher, the threshold is exceeded. From 0056108c456ff0be07d57a8cbc3f05cbb2c56e23 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 28 Mar 2023 11:36:02 +0200 Subject: [PATCH 12/16] doc: remove stray comment --- pageserver/src/disk_usage_eviction_task.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 7b8b5ca95c..607898c6f5 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -298,7 +298,6 @@ pub async fn disk_usage_eviction_task_iteration_impl( let mut usage_planned_global_lru = None; // achieved post-eviction usage according to internal accounting let mut usage_assumed = usage_pre; - // actual usage read after batched evictions debug!(?usage_pre, "disk usage"); From 42d63270a57b89a520a1f78bac961764a1623ba4 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 28 Mar 2023 11:42:47 +0200 Subject: [PATCH 13/16] doc: add comment on extend_lru_candidates --- pageserver/src/disk_usage_eviction_task.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 607898c6f5..9dc1340d2e 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -486,6 +486,11 @@ enum Mode { GlobalLru, } +/// Figure out eviction candidates for the given tenant and append them to `lru_candidates`. +/// +/// The `scratch` vector is temporary storage and taken as an argument to avoid allocations. +/// It must be empty when calling this function. It is guaranteed to be empty when we +/// return `ControlFlow::Continue`. #[instrument(skip_all, fields(?mode, %tenant_id))] async fn extend_lru_candidates( mode: Mode, From 0428b6822a0b4646734fb52e9b047de668704774 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 28 Mar 2023 11:53:48 +0200 Subject: [PATCH 14/16] doc: more comment on lru_candidates to address questions from review --- pageserver/src/disk_usage_eviction_task.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 9dc1340d2e..eef64ae51c 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -488,6 +488,12 @@ enum Mode { /// Figure out eviction candidates for the given tenant and append them to `lru_candidates`. /// +/// The `mode` argument controls which layers get appended to `lru_candidates`. +/// Read its type's doc comments for more details. +/// +/// The caller is responsible for sorting `lru_candidates` once it has called this function +/// for all tenants. +/// /// The `scratch` vector is temporary storage and taken as an argument to avoid allocations. /// It must be empty when calling this function. It is guaranteed to be empty when we /// return `ControlFlow::Continue`. From 453c3fd2da34623dbbd3dbf7cb05a5a2818fe8d0 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 28 Mar 2023 12:22:26 +0200 Subject: [PATCH 15/16] doc: fix typo Co-authored-by: Heikki Linnakangas --- libs/utils/src/serde_percent.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/utils/src/serde_percent.rs b/libs/utils/src/serde_percent.rs index 386f34460e..63b62b5f1e 100644 --- a/libs/utils/src/serde_percent.rs +++ b/libs/utils/src/serde_percent.rs @@ -1,4 +1,4 @@ -//! A serde::Desierialize type for percentages. +//! A serde::Deserialize type for percentages. //! //! See [`Percent`] for details. From 18ed0f9a066212b564c596bf604d6738d53f9ea1 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 28 Mar 2023 12:44:40 +0200 Subject: [PATCH 16/16] enable disk-usage-based eviction in staging We'll do some manual testing of the statvfs loop there. --- .github/ansible/staging.eu-west-1.hosts.yaml | 8 ++++++++ .github/ansible/staging.us-east-2.hosts.yaml | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index b537795704..e8d0bb1dc7 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -8,6 +8,14 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 80 + # TODO: learn typical resident-size growth rate [GiB/minute] and configure + # min_avail_bytes such that we have X minutes of headroom. + min_avail_bytes: 0 + # We assume that the worst-case growth rate is small enough that we can + # catch above-threshold conditions by checking every 10s. + period: "10s" tenant_config: eviction_policy: kind: "LayerAccessThreshold" diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index cd8f832af0..4ef51651fc 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -8,6 +8,14 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 80 + # TODO: learn typical resident-size growth rate [GiB/minute] and configure + # min_avail_bytes such that we have X minutes of headroom. + min_avail_bytes: 0 + # We assume that the worst-case growth rate is small enough that we can + # catch above-threshold conditions by checking every 10s. + period: "10s" tenant_config: eviction_policy: kind: "LayerAccessThreshold"