diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs
index e88cab5d6a..0fec221276 100644
--- a/libs/pageserver_api/src/models/utilization.rs
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -1,4 +1,5 @@
-use utils::serde_system_time::SystemTime;
+use std::time::SystemTime;
+use utils::{serde_percent::Percent, serde_system_time};
 
 /// Pageserver current utilization and scoring for how good candidate the pageserver would be for
 /// the next tenant.
@@ -9,19 +10,88 @@ use utils::serde_system_time::SystemTime;
 /// not handle full u64 values properly.
 #[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
 pub struct PageserverUtilization {
-    /// Used disk space
+    /// Used disk space (physical, ground truth from statfs())
     #[serde(serialize_with = "ser_saturating_u63")]
     pub disk_usage_bytes: u64,
     /// Free disk space
     #[serde(serialize_with = "ser_saturating_u63")]
     pub free_space_bytes: u64,
-    /// Lower is better score for how good candidate for a next tenant would this pageserver be.
-    #[serde(serialize_with = "ser_saturating_u63")]
+
+    /// Wanted disk space, based on the tenant shards currently present on this pageserver: this
+    /// is like disk_usage_bytes, but it is stable and does not change with the cache state of
+    /// tenants, whereas disk_usage_bytes may reach the disk eviction `max_usage_pct` and stay
+    /// there, or may be unrealistically low if the pageserver has attached tenants which haven't
+    /// downloaded layers yet.
+    #[serde(serialize_with = "ser_saturating_u63", default)]
+    pub disk_wanted_bytes: u64,
+
+    /// What proportion of total disk space will this pageserver use before it starts evicting data?
+    #[serde(default = "unity_percent")]
+    pub disk_usable_pct: Percent,
+
+    /// How many shards are currently on this node?
+    #[serde(default)]
+    pub shard_count: u32,
+
+    /// How many shards should this node be able to handle at most?
+    #[serde(default)]
+    pub max_shard_count: u32,
+
+    /// Cached result of [`Self::score`]
     pub utilization_score: u64,
+
     /// When was this snapshot captured, pageserver local time.
     ///
     /// Use millis to give confidence that the value is regenerated often enough.
-    pub captured_at: SystemTime,
+    pub captured_at: serde_system_time::SystemTime,
 }
+
+fn unity_percent() -> Percent {
+    Percent::new(0).unwrap()
+}
+
+impl PageserverUtilization {
+    const UTILIZATION_FULL: u64 = 1000000;
+
+    /// Calculate a utilization score. The result is to be interpreted as a fraction of
+    /// [`Self::UTILIZATION_FULL`].
+    ///
+    /// Lower values mean the node has more spare capacity for new work:
+    /// - UTILIZATION_FULL represents an ideal node which is fully utilized and should not
+    ///   receive any more work.
+    /// - 0 represents an empty node (the score is unsigned, so negative values cannot occur).
+    /// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded
+    ///   performance due to layer eviction.
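For intuition, here is a standalone worked sketch of the scoring arithmetic above, with made-up disk and shard numbers:

    // Worked example of the score() arithmetic, using illustrative values.
    fn main() {
        const UTILIZATION_FULL: u64 = 1_000_000;

        // A 1 TiB disk, 400 GiB used, with eviction kicking in at 80% usage:
        let disk_usage_bytes: u64 = 400 << 30;
        let free_space_bytes: u64 = 624 << 30;
        let disk_wanted_bytes: u64 = 600 << 30; // shards would like 600 GiB resident
        let disk_usable_pct: u64 = 80;

        let disk_usable_capacity = ((disk_usage_bytes + free_space_bytes) * disk_usable_pct) / 100;
        let disk_score = disk_wanted_bytes * UTILIZATION_FULL / disk_usable_capacity;

        // 5000 shards against a 20000-shard ceiling:
        let shard_score = 5000u64 * UTILIZATION_FULL / 20000;

        // The more constrained dimension wins: 600 GiB wanted / 819.2 GiB usable ~= 73.2%.
        assert_eq!(disk_score.max(shard_score), disk_score);
        println!("disk={disk_score} shard={shard_score}"); // prints disk=732421 shard=250000
    }

Both ratios are computed in integer arithmetic against UTILIZATION_FULL = 1,000,000, so scores carry roughly six significant digits without any floating point.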
+    pub fn score(&self) -> u64 {
+        let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
+            * self.disk_usable_pct.get() as u64)
+            / 100;
+        let disk_utilization_score =
+            self.disk_wanted_bytes * Self::UTILIZATION_FULL / disk_usable_capacity;
+
+        let shard_utilization_score =
+            self.shard_count as u64 * Self::UTILIZATION_FULL / self.max_shard_count as u64;
+        std::cmp::max(disk_utilization_score, shard_utilization_score)
+    }
+
+    pub fn refresh_score(&mut self) {
+        self.utilization_score = self.score();
+    }
+
+    /// A utilization structure with a full utilization score: use this as a placeholder when
+    /// you need a utilization but don't have real values yet.
+    pub fn full() -> Self {
+        Self {
+            disk_usage_bytes: 1,
+            free_space_bytes: 0,
+            disk_wanted_bytes: 1,
+            disk_usable_pct: Percent::new(100).unwrap(),
+            shard_count: 1,
+            max_shard_count: 1,
+            utilization_score: Self::UTILIZATION_FULL,
+            captured_at: serde_system_time::SystemTime(SystemTime::now()),
+        }
+    }
 }
 
 /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
@@ -49,15 +119,19 @@ mod tests {
         let doc = PageserverUtilization {
             disk_usage_bytes: u64::MAX,
             free_space_bytes: 0,
-            utilization_score: u64::MAX,
-            captured_at: SystemTime(
+            disk_wanted_bytes: u64::MAX,
+            utilization_score: 13,
+            disk_usable_pct: Percent::new(90).unwrap(),
+            shard_count: 100,
+            max_shard_count: 200,
+            captured_at: serde_system_time::SystemTime(
                 std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
             ),
         };
 
         let s = serde_json::to_string(&doc).unwrap();
-        let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#;
+        let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"disk_wanted_bytes":9223372036854775807,"disk_usable_pct":90,"shard_count":100,"max_shard_count":200,"utilization_score":13,"captured_at":"2024-02-21T10:02:59.000Z"}"#;
 
         assert_eq!(s, expected);
     }
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index a983d8c4c2..2b0156079e 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2357,8 +2357,9 @@ async fn get_utilization(
     // regenerate at most 1Hz to allow polling at any rate.
     if !still_valid {
         let path = state.conf.tenants_path();
-        let doc = crate::utilization::regenerate(path.as_std_path())
-            .map_err(ApiError::InternalServerError)?;
+        let doc =
+            crate::utilization::regenerate(state.conf, path.as_std_path(), &state.tenant_manager)
+                .map_err(ApiError::InternalServerError)?;
 
         let mut buf = Vec::new();
         serde_json::to_writer(&mut buf, &doc)
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index cfdb32f755..a238004aad 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3732,6 +3732,19 @@ impl Tenant {
     pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
         self.tenant_conf.load().tenant_conf.clone()
     }
+
+    /// How much local storage would this tenant like to have? It can cope with
+    /// less than this (via eviction and on-demand downloads), but this function enables
+    /// the Tenant to advertise how much storage it would prefer to have, so that it can
+    /// provide fast I/O by keeping important things on local disk.
+    pub(crate) fn local_storage_wanted(&self) -> u64 {
+        let mut wanted = 0;
+        let timelines = self.timelines.lock().unwrap();
+        for timeline in timelines.values() {
+            wanted += timeline.metrics.visible_physical_size_gauge.get();
+        }
+        wanted
+    }
 }
 
 /// Create the cluster temporarily in 'initdbpath' directory inside the repository
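The `default` attributes on the new fields keep the wire format backward compatible: a document written by a pageserver that predates this change should still deserialize. A minimal sketch, assuming `Percent` implements `Deserialize` and that `serde_system_time` parses the same timestamp format it serializes:

    // Sketch: an old-format document (no disk_wanted_bytes / disk_usable_pct /
    // shard counts) deserializes via the per-field serde defaults.
    use pageserver_api::models::PageserverUtilization;

    fn main() {
        let old = r#"{"disk_usage_bytes":100,"free_space_bytes":900,"utilization_score":42,"captured_at":"2024-02-21T10:02:59.000Z"}"#;
        let doc: PageserverUtilization = serde_json::from_str(old).unwrap();
        assert_eq!(doc.disk_wanted_bytes, 0); // plain #[serde(default)]
        assert_eq!(doc.disk_usable_pct.get(), 0); // the unity_percent() default
        assert_eq!(doc.shard_count, 0);
        assert_eq!(doc.max_shard_count, 0);
    }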
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 3316627540..c8a11e88cc 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2085,6 +2085,57 @@ impl TenantManager {
             }
         }
     }
+
+    /// Calculate the tenant shards' contributions to this pageserver's utilization metrics. The
+    /// returned values are:
+    /// - the number of bytes of local disk space this pageserver's shards are requesting, i.e.
+    ///   how much space they would use if not impacted by disk usage eviction.
+    /// - the number of tenant shards currently on this pageserver, including attached
+    ///   and secondary.
+    ///
+    /// This function is quite expensive: callers are expected to cache the result and
+    /// limit how often they call it.
+    pub(crate) fn calculate_utilization(&self) -> Result<(u64, u32), TenantMapListError> {
+        let tenants = self.tenants.read().unwrap();
+        let m = match &*tenants {
+            TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
+            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
+        };
+        let shard_count = m.len();
+        let mut wanted_bytes = 0;
+
+        for tenant_slot in m.values() {
+            match tenant_slot {
+                TenantSlot::InProgress(_barrier) => {
+                    // While a slot is being changed, we can't know how much storage it wants. This
+                    // means this function's output can fluctuate if a lot of changes are in flight
+                    // (such as transitions from secondary to attached).
+                    //
+                    // We could wait for the barrier and retry, but it's important that the
+                    // utilization API is responsive, and the impact on data quality is minor.
+                    continue;
+                }
+                TenantSlot::Attached(tenant) => {
+                    wanted_bytes += tenant.local_storage_wanted();
+                }
+                TenantSlot::Secondary(secondary) => {
+                    let progress = secondary.progress.lock().unwrap();
+                    wanted_bytes += if progress.heatmap_mtime.is_some() {
+                        // If we have heatmap info, then we will 'want' the sum
+                        // of the sizes of layers in the heatmap: this is how much space
+                        // we would use if not doing any eviction.
+                        progress.bytes_total
+                    } else {
+                        // In the absence of heatmap info, assume that the secondary location
+                        // simply needs as much space as it is currently using.
+                        secondary.resident_size_metric.get()
+                    };
+                }
+            }
+        }
+
+        Ok((wanted_bytes, shard_count as u32))
+    }
 }
 
 #[derive(Debug, thiserror::Error)]
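The per-slot accounting above reduces to a small pure function; the sketch below restates it with a hypothetical `SlotKind` enum standing in for `TenantSlot` and its metrics (none of these stand-in types are real pageserver types):

    // Hypothetical simplification of the aggregation: SlotKind flattens just
    // the fields the calculation reads from each TenantSlot variant.
    enum SlotKind {
        InProgress,                             // skipped: size unknown mid-transition
        Attached { local_storage_wanted: u64 }, // sum of visible physical timeline sizes
        Secondary { heatmap_total: Option<u64>, resident: u64 },
    }

    fn wanted_bytes(slots: &[SlotKind]) -> u64 {
        slots
            .iter()
            .map(|s| match s {
                SlotKind::InProgress => 0,
                SlotKind::Attached { local_storage_wanted } => *local_storage_wanted,
                // Prefer the heatmap total; fall back to currently-resident bytes.
                SlotKind::Secondary { heatmap_total, resident } => heatmap_total.unwrap_or(*resident),
            })
            .sum()
    }

    fn main() {
        let slots = [
            SlotKind::Attached { local_storage_wanted: 10 },
            SlotKind::Secondary { heatmap_total: Some(7), resident: 3 },
            SlotKind::Secondary { heatmap_total: None, resident: 3 },
            SlotKind::InProgress,
        ];
        assert_eq!(wanted_bytes(&slots), 20);
    }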
diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs
index e6c835aa75..3c48c84598 100644
--- a/pageserver/src/utilization.rs
+++ b/pageserver/src/utilization.rs
@@ -5,12 +5,17 @@
 use anyhow::Context;
 use std::path::Path;
+use utils::serde_percent::Percent;
 
 use pageserver_api::models::PageserverUtilization;
 
-pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtilization> {
-    // TODO: currently the http api ratelimits this to 1Hz at most, which is probably good enough
+use crate::{config::PageServerConf, tenant::mgr::TenantManager};
+
+pub(crate) fn regenerate(
+    conf: &PageServerConf,
+    tenants_path: &Path,
+    tenant_manager: &TenantManager,
+) -> anyhow::Result<PageserverUtilization> {
     let statvfs = nix::sys::statvfs::statvfs(tenants_path)
         .map_err(std::io::Error::from)
         .context("statvfs tenants directory")?;
@@ -34,16 +39,31 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtilization> {
     let captured_at = std::time::SystemTime::now();
 
+    let (disk_wanted_bytes, shard_count) = tenant_manager.calculate_utilization()?;
+
+    let disk_usable_pct = match &conf.disk_usage_based_eviction {
+        Some(e) => e.max_usage_pct,
+        None => Percent::new(100).unwrap(),
+    };
+
+    // Express a static value for how many shards we may schedule on one node
+    const MAX_SHARDS: u32 = 20000;
+
-    let doc = PageserverUtilization {
+    let mut doc = PageserverUtilization {
         disk_usage_bytes: used,
         free_space_bytes: free,
-        // lower is better; start with a constant
-        //
-        // note that u64::MAX will be output as i64::MAX as u64, but that should not matter
-        utilization_score: u64::MAX,
+        disk_wanted_bytes,
+        disk_usable_pct,
+        shard_count,
+        max_shard_count: MAX_SHARDS,
+        utilization_score: 0,
         captured_at: utils::serde_system_time::SystemTime(captured_at),
     };
 
+    doc.refresh_score();
+
+    // TODO: make utilization_score into a metric
+
     Ok(doc)
 }
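On the consuming side, e.g. a scheduler placing a new tenant shard, the cached score reduces candidate selection to a single comparison. A hypothetical sketch; `pick_node` and the `(node id, utilization)` pairing are illustrative rather than real storage controller APIs:

    use pageserver_api::models::PageserverUtilization;

    // Hypothetical helper: choose the least-utilized pageserver for a new shard.
    fn pick_node(candidates: &[(u64, PageserverUtilization)]) -> Option<u64> {
        candidates
            .iter()
            .min_by_key(|(_, util)| util.utilization_score) // lower = more spare capacity
            .map(|(id, _)| *id)
    }

    fn main() {
        let mut a = PageserverUtilization::full(); // placeholder scores as fully loaded
        a.utilization_score = 250_000;
        let b = PageserverUtilization::full();
        assert_eq!(pick_node(&[(1, a), (2, b)]), Some(1));
    }

Using the cached `utilization_score` rather than recomputing `score()` keeps consumers agnostic to the formula, so the scoring inputs can evolve without changing every caller.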