diff --git a/Cargo.lock b/Cargo.lock index 51c433cd07..abb335e97c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3552,6 +3552,7 @@ dependencies = [ "const_format", "enum-map", "hex", + "humantime", "humantime-serde", "itertools", "postgres_ffi", diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 938910caea..3bba89c76d 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -18,6 +18,7 @@ enum-map.workspace = true strum.workspace = true strum_macros.workspace = true hex.workspace = true +humantime.workspace = true thiserror.workspace = true humantime-serde.workspace = true chrono.workspace = true diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index b68ab9fd59..36aafe7341 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1,4 +1,7 @@ pub mod partitioning; +pub mod utilization; + +pub use utilization::PageserverUtilization; use std::{ collections::HashMap, diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs new file mode 100644 index 0000000000..7195a12395 --- /dev/null +++ b/libs/pageserver_api/src/models/utilization.rs @@ -0,0 +1,70 @@ +use std::time::SystemTime; + +/// Pageserver current utilization and scoring for how good candidate the pageserver would be for +/// the next tenant. +/// +/// See and maintain pageserver openapi spec for `/v1/utilization_score` as the truth. +/// +/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might +/// not handle full u64 values properly. +#[derive(serde::Serialize, Debug)] +pub struct PageserverUtilization { + /// Used disk space + #[serde(serialize_with = "ser_saturating_u63")] + pub disk_usage_bytes: u64, + /// Free disk space + #[serde(serialize_with = "ser_saturating_u63")] + pub free_space_bytes: u64, + /// Lower is better score for how good candidate for a next tenant would this pageserver be. + #[serde(serialize_with = "ser_saturating_u63")] + pub utilization_score: u64, + /// When was this snapshot captured, pageserver local time. + /// + /// Use millis to give confidence that the value is regenerated often enough. + #[serde(serialize_with = "ser_rfc3339_millis")] + pub captured_at: SystemTime, +} + +fn ser_rfc3339_millis( + ts: &SystemTime, + serializer: S, +) -> Result { + serializer.collect_str(&humantime::format_rfc3339_millis(*ts)) +} + +/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients. +/// +/// Instead of newtype, use this because a newtype would get require handling deserializing values +/// with the highest bit set which is properly parsed by serde formats, but would create a +/// conundrum on how to handle and again serialize such values at type level. It will be a few +/// years until we can use more than `i64::MAX` bytes on a disk. +fn ser_saturating_u63(value: &u64, serializer: S) -> Result { + const MAX_FORMAT_INT64: u64 = i64::MAX as u64; + + let value = (*value).min(MAX_FORMAT_INT64); + + serializer.serialize_u64(value) +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use super::*; + + #[test] + fn u64_max_is_serialized_as_u63_max() { + let doc = PageserverUtilization { + disk_usage_bytes: u64::MAX, + free_space_bytes: 0, + utilization_score: u64::MAX, + captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), + }; + + let s = serde_json::to_string(&doc).unwrap(); + + let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#; + + assert_eq!(s, expected); + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index a6fe7c67e1..479c7ca0f5 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -1379,6 +1379,25 @@ paths: schema: $ref: "#/components/schemas/ServiceUnavailableError" + /v1/utilization: + get: + description: | + Returns the pageservers current utilization and fitness score for new tenants. + + responses: + "200": + description: Pageserver utilization and fitness score + content: + application/json: + schema: + $ref: "#/components/schemas/PageserverUtilization" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + components: securitySchemes: JWT: @@ -1691,6 +1710,33 @@ components: type: string enum: [past, present, future, nodata] + PageserverUtilization: + type: object + required: + - disk_usage_bytes + - free_space_bytes + - utilization_score + properties: + disk_usage_bytes: + type: integer + format: int64 + minimum: 0 + description: The amount of disk space currently utilized by layer files. + free_space_bytes: + type: integer + format: int64 + minimum: 0 + description: The amount of usable disk space left. + utilization_score: + type: integer + format: int64 + minimum: 0 + maximum: 9223372036854775807 + default: 9223372036854775807 + description: | + Lower is better score for how good this pageserver would be for the next tenant. + The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated. + Error: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 175353762c..1339229a70 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -100,6 +100,7 @@ pub struct State { disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, secondary_controller: SecondaryController, + latest_utilization: tokio::sync::Mutex>, } impl State { @@ -128,6 +129,7 @@ impl State { disk_usage_eviction_state, deletion_queue_client, secondary_controller, + latest_utilization: Default::default(), }) } } @@ -1963,6 +1965,54 @@ async fn put_io_engine_handler( json_response(StatusCode::OK, ()) } +/// Polled by control plane. +/// +/// See [`crate::utilization`]. +async fn get_utilization( + r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + // this probably could be completely public, but lets make that change later. + check_permission(&r, None)?; + + let state = get_state(&r); + let mut g = state.latest_utilization.lock().await; + + let regenerate_every = Duration::from_secs(1); + let still_valid = g + .as_ref() + .is_some_and(|(captured_at, _)| captured_at.elapsed() < regenerate_every); + + // avoid needless statvfs calls even though those should be non-blocking fast. + // regenerate at most 1Hz to allow polling at any rate. + if !still_valid { + let path = state.conf.tenants_path(); + let doc = crate::utilization::regenerate(path.as_std_path()) + .map_err(ApiError::InternalServerError)?; + + let mut buf = Vec::new(); + serde_json::to_writer(&mut buf, &doc) + .context("serialize") + .map_err(ApiError::InternalServerError)?; + + let body = bytes::Bytes::from(buf); + + *g = Some((std::time::Instant::now(), body)); + } + + // hyper 0.14 doesn't yet have Response::clone so this is a bit of extra legwork + let cached = g.as_ref().expect("just set").1.clone(); + + Response::builder() + .header(hyper::http::header::CONTENT_TYPE, "application/json") + // thought of using http date header, but that is second precision which does not give any + // debugging aid + .status(StatusCode::OK) + .body(hyper::Body::from(cached)) + .context("build response") + .map_err(ApiError::InternalServerError) +} + /// Common functionality of all the HTTP API handlers. /// /// - Adds a tracing span to each request (by `request_span`) @@ -2224,5 +2274,6 @@ pub fn make_router( |r| api_handler(r, timeline_collect_keyspace), ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) + .get("/v1/utilization", |r| api_handler(r, get_utilization)) .any(handler_404)) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index c3f35142ec..cf6856458a 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -22,6 +22,7 @@ pub(crate) mod statvfs; pub mod task_mgr; pub mod tenant; pub mod trace; +pub mod utilization; pub mod virtual_file; pub mod walingest; pub mod walrecord; diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs new file mode 100644 index 0000000000..830c9897ca --- /dev/null +++ b/pageserver/src/utilization.rs @@ -0,0 +1,38 @@ +//! An utilization metric which is used to decide on which pageserver to put next tenant. +//! +//! The metric is exposed via `GET /v1/utilization`. Refer and maintain it's openapi spec as the +//! truth. + +use anyhow::Context; +use std::path::Path; + +use pageserver_api::models::PageserverUtilization; + +pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result { + // TODO: currently the http api ratelimits this to 1Hz at most, which is probably good enough + + let statvfs = nix::sys::statvfs::statvfs(tenants_path) + .map_err(std::io::Error::from) + .context("statvfs tenants directory")?; + + let blocksz = statvfs.block_size(); + + #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))] + let free = statvfs.blocks_available() as u64 * blocksz; + let used = crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.get(); + let captured_at = std::time::SystemTime::now(); + + let doc = PageserverUtilization { + disk_usage_bytes: used, + free_space_bytes: free, + // lower is better; start with a constant + // + // note that u64::MAX will be output as i64::MAX as u64, but that should not matter + utilization_score: u64::MAX, + captured_at, + }; + + // TODO: make utilization_score into a metric + + Ok(doc) +}