From bc7a82caf2d56b6ee6ce80ece76aeb100d276e31 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 22 Feb 2024 13:58:59 +0200 Subject: [PATCH] feat: bare-bones /v1/utilization (#6831) PR adds a simple at most 1Hz refreshed informational API for querying pageserver utilization. In this first phase, no actual background calculation is performed. Instead, the worst possible score is always returned. The returned bytes information is however correct. Cc: #6835 Cc: #5331 --- Cargo.lock | 1 + libs/pageserver_api/Cargo.toml | 1 + libs/pageserver_api/src/models.rs | 3 + libs/pageserver_api/src/models/utilization.rs | 70 +++++++++++++++++++ pageserver/src/http/openapi_spec.yml | 46 ++++++++++++ pageserver/src/http/routes.rs | 51 ++++++++++++++ pageserver/src/lib.rs | 1 + pageserver/src/utilization.rs | 38 ++++++++++ 8 files changed, 211 insertions(+) create mode 100644 libs/pageserver_api/src/models/utilization.rs create mode 100644 pageserver/src/utilization.rs diff --git a/Cargo.lock b/Cargo.lock index 51c433cd07..abb335e97c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3552,6 +3552,7 @@ dependencies = [ "const_format", "enum-map", "hex", + "humantime", "humantime-serde", "itertools", "postgres_ffi", diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 938910caea..3bba89c76d 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -18,6 +18,7 @@ enum-map.workspace = true strum.workspace = true strum_macros.workspace = true hex.workspace = true +humantime.workspace = true thiserror.workspace = true humantime-serde.workspace = true chrono.workspace = true diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index b68ab9fd59..36aafe7341 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1,4 +1,7 @@ pub mod partitioning; +pub mod utilization; + +pub use utilization::PageserverUtilization; use std::{ collections::HashMap, diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs new file mode 100644 index 0000000000..7195a12395 --- /dev/null +++ b/libs/pageserver_api/src/models/utilization.rs @@ -0,0 +1,70 @@ +use std::time::SystemTime; + +/// Pageserver current utilization and scoring for how good candidate the pageserver would be for +/// the next tenant. +/// +/// See and maintain pageserver openapi spec for `/v1/utilization_score` as the truth. +/// +/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might +/// not handle full u64 values properly. +#[derive(serde::Serialize, Debug)] +pub struct PageserverUtilization { + /// Used disk space + #[serde(serialize_with = "ser_saturating_u63")] + pub disk_usage_bytes: u64, + /// Free disk space + #[serde(serialize_with = "ser_saturating_u63")] + pub free_space_bytes: u64, + /// Lower is better score for how good candidate for a next tenant would this pageserver be. + #[serde(serialize_with = "ser_saturating_u63")] + pub utilization_score: u64, + /// When was this snapshot captured, pageserver local time. + /// + /// Use millis to give confidence that the value is regenerated often enough. + #[serde(serialize_with = "ser_rfc3339_millis")] + pub captured_at: SystemTime, +} + +fn ser_rfc3339_millis( + ts: &SystemTime, + serializer: S, +) -> Result { + serializer.collect_str(&humantime::format_rfc3339_millis(*ts)) +} + +/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients. +/// +/// Instead of newtype, use this because a newtype would get require handling deserializing values +/// with the highest bit set which is properly parsed by serde formats, but would create a +/// conundrum on how to handle and again serialize such values at type level. It will be a few +/// years until we can use more than `i64::MAX` bytes on a disk. +fn ser_saturating_u63(value: &u64, serializer: S) -> Result { + const MAX_FORMAT_INT64: u64 = i64::MAX as u64; + + let value = (*value).min(MAX_FORMAT_INT64); + + serializer.serialize_u64(value) +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use super::*; + + #[test] + fn u64_max_is_serialized_as_u63_max() { + let doc = PageserverUtilization { + disk_usage_bytes: u64::MAX, + free_space_bytes: 0, + utilization_score: u64::MAX, + captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), + }; + + let s = serde_json::to_string(&doc).unwrap(); + + let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#; + + assert_eq!(s, expected); + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index a6fe7c67e1..479c7ca0f5 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -1379,6 +1379,25 @@ paths: schema: $ref: "#/components/schemas/ServiceUnavailableError" + /v1/utilization: + get: + description: | + Returns the pageservers current utilization and fitness score for new tenants. + + responses: + "200": + description: Pageserver utilization and fitness score + content: + application/json: + schema: + $ref: "#/components/schemas/PageserverUtilization" + "500": + description: Generic operation error + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + components: securitySchemes: JWT: @@ -1691,6 +1710,33 @@ components: type: string enum: [past, present, future, nodata] + PageserverUtilization: + type: object + required: + - disk_usage_bytes + - free_space_bytes + - utilization_score + properties: + disk_usage_bytes: + type: integer + format: int64 + minimum: 0 + description: The amount of disk space currently utilized by layer files. + free_space_bytes: + type: integer + format: int64 + minimum: 0 + description: The amount of usable disk space left. + utilization_score: + type: integer + format: int64 + minimum: 0 + maximum: 9223372036854775807 + default: 9223372036854775807 + description: | + Lower is better score for how good this pageserver would be for the next tenant. + The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated. + Error: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 175353762c..1339229a70 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -100,6 +100,7 @@ pub struct State { disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, secondary_controller: SecondaryController, + latest_utilization: tokio::sync::Mutex>, } impl State { @@ -128,6 +129,7 @@ impl State { disk_usage_eviction_state, deletion_queue_client, secondary_controller, + latest_utilization: Default::default(), }) } } @@ -1963,6 +1965,54 @@ async fn put_io_engine_handler( json_response(StatusCode::OK, ()) } +/// Polled by control plane. +/// +/// See [`crate::utilization`]. +async fn get_utilization( + r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + // this probably could be completely public, but lets make that change later. + check_permission(&r, None)?; + + let state = get_state(&r); + let mut g = state.latest_utilization.lock().await; + + let regenerate_every = Duration::from_secs(1); + let still_valid = g + .as_ref() + .is_some_and(|(captured_at, _)| captured_at.elapsed() < regenerate_every); + + // avoid needless statvfs calls even though those should be non-blocking fast. + // regenerate at most 1Hz to allow polling at any rate. + if !still_valid { + let path = state.conf.tenants_path(); + let doc = crate::utilization::regenerate(path.as_std_path()) + .map_err(ApiError::InternalServerError)?; + + let mut buf = Vec::new(); + serde_json::to_writer(&mut buf, &doc) + .context("serialize") + .map_err(ApiError::InternalServerError)?; + + let body = bytes::Bytes::from(buf); + + *g = Some((std::time::Instant::now(), body)); + } + + // hyper 0.14 doesn't yet have Response::clone so this is a bit of extra legwork + let cached = g.as_ref().expect("just set").1.clone(); + + Response::builder() + .header(hyper::http::header::CONTENT_TYPE, "application/json") + // thought of using http date header, but that is second precision which does not give any + // debugging aid + .status(StatusCode::OK) + .body(hyper::Body::from(cached)) + .context("build response") + .map_err(ApiError::InternalServerError) +} + /// Common functionality of all the HTTP API handlers. /// /// - Adds a tracing span to each request (by `request_span`) @@ -2224,5 +2274,6 @@ pub fn make_router( |r| api_handler(r, timeline_collect_keyspace), ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) + .get("/v1/utilization", |r| api_handler(r, get_utilization)) .any(handler_404)) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index c3f35142ec..cf6856458a 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -22,6 +22,7 @@ pub(crate) mod statvfs; pub mod task_mgr; pub mod tenant; pub mod trace; +pub mod utilization; pub mod virtual_file; pub mod walingest; pub mod walrecord; diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs new file mode 100644 index 0000000000..830c9897ca --- /dev/null +++ b/pageserver/src/utilization.rs @@ -0,0 +1,38 @@ +//! An utilization metric which is used to decide on which pageserver to put next tenant. +//! +//! The metric is exposed via `GET /v1/utilization`. Refer and maintain it's openapi spec as the +//! truth. + +use anyhow::Context; +use std::path::Path; + +use pageserver_api::models::PageserverUtilization; + +pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result { + // TODO: currently the http api ratelimits this to 1Hz at most, which is probably good enough + + let statvfs = nix::sys::statvfs::statvfs(tenants_path) + .map_err(std::io::Error::from) + .context("statvfs tenants directory")?; + + let blocksz = statvfs.block_size(); + + #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))] + let free = statvfs.blocks_available() as u64 * blocksz; + let used = crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.get(); + let captured_at = std::time::SystemTime::now(); + + let doc = PageserverUtilization { + disk_usage_bytes: used, + free_space_bytes: free, + // lower is better; start with a constant + // + // note that u64::MAX will be output as i64::MAX as u64, but that should not matter + utilization_score: u64::MAX, + captured_at, + }; + + // TODO: make utilization_score into a metric + + Ok(doc) +}