diff --git a/CODEOWNERS b/CODEOWNERS index 5b601f0566..9a23e8c958 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,12 +1,13 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute /control_plane/attachment_service @neondatabase/storage /libs/pageserver_api/ @neondatabase/storage -/libs/postgres_ffi/ @neondatabase/compute +/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers /libs/remote_storage/ @neondatabase/storage /libs/safekeeper_api/ @neondatabase/safekeepers /libs/vm_monitor/ @neondatabase/autoscaling /pageserver/ @neondatabase/storage /pgxn/ @neondatabase/compute +/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers /proxy/ @neondatabase/proxy /safekeeper/ @neondatabase/safekeepers /vendor/ @neondatabase/compute diff --git a/Cargo.lock b/Cargo.lock index 625958fdb8..9930a6c323 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -277,6 +277,7 @@ dependencies = [ "anyhow", "aws-config", "aws-sdk-secretsmanager", + "bytes", "camino", "clap", "control_plane", @@ -288,6 +289,8 @@ dependencies = [ "hex", "humantime", "hyper", + "lasso", + "measured", "metrics", "once_cell", "pageserver_api", @@ -295,6 +298,7 @@ dependencies = [ "postgres_connection", "r2d2", "reqwest", + "routerify", "serde", "serde_json", "thiserror", @@ -2880,6 +2884,35 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" +[[package]] +name = "measured" +version = "0.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f246648d027839a34b420e27c7de1165ace96e19ef894985d0a6ff89a7840a9f" +dependencies = [ + "bytes", + "hashbrown 0.14.0", + "itoa", + "lasso", + "measured-derive", + "memchr", + "parking_lot 0.12.1", + "rustc-hash", + "ryu", +] + +[[package]] +name = "measured-derive" +version = "0.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edaa5cc22d99d5d6d7d99c3b5b5f7e7f8034c22f1b5d62a1adecd2ed005d9b80" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.52", +] + [[package]] name = "memchr" version = "2.6.4" @@ -3901,7 +3934,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -3914,7 +3947,7 @@ dependencies = [ [[package]] name = "postgres-native-tls" version = "0.5.0" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "native-tls", "tokio", @@ -3925,7 +3958,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "base64 0.20.0", "byteorder", @@ -3938,12 +3971,13 @@ dependencies = [ "rand 0.8.5", "sha2", "stringprep", + "tokio", ] [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -4225,6 +4259,7 @@ dependencies = [ "smallvec", "smol_str", "socket2 0.5.5", + "subtle", "sync_wrapper", "task-local-extensions", "thiserror", @@ -5945,7 +5980,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "async-trait", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 3f98c1946c..65308db3c1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -101,6 +101,7 @@ lasso = "0.7" leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" +measured = { version = "0.0.13", features=["default", "lasso"] } memoffset = "0.8" native-tls = "0.2" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } @@ -148,6 +149,7 @@ smol_str = { version = "0.2.0", features = ["serde"] } socket2 = "0.5" strum = "0.24" strum_macros = "0.24" +"subtle" = "2.5.0" svg_fmt = "0.4.1" sync_wrapper = "0.1.2" tar = "0.4" diff --git a/compute_tools/README.md b/compute_tools/README.md index 22a7de7cb7..8d84031efc 100644 --- a/compute_tools/README.md +++ b/compute_tools/README.md @@ -32,6 +32,29 @@ compute_ctl -D /var/db/postgres/compute \ -b /usr/local/bin/postgres ``` +## State Diagram + +Computes can be in various states. Below is a diagram that details how a +compute moves between states. + +```mermaid +%% https://mermaid.js.org/syntax/stateDiagram.html +stateDiagram-v2 + [*] --> Empty : Compute spawned + Empty --> ConfigurationPending : Waiting for compute spec + ConfigurationPending --> Configuration : Received compute spec + Configuration --> Failed : Failed to configure the compute + Configuration --> Running : Compute has been configured + Empty --> Init : Compute spec is immediately available + Empty --> TerminationPending : Requested termination + Init --> Failed : Failed to start Postgres + Init --> Running : Started Postgres + Running --> TerminationPending : Requested termination + TerminationPending --> Terminated : Terminated compute + Failed --> [*] : Compute exited + Terminated --> [*] : Compute exited +``` + ## Tests Cargo formatter: diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index f78f56c480..34882659e3 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -17,6 +17,7 @@ testing = [] anyhow.workspace = true aws-config.workspace = true aws-sdk-secretsmanager.workspace = true +bytes.workspace = true camino.workspace = true clap.workspace = true fail.workspace = true @@ -25,17 +26,20 @@ git-version.workspace = true hex.workspace = true hyper.workspace = true humantime.workspace = true +lasso.workspace = true once_cell.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true postgres_connection.workspace = true reqwest.workspace = true +routerify.workspace = true serde.workspace = true serde_json.workspace = true thiserror.workspace = true tokio.workspace = true tokio-util.workspace = true tracing.workspace = true +measured.workspace = true diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] } diesel_migrations = { version = "2.1.0" } diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 076b3a2f70..036019cd38 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -1,5 +1,11 @@ +use crate::metrics::{ + HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup, + METRICS_REGISTRY, +}; use crate::reconciler::ReconcileError; use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; +use futures::Future; +use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; use pageserver_api::models::{ @@ -34,6 +40,8 @@ use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; use control_plane::storage_controller::{AttachHookRequest, InspectRequest}; +use routerify::Middleware; + /// State available to HTTP request handlers #[derive(Clone)] pub struct HttpState { @@ -313,7 +321,7 @@ async fn handle_tenant_timeline_passthrough( tracing::info!("Proxying request for tenant {} ({})", tenant_id, path); // Find the node that holds shard zero - let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?; + let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?; // Callers will always pass an unsharded tenant ID. Before proxying, we must // rewrite this to a shard-aware shard zero ID. @@ -322,12 +330,39 @@ async fn handle_tenant_timeline_passthrough( let tenant_shard_str = format!("{}", tenant_shard_id); let path = path.replace(&tenant_str, &tenant_shard_str); - let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref()); + let latency = &METRICS_REGISTRY + .metrics_group + .storage_controller_passthrough_request_latency; + + // This is a bit awkward. We remove the param from the request + // and join the words by '_' to get a label for the request. + let just_path = path.replace(&tenant_shard_str, ""); + let path_label = just_path + .split('/') + .filter(|token| !token.is_empty()) + .collect::>() + .join("_"); + let labels = PageserverRequestLabelGroup { + pageserver_id: &node.get_id().to_string(), + path: &path_label, + method: crate::metrics::Method::Get, + }; + + let _timer = latency.start_timer(labels.clone()); + + let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref()); let resp = client.get_raw(path).await.map_err(|_e| // FIXME: give APiError a proper Unavailable variant. We return 503 here because // if we can't successfully send a request to the pageserver, we aren't available. ApiError::ShuttingDown)?; + if !resp.status().is_success() { + let error_counter = &METRICS_REGISTRY + .metrics_group + .storage_controller_passthrough_request_error; + error_counter.inc(labels); + } + // We have a reqest::Response, would like a http::Response let mut builder = hyper::Response::builder() .status(resp.status()) @@ -498,7 +533,11 @@ impl From for ApiError { /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only /// be allowed to run if Service has finished its initial reconciliation. -async fn tenant_service_handler(request: Request, handler: H) -> R::Output +async fn tenant_service_handler( + request: Request, + handler: H, + request_name: RequestName, +) -> R::Output where R: std::future::Future, ApiError>> + Send + 'static, H: FnOnce(Arc, Request) -> R + Send + Sync + 'static, @@ -518,9 +557,10 @@ where )); } - request_span( + named_request_span( request, |request| async move { handler(service, request).await }, + request_name, ) .await } @@ -531,11 +571,98 @@ fn check_permissions(request: &Request, required_scope: Scope) -> Result<( }) } +#[derive(Clone, Debug)] +struct RequestMeta { + method: hyper::http::Method, + at: Instant, +} + +fn prologue_metrics_middleware( +) -> Middleware { + Middleware::pre(move |req| async move { + let meta = RequestMeta { + method: req.method().clone(), + at: Instant::now(), + }; + + req.set_context(meta); + + Ok(req) + }) +} + +fn epilogue_metrics_middleware( +) -> Middleware { + Middleware::post_with_info(move |resp, req_info| async move { + let request_name = match req_info.context::() { + Some(name) => name, + None => { + return Ok(resp); + } + }; + + if let Some(meta) = req_info.context::() { + let status = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_http_request_status; + let latency = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_http_request_latency; + + status.inc(HttpRequestStatusLabelGroup { + path: request_name.0, + method: meta.method.clone().into(), + status: crate::metrics::StatusCode(resp.status()), + }); + + latency.observe( + HttpRequestLatencyLabelGroup { + path: request_name.0, + method: meta.method.into(), + }, + meta.at.elapsed().as_secs_f64(), + ); + } + Ok(resp) + }) +} + +pub async fn measured_metrics_handler(_req: Request) -> Result, ApiError> { + pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4"; + + let payload = crate::metrics::METRICS_REGISTRY.encode(); + let response = Response::builder() + .status(200) + .header(CONTENT_TYPE, TEXT_FORMAT) + .body(payload.into()) + .unwrap(); + + Ok(response) +} + +#[derive(Clone)] +struct RequestName(&'static str); + +async fn named_request_span( + request: Request, + handler: H, + name: RequestName, +) -> R::Output +where + R: Future, ApiError>> + Send + 'static, + H: FnOnce(Request) -> R + Send + Sync + 'static, +{ + request.set_context(name); + request_span(request, handler).await +} + pub fn make_router( service: Arc, auth: Option>, ) -> RouterBuilder { - let mut router = endpoint::make_router(); + let mut router = endpoint::make_router() + .middleware(prologue_metrics_middleware()) + .middleware(epilogue_metrics_middleware()); if auth.is_some() { router = router.middleware(auth_middleware(|request| { let state = get_state(request); @@ -544,99 +671,166 @@ pub fn make_router( } else { state.auth.as_deref() } - })) + })); } router .data(Arc::new(HttpState::new(service, auth))) + .get("/metrics", |r| { + named_request_span(r, measured_metrics_handler, RequestName("metrics")) + }) // Non-prefixed generic endpoints (status, metrics) - .get("/status", |r| request_span(r, handle_status)) - .get("/ready", |r| request_span(r, handle_ready)) + .get("/status", |r| { + named_request_span(r, handle_status, RequestName("status")) + }) + .get("/ready", |r| { + named_request_span(r, handle_ready, RequestName("ready")) + }) // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix .post("/upcall/v1/re-attach", |r| { - request_span(r, handle_re_attach) + named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach")) + }) + .post("/upcall/v1/validate", |r| { + named_request_span(r, handle_validate, RequestName("upcall_v1_validate")) }) - .post("/upcall/v1/validate", |r| request_span(r, handle_validate)) // Test/dev/debug endpoints .post("/debug/v1/attach-hook", |r| { - request_span(r, handle_attach_hook) + named_request_span(r, handle_attach_hook, RequestName("debug_v1_attach_hook")) + }) + .post("/debug/v1/inspect", |r| { + named_request_span(r, handle_inspect, RequestName("debug_v1_inspect")) }) - .post("/debug/v1/inspect", |r| request_span(r, handle_inspect)) .post("/debug/v1/tenant/:tenant_id/drop", |r| { - request_span(r, handle_tenant_drop) + named_request_span(r, handle_tenant_drop, RequestName("debug_v1_tenant_drop")) }) .post("/debug/v1/node/:node_id/drop", |r| { - request_span(r, handle_node_drop) + named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop")) + }) + .get("/debug/v1/tenant", |r| { + named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant")) }) - .get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump)) .get("/debug/v1/tenant/:tenant_id/locate", |r| { - tenant_service_handler(r, handle_tenant_locate) + tenant_service_handler( + r, + handle_tenant_locate, + RequestName("debug_v1_tenant_locate"), + ) }) .get("/debug/v1/scheduler", |r| { - request_span(r, handle_scheduler_dump) + named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler")) }) .post("/debug/v1/consistency_check", |r| { - request_span(r, handle_consistency_check) + named_request_span( + r, + handle_consistency_check, + RequestName("debug_v1_consistency_check"), + ) }) .put("/debug/v1/failpoints", |r| { request_span(r, |r| failpoints_handler(r, CancellationToken::new())) }) // Node operations .post("/control/v1/node", |r| { - request_span(r, handle_node_register) + named_request_span(r, handle_node_register, RequestName("control_v1_node")) + }) + .get("/control/v1/node", |r| { + named_request_span(r, handle_node_list, RequestName("control_v1_node")) }) - .get("/control/v1/node", |r| request_span(r, handle_node_list)) .put("/control/v1/node/:node_id/config", |r| { - request_span(r, handle_node_configure) + named_request_span( + r, + handle_node_configure, + RequestName("control_v1_node_config"), + ) }) // Tenant Shard operations .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { - tenant_service_handler(r, handle_tenant_shard_migrate) + tenant_service_handler( + r, + handle_tenant_shard_migrate, + RequestName("control_v1_tenant_migrate"), + ) }) .put("/control/v1/tenant/:tenant_id/shard_split", |r| { - tenant_service_handler(r, handle_tenant_shard_split) + tenant_service_handler( + r, + handle_tenant_shard_split, + RequestName("control_v1_tenant_shard_split"), + ) }) .get("/control/v1/tenant/:tenant_id", |r| { - tenant_service_handler(r, handle_tenant_describe) + tenant_service_handler( + r, + handle_tenant_describe, + RequestName("control_v1_tenant_describe"), + ) }) // Tenant operations // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. .post("/v1/tenant", |r| { - tenant_service_handler(r, handle_tenant_create) + tenant_service_handler(r, handle_tenant_create, RequestName("v1_tenant")) }) .delete("/v1/tenant/:tenant_id", |r| { - tenant_service_handler(r, handle_tenant_delete) + tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant")) }) .put("/v1/tenant/config", |r| { - tenant_service_handler(r, handle_tenant_config_set) + tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config")) }) .get("/v1/tenant/:tenant_id/config", |r| { - tenant_service_handler(r, handle_tenant_config_get) + tenant_service_handler(r, handle_tenant_config_get, RequestName("v1_tenant_config")) }) .put("/v1/tenant/:tenant_shard_id/location_config", |r| { - tenant_service_handler(r, handle_tenant_location_config) + tenant_service_handler( + r, + handle_tenant_location_config, + RequestName("v1_tenant_location_config"), + ) }) .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| { - tenant_service_handler(r, handle_tenant_time_travel_remote_storage) + tenant_service_handler( + r, + handle_tenant_time_travel_remote_storage, + RequestName("v1_tenant_time_travel_remote_storage"), + ) }) .post("/v1/tenant/:tenant_id/secondary/download", |r| { - tenant_service_handler(r, handle_tenant_secondary_download) + tenant_service_handler( + r, + handle_tenant_secondary_download, + RequestName("v1_tenant_secondary_download"), + ) }) // Timeline operations .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { - tenant_service_handler(r, handle_tenant_timeline_delete) + tenant_service_handler( + r, + handle_tenant_timeline_delete, + RequestName("v1_tenant_timeline"), + ) }) .post("/v1/tenant/:tenant_id/timeline", |r| { - tenant_service_handler(r, handle_tenant_timeline_create) + tenant_service_handler( + r, + handle_tenant_timeline_create, + RequestName("v1_tenant_timeline"), + ) }) // Tenant detail GET passthrough to shard zero .get("/v1/tenant/:tenant_id", |r| { - tenant_service_handler(r, handle_tenant_timeline_passthrough) + tenant_service_handler( + r, + handle_tenant_timeline_passthrough, + RequestName("v1_tenant_passthrough"), + ) }) // Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future // timeline GET APIs will be implicitly included. .get("/v1/tenant/:tenant_id/timeline*", |r| { - tenant_service_handler(r, handle_tenant_timeline_passthrough) + tenant_service_handler( + r, + handle_tenant_timeline_passthrough, + RequestName("v1_tenant_timeline_passthrough"), + ) }) } diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs index 4aff29f15b..8bcd5c0ac4 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/control_plane/attachment_service/src/lib.rs @@ -8,6 +8,7 @@ pub mod http; mod id_lock_map; pub mod metrics; mod node; +mod pageserver_client; pub mod persistence; mod reconciler; mod scheduler; diff --git a/control_plane/attachment_service/src/metrics.rs b/control_plane/attachment_service/src/metrics.rs index ffe093b9c8..ccf5e9b07c 100644 --- a/control_plane/attachment_service/src/metrics.rs +++ b/control_plane/attachment_service/src/metrics.rs @@ -1,32 +1,284 @@ -use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; +//! +//! This module provides metric definitions for the storage controller. +//! +//! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds +//! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`] +//! constant. +//! +//! The rest of the code defines label group types and deals with converting outer types to labels. +//! +use bytes::Bytes; +use measured::{ + label::{LabelValue, StaticLabelSet}, + FixedCardinalityLabel, MetricGroup, +}; use once_cell::sync::Lazy; +use std::sync::Mutex; -pub(crate) struct ReconcilerMetrics { - pub(crate) spawned: IntCounter, - pub(crate) complete: IntCounterVec, -} +use crate::persistence::{DatabaseError, DatabaseOperation}; -impl ReconcilerMetrics { - // Labels used on [`Self::complete`] - pub(crate) const SUCCESS: &'static str = "ok"; - pub(crate) const ERROR: &'static str = "success"; - pub(crate) const CANCEL: &'static str = "cancel"; -} - -pub(crate) static RECONCILER: Lazy = Lazy::new(|| ReconcilerMetrics { - spawned: register_int_counter!( - "storage_controller_reconcile_spawn", - "Count of how many times we spawn a reconcile task", - ) - .expect("failed to define a metric"), - complete: register_int_counter_vec!( - "storage_controller_reconcile_complete", - "Reconciler tasks completed, broken down by success/failure/cancelled", - &["status"], - ) - .expect("failed to define a metric"), -}); +pub(crate) static METRICS_REGISTRY: Lazy = + Lazy::new(StorageControllerMetrics::default); pub fn preinitialize_metrics() { - Lazy::force(&RECONCILER); + Lazy::force(&METRICS_REGISTRY); +} + +pub(crate) struct StorageControllerMetrics { + pub(crate) metrics_group: StorageControllerMetricGroup, + encoder: Mutex, +} + +#[derive(measured::MetricGroup)] +pub(crate) struct StorageControllerMetricGroup { + /// Count of how many times we spawn a reconcile task + pub(crate) storage_controller_reconcile_spawn: measured::Counter, + /// Reconciler tasks completed, broken down by success/failure/cancelled + pub(crate) storage_controller_reconcile_complete: + measured::CounterVec, + + /// HTTP request status counters for handled requests + pub(crate) storage_controller_http_request_status: + measured::CounterVec, + /// HTTP request handler latency across all status codes + pub(crate) storage_controller_http_request_latency: + measured::HistogramVec, + + /// Count of HTTP requests to the pageserver that resulted in an error, + /// broken down by the pageserver node id, request name and method + pub(crate) storage_controller_pageserver_request_error: + measured::CounterVec, + + /// Latency of HTTP requests to the pageserver, broken down by pageserver + /// node id, request name and method. This include both successful and unsuccessful + /// requests. + pub(crate) storage_controller_pageserver_request_latency: + measured::HistogramVec, + + /// Count of pass-through HTTP requests to the pageserver that resulted in an error, + /// broken down by the pageserver node id, request name and method + pub(crate) storage_controller_passthrough_request_error: + measured::CounterVec, + + /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver + /// node id, request name and method. This include both successful and unsuccessful + /// requests. + pub(crate) storage_controller_passthrough_request_latency: + measured::HistogramVec, + + /// Count of errors in database queries, broken down by error type and operation. + pub(crate) storage_controller_database_query_error: + measured::CounterVec, + + /// Latency of database queries, broken down by operation. + pub(crate) storage_controller_database_query_latency: + measured::HistogramVec, +} + +impl StorageControllerMetrics { + pub(crate) fn encode(&self) -> Bytes { + let mut encoder = self.encoder.lock().unwrap(); + self.metrics_group.collect_into(&mut *encoder); + encoder.finish() + } +} + +impl Default for StorageControllerMetrics { + fn default() -> Self { + Self { + metrics_group: StorageControllerMetricGroup::new(), + encoder: Mutex::new(measured::text::TextEncoder::new()), + } + } +} + +impl StorageControllerMetricGroup { + pub(crate) fn new() -> Self { + Self { + storage_controller_reconcile_spawn: measured::Counter::new(), + storage_controller_reconcile_complete: measured::CounterVec::new( + ReconcileCompleteLabelGroupSet { + status: StaticLabelSet::new(), + }, + ), + storage_controller_http_request_status: measured::CounterVec::new( + HttpRequestStatusLabelGroupSet { + path: lasso::ThreadedRodeo::new(), + method: StaticLabelSet::new(), + status: StaticLabelSet::new(), + }, + ), + storage_controller_http_request_latency: measured::HistogramVec::new( + measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), + ), + storage_controller_pageserver_request_error: measured::CounterVec::new( + PageserverRequestLabelGroupSet { + pageserver_id: lasso::ThreadedRodeo::new(), + path: lasso::ThreadedRodeo::new(), + method: StaticLabelSet::new(), + }, + ), + storage_controller_pageserver_request_latency: measured::HistogramVec::new( + measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), + ), + storage_controller_passthrough_request_error: measured::CounterVec::new( + PageserverRequestLabelGroupSet { + pageserver_id: lasso::ThreadedRodeo::new(), + path: lasso::ThreadedRodeo::new(), + method: StaticLabelSet::new(), + }, + ), + storage_controller_passthrough_request_latency: measured::HistogramVec::new( + measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), + ), + storage_controller_database_query_error: measured::CounterVec::new( + DatabaseQueryErrorLabelGroupSet { + operation: StaticLabelSet::new(), + error_type: StaticLabelSet::new(), + }, + ), + storage_controller_database_query_latency: measured::HistogramVec::new( + measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0), + ), + } + } +} + +#[derive(measured::LabelGroup)] +#[label(set = ReconcileCompleteLabelGroupSet)] +pub(crate) struct ReconcileCompleteLabelGroup { + pub(crate) status: ReconcileOutcome, +} + +#[derive(measured::LabelGroup)] +#[label(set = HttpRequestStatusLabelGroupSet)] +pub(crate) struct HttpRequestStatusLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo)] + pub(crate) path: &'a str, + pub(crate) method: Method, + pub(crate) status: StatusCode, +} + +#[derive(measured::LabelGroup)] +#[label(set = HttpRequestLatencyLabelGroupSet)] +pub(crate) struct HttpRequestLatencyLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo)] + pub(crate) path: &'a str, + pub(crate) method: Method, +} + +impl Default for HttpRequestLatencyLabelGroupSet { + fn default() -> Self { + Self { + path: lasso::ThreadedRodeo::new(), + method: StaticLabelSet::new(), + } + } +} + +#[derive(measured::LabelGroup, Clone)] +#[label(set = PageserverRequestLabelGroupSet)] +pub(crate) struct PageserverRequestLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo)] + pub(crate) pageserver_id: &'a str, + #[label(dynamic_with = lasso::ThreadedRodeo)] + pub(crate) path: &'a str, + pub(crate) method: Method, +} + +impl Default for PageserverRequestLabelGroupSet { + fn default() -> Self { + Self { + pageserver_id: lasso::ThreadedRodeo::new(), + path: lasso::ThreadedRodeo::new(), + method: StaticLabelSet::new(), + } + } +} + +#[derive(measured::LabelGroup)] +#[label(set = DatabaseQueryErrorLabelGroupSet)] +pub(crate) struct DatabaseQueryErrorLabelGroup { + pub(crate) error_type: DatabaseErrorLabel, + pub(crate) operation: DatabaseOperation, +} + +#[derive(measured::LabelGroup)] +#[label(set = DatabaseQueryLatencyLabelGroupSet)] +pub(crate) struct DatabaseQueryLatencyLabelGroup { + pub(crate) operation: DatabaseOperation, +} + +#[derive(FixedCardinalityLabel)] +pub(crate) enum ReconcileOutcome { + #[label(rename = "ok")] + Success, + Error, + Cancel, +} + +#[derive(FixedCardinalityLabel, Clone)] +pub(crate) enum Method { + Get, + Put, + Post, + Delete, + Other, +} + +impl From for Method { + fn from(value: hyper::Method) -> Self { + if value == hyper::Method::GET { + Method::Get + } else if value == hyper::Method::PUT { + Method::Put + } else if value == hyper::Method::POST { + Method::Post + } else if value == hyper::Method::DELETE { + Method::Delete + } else { + Method::Other + } + } +} + +pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode); + +impl LabelValue for StatusCode { + fn visit(&self, v: V) -> V::Output { + v.write_int(self.0.as_u16() as u64) + } +} + +impl FixedCardinalityLabel for StatusCode { + fn cardinality() -> usize { + (100..1000).len() + } + + fn encode(&self) -> usize { + self.0.as_u16() as usize + } + + fn decode(value: usize) -> Self { + Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap()) + } +} + +#[derive(FixedCardinalityLabel)] +pub(crate) enum DatabaseErrorLabel { + Query, + Connection, + ConnectionPool, + Logical, +} + +impl DatabaseError { + pub(crate) fn error_label(&self) -> DatabaseErrorLabel { + match self { + Self::Query(_) => DatabaseErrorLabel::Query, + Self::Connection(_) => DatabaseErrorLabel::Connection, + Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool, + Self::Logical(_) => DatabaseErrorLabel::Logical, + } + } } diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs index 4167782715..df40bff66f 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/control_plane/attachment_service/src/node.rs @@ -12,7 +12,9 @@ use serde::Serialize; use tokio_util::sync::CancellationToken; use utils::{backoff, id::NodeId}; -use crate::{persistence::NodePersistence, scheduler::MaySchedule}; +use crate::{ + pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule, +}; /// Represents the in-memory description of a Node. /// @@ -202,7 +204,7 @@ impl Node { cancel: &CancellationToken, ) -> Option> where - O: FnMut(mgmt_api::Client) -> F, + O: FnMut(PageserverClient) -> F, F: std::future::Future>, { fn is_fatal(e: &mgmt_api::Error) -> bool { @@ -224,8 +226,12 @@ impl Node { .build() .expect("Failed to construct HTTP client"); - let client = - mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref()); + let client = PageserverClient::from_client( + self.get_id(), + http_client, + self.base_url(), + jwt.as_deref(), + ); let node_cancel_fut = self.cancel.cancelled(); diff --git a/control_plane/attachment_service/src/pageserver_client.rs b/control_plane/attachment_service/src/pageserver_client.rs new file mode 100644 index 0000000000..8237229d7b --- /dev/null +++ b/control_plane/attachment_service/src/pageserver_client.rs @@ -0,0 +1,203 @@ +use pageserver_api::{ + models::{ + LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, + TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, + }, + shard::TenantShardId, +}; +use pageserver_client::mgmt_api::{Client, Result}; +use reqwest::StatusCode; +use utils::id::{NodeId, TimelineId}; + +/// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage +/// controller to collect metrics in a non-intrusive manner. +#[derive(Debug, Clone)] +pub(crate) struct PageserverClient { + inner: Client, + node_id_label: String, +} + +macro_rules! measured_request { + ($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{ + let labels = crate::metrics::PageserverRequestLabelGroup { + pageserver_id: $node_id, + path: $name, + method: $method, + }; + + let latency = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_request_latency; + let _timer_guard = latency.start_timer(labels.clone()); + + let res = $invoke; + + if res.is_err() { + let error_counters = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_request_error; + error_counters.inc(labels) + } + + res + }}; +} + +impl PageserverClient { + pub(crate) fn new(node_id: NodeId, mgmt_api_endpoint: String, jwt: Option<&str>) -> Self { + Self { + inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt), + node_id_label: node_id.0.to_string(), + } + } + + pub(crate) fn from_client( + node_id: NodeId, + raw_client: reqwest::Client, + mgmt_api_endpoint: String, + jwt: Option<&str>, + ) -> Self { + Self { + inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt), + node_id_label: node_id.0.to_string(), + } + } + + pub(crate) async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result { + measured_request!( + "tenant", + crate::metrics::Method::Delete, + &self.node_id_label, + self.inner.tenant_delete(tenant_shard_id).await + ) + } + + pub(crate) async fn tenant_time_travel_remote_storage( + &self, + tenant_shard_id: TenantShardId, + timestamp: &str, + done_if_after: &str, + ) -> Result<()> { + measured_request!( + "tenant_time_travel_remote_storage", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .tenant_time_travel_remote_storage(tenant_shard_id, timestamp, done_if_after) + .await + ) + } + + pub(crate) async fn tenant_secondary_download( + &self, + tenant_id: TenantShardId, + wait: Option, + ) -> Result<(StatusCode, SecondaryProgress)> { + measured_request!( + "tenant_secondary_download", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.tenant_secondary_download(tenant_id, wait).await + ) + } + + pub(crate) async fn location_config( + &self, + tenant_shard_id: TenantShardId, + config: LocationConfig, + flush_ms: Option, + lazy: bool, + ) -> Result<()> { + measured_request!( + "location_config", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .location_config(tenant_shard_id, config, flush_ms, lazy) + .await + ) + } + + pub(crate) async fn list_location_config(&self) -> Result { + measured_request!( + "location_configs", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.list_location_config().await + ) + } + + pub(crate) async fn get_location_config( + &self, + tenant_shard_id: TenantShardId, + ) -> Result> { + measured_request!( + "location_config", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.get_location_config(tenant_shard_id).await + ) + } + + pub(crate) async fn timeline_create( + &self, + tenant_shard_id: TenantShardId, + req: &TimelineCreateRequest, + ) -> Result { + measured_request!( + "timeline", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.timeline_create(tenant_shard_id, req).await + ) + } + + pub(crate) async fn timeline_delete( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + measured_request!( + "timeline", + crate::metrics::Method::Delete, + &self.node_id_label, + self.inner + .timeline_delete(tenant_shard_id, timeline_id) + .await + ) + } + + pub(crate) async fn tenant_shard_split( + &self, + tenant_shard_id: TenantShardId, + req: TenantShardSplitRequest, + ) -> Result { + measured_request!( + "tenant_shard_split", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner.tenant_shard_split(tenant_shard_id, req).await + ) + } + + pub(crate) async fn timeline_list( + &self, + tenant_shard_id: &TenantShardId, + ) -> Result> { + measured_request!( + "timelines", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.timeline_list(tenant_shard_id).await + ) + } + + pub(crate) async fn get_utilization(&self) -> Result { + measured_request!( + "utilization", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.get_utilization().await + ) + } +} diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs index 209d8ff075..dafd52017b 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/control_plane/attachment_service/src/persistence.rs @@ -19,6 +19,9 @@ use serde::{Deserialize, Serialize}; use utils::generation::Generation; use utils::id::{NodeId, TenantId}; +use crate::metrics::{ + DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY, +}; use crate::node::Node; /// ## What do we store? @@ -75,6 +78,25 @@ pub(crate) enum DatabaseError { Logical(String), } +#[derive(measured::FixedCardinalityLabel, Clone)] +pub(crate) enum DatabaseOperation { + InsertNode, + UpdateNode, + DeleteNode, + ListNodes, + BeginShardSplit, + CompleteShardSplit, + AbortShardSplit, + Detach, + ReAttach, + IncrementGeneration, + ListTenantShards, + InsertTenantShards, + UpdateTenantShard, + DeleteTenant, + UpdateTenantConfig, +} + #[must_use] pub(crate) enum AbortShardSplitStatus { /// We aborted the split in the database by reverting to the parent shards @@ -115,6 +137,34 @@ impl Persistence { } } + /// Wraps `with_conn` in order to collect latency and error metrics + async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult + where + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + R: Send + 'static, + { + let latency = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_latency; + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { + operation: op.clone(), + }); + + let res = self.with_conn(func).await; + + if let Err(err) = &res { + let error_counter = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_error; + error_counter.inc(DatabaseQueryErrorLabelGroup { + error_type: err.error_label(), + operation: op, + }) + } + + res + } + /// Call the provided function in a tokio blocking thread, with a Diesel database connection. async fn with_conn(&self, func: F) -> DatabaseResult where @@ -130,21 +180,27 @@ impl Persistence { /// When a node is first registered, persist it before using it for anything pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> { let np = node.to_persistent(); - self.with_conn(move |conn| -> DatabaseResult<()> { - diesel::insert_into(crate::schema::nodes::table) - .values(&np) - .execute(conn)?; - Ok(()) - }) + self.with_measured_conn( + DatabaseOperation::InsertNode, + move |conn| -> DatabaseResult<()> { + diesel::insert_into(crate::schema::nodes::table) + .values(&np) + .execute(conn)?; + Ok(()) + }, + ) .await } /// At startup, populate the list of nodes which our shards may be placed on pub(crate) async fn list_nodes(&self) -> DatabaseResult> { let nodes: Vec = self - .with_conn(move |conn| -> DatabaseResult<_> { - Ok(crate::schema::nodes::table.load::(conn)?) - }) + .with_measured_conn( + DatabaseOperation::ListNodes, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::nodes::table.load::(conn)?) + }, + ) .await?; tracing::info!("list_nodes: loaded {} nodes", nodes.len()); @@ -159,7 +215,7 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; let updated = self - .with_conn(move |conn| { + .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| { let updated = diesel::update(nodes) .filter(node_id.eq(input_node_id.0 as i64)) .set((scheduling_policy.eq(String::from(input_scheduling)),)) @@ -181,9 +237,12 @@ impl Persistence { /// be enriched at runtime with state discovered on pageservers. pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { let loaded = self - .with_conn(move |conn| -> DatabaseResult<_> { - Ok(crate::schema::tenant_shards::table.load::(conn)?) - }) + .with_measured_conn( + DatabaseOperation::ListTenantShards, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::tenant_shards::table.load::(conn)?) + }, + ) .await?; if loaded.is_empty() { @@ -260,17 +319,20 @@ impl Persistence { shards: Vec, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - conn.transaction(|conn| -> QueryResult<()> { - for tenant in &shards { - diesel::insert_into(tenant_shards) - .values(tenant) - .execute(conn)?; - } + self.with_measured_conn( + DatabaseOperation::InsertTenantShards, + move |conn| -> DatabaseResult<()> { + conn.transaction(|conn| -> QueryResult<()> { + for tenant in &shards { + diesel::insert_into(tenant_shards) + .values(tenant) + .execute(conn)?; + } + Ok(()) + })?; Ok(()) - })?; - Ok(()) - }) + }, + ) .await } @@ -278,25 +340,31 @@ impl Persistence { /// the tenant from memory on this server. pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - diesel::delete(tenant_shards) - .filter(tenant_id.eq(del_tenant_id.to_string())) - .execute(conn)?; + self.with_measured_conn( + DatabaseOperation::DeleteTenant, + move |conn| -> DatabaseResult<()> { + diesel::delete(tenant_shards) + .filter(tenant_id.eq(del_tenant_id.to_string())) + .execute(conn)?; - Ok(()) - }) + Ok(()) + }, + ) .await } pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> { use crate::schema::nodes::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - diesel::delete(nodes) - .filter(node_id.eq(del_node_id.0 as i64)) - .execute(conn)?; + self.with_measured_conn( + DatabaseOperation::DeleteNode, + move |conn| -> DatabaseResult<()> { + diesel::delete(nodes) + .filter(node_id.eq(del_node_id.0 as i64)) + .execute(conn)?; - Ok(()) - }) + Ok(()) + }, + ) .await } @@ -310,7 +378,7 @@ impl Persistence { ) -> DatabaseResult> { use crate::schema::tenant_shards::dsl::*; let updated = self - .with_conn(move |conn| { + .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { let rows_updated = diesel::update(tenant_shards) .filter(generation_pageserver.eq(node_id.0 as i64)) .set(generation.eq(generation + 1)) @@ -360,7 +428,7 @@ impl Persistence { ) -> anyhow::Result { use crate::schema::tenant_shards::dsl::*; let updated = self - .with_conn(move |conn| { + .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| { let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) @@ -404,7 +472,7 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| { + self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { let query = diesel::update(tenant_shards) .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) @@ -445,7 +513,7 @@ impl Persistence { ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| { + self.with_measured_conn(DatabaseOperation::UpdateTenantConfig, move |conn| { diesel::update(tenant_shards) .filter(tenant_id.eq(input_tenant_id.to_string())) .set((config.eq(serde_json::to_string(&input_config).unwrap()),)) @@ -460,7 +528,7 @@ impl Persistence { pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| { + self.with_measured_conn(DatabaseOperation::Detach, move |conn| { let updated = diesel::update(tenant_shards) .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) @@ -490,7 +558,7 @@ impl Persistence { parent_to_children: Vec<(TenantShardId, Vec)>, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { + self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> { conn.transaction(|conn| -> DatabaseResult<()> { // Mark parent shards as splitting @@ -554,26 +622,29 @@ impl Persistence { old_shard_count: ShardCount, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| -> DatabaseResult<()> { - conn.transaction(|conn| -> QueryResult<()> { - // Drop parent shards - diesel::delete(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.eq(old_shard_count.literal() as i32)) - .execute(conn)?; + self.with_measured_conn( + DatabaseOperation::CompleteShardSplit, + move |conn| -> DatabaseResult<()> { + conn.transaction(|conn| -> QueryResult<()> { + // Drop parent shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(old_shard_count.literal() as i32)) + .execute(conn)?; - // Clear sharding flag - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .set((splitting.eq(0),)) - .execute(conn)?; - debug_assert!(updated > 0); + // Clear sharding flag + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .set((splitting.eq(0),)) + .execute(conn)?; + debug_assert!(updated > 0); + + Ok(()) + })?; Ok(()) - })?; - - Ok(()) - }) + }, + ) .await } @@ -585,40 +656,44 @@ impl Persistence { new_shard_count: ShardCount, ) -> DatabaseResult { use crate::schema::tenant_shards::dsl::*; - self.with_conn(move |conn| -> DatabaseResult { - let aborted = conn.transaction(|conn| -> DatabaseResult { - // Clear the splitting state on parent shards - let updated = diesel::update(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.ne(new_shard_count.literal() as i32)) - .set((splitting.eq(0),)) - .execute(conn)?; + self.with_measured_conn( + DatabaseOperation::AbortShardSplit, + move |conn| -> DatabaseResult { + let aborted = + conn.transaction(|conn| -> DatabaseResult { + // Clear the splitting state on parent shards + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.ne(new_shard_count.literal() as i32)) + .set((splitting.eq(0),)) + .execute(conn)?; - // Parent shards are already gone: we cannot abort. - if updated == 0 { - return Ok(AbortShardSplitStatus::Complete); - } + // Parent shards are already gone: we cannot abort. + if updated == 0 { + return Ok(AbortShardSplitStatus::Complete); + } - // Sanity check: if parent shards were present, their cardinality should - // be less than the number of child shards. - if updated >= new_shard_count.count() as usize { - return Err(DatabaseError::Logical(format!( - "Unexpected parent shard count {updated} while aborting split to \ + // Sanity check: if parent shards were present, their cardinality should + // be less than the number of child shards. + if updated >= new_shard_count.count() as usize { + return Err(DatabaseError::Logical(format!( + "Unexpected parent shard count {updated} while aborting split to \ count {new_shard_count:?} on tenant {split_tenant_id}" - ))); - } + ))); + } - // Erase child shards - diesel::delete(tenant_shards) - .filter(tenant_id.eq(split_tenant_id.to_string())) - .filter(shard_count.eq(new_shard_count.literal() as i32)) - .execute(conn)?; + // Erase child shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(new_shard_count.literal() as i32)) + .execute(conn)?; - Ok(AbortShardSplitStatus::Aborted) - })?; + Ok(AbortShardSplitStatus::Aborted) + })?; - Ok(aborted) - }) + Ok(aborted) + }, + ) .await } } diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs index f00f35c74b..a62357f9ac 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/control_plane/attachment_service/src/reconciler.rs @@ -1,3 +1,4 @@ +use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::service; use hyper::StatusCode; @@ -117,6 +118,15 @@ impl Reconciler { flush_ms: Option, lazy: bool, ) -> Result<(), ReconcileError> { + if !node.is_available() && config.mode == LocationConfigMode::Detached { + // Attempts to detach from offline nodes may be imitated without doing I/O: a node which is offline + // will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of + // what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`] + tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation"); + self.observed.locations.remove(&node.get_id()); + return Ok(()); + } + self.observed .locations .insert(node.get_id(), ObservedStateLocation { conf: None }); @@ -149,9 +159,16 @@ impl Reconciler { }; tracing::info!("location_config({node}) complete: {:?}", config); - self.observed - .locations - .insert(node.get_id(), ObservedStateLocation { conf: Some(config) }); + match config.mode { + LocationConfigMode::Detached => { + self.observed.locations.remove(&node.get_id()); + } + _ => { + self.observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: Some(config) }); + } + } Ok(()) } @@ -243,8 +260,11 @@ impl Reconciler { tenant_shard_id: TenantShardId, node: &Node, ) -> anyhow::Result> { - let client = - mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.service_config.jwt_token.as_deref(), + ); let timelines = client.timeline_list(&tenant_shard_id).await?; Ok(timelines diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs index e38007c7af..aa930014b2 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/control_plane/attachment_service/src/service.rs @@ -27,6 +27,7 @@ use pageserver_api::{ models::{SecondaryProgress, TenantConfigRequest}, }; +use crate::pageserver_client::PageserverClient; use pageserver_api::{ models::{ self, LocationConfig, LocationConfigListResponse, LocationConfigMode, @@ -209,6 +210,7 @@ struct ShardSplitParams { new_stripe_size: Option, targets: Vec, policy: PlacementPolicy, + config: TenantConfig, shard_ident: ShardIdentity, } @@ -551,7 +553,11 @@ impl Service { break; } - let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); match client .location_config( tenant_shard_id, @@ -1388,7 +1394,8 @@ impl Service { incremented_generations.len() ); - // Apply the updated generation to our in-memory state + // Apply the updated generation to our in-memory state, and + // gather discover secondary locations. let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); @@ -1396,62 +1403,65 @@ impl Service { tenants: Vec::new(), }; - for (tenant_shard_id, new_gen) in incremented_generations { - response.tenants.push(ReAttachResponseTenant { - id: tenant_shard_id, - gen: new_gen.into().unwrap(), - }); - // Apply the new generation number to our in-memory state - let shard_state = tenants.get_mut(&tenant_shard_id); - let Some(shard_state) = shard_state else { - // Not fatal. This edge case requires a re-attach to happen - // between inserting a new tenant shard in to the database, and updating our in-memory - // state to know about the shard, _and_ that the state inserted to the database referenced - // a pageserver. Should never happen, but handle it rather than panicking, since it should - // be harmless. - tracing::error!( - "Shard {} is in database for node {} but not in-memory state", - tenant_shard_id, - reattach_req.node_id - ); - continue; - }; + // TODO: cancel/restart any running reconciliation for this tenant, it might be trying + // to call location_conf API with an old generation. Wait for cancellation to complete + // before responding to this request. Requires well implemented CancellationToken logic + // all the way to where we call location_conf. Even then, there can still be a location_conf + // request in flight over the network: TODO handle that by making location_conf API refuse + // to go backward in generations. - // If [`Persistence::re_attach`] selected this shard, it must have alread - // had a generation set. - debug_assert!(shard_state.generation.is_some()); - let Some(old_gen) = shard_state.generation else { - // Should never happen: would only return incremented generation - // for a tenant that already had a non-null generation. - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Generation must be set while re-attaching" - ))); - }; - shard_state.generation = Some(std::cmp::max(old_gen, new_gen)); - if let Some(observed) = shard_state - .observed - .locations - .get_mut(&reattach_req.node_id) - { - if let Some(conf) = observed.conf.as_mut() { - conf.generation = new_gen.into(); + // Scan through all shards, applying updates for ones where we updated generation + // and identifying shards that intend to have a secondary location on this node. + for (tenant_shard_id, shard) in tenants { + if let Some(new_gen) = incremented_generations.get(tenant_shard_id) { + let new_gen = *new_gen; + response.tenants.push(ReAttachResponseTenant { + id: *tenant_shard_id, + gen: Some(new_gen.into().unwrap()), + // A tenant is only put into multi or stale modes in the middle of a [`Reconciler::live_migrate`] + // execution. If a pageserver is restarted during that process, then the reconcile pass will + // fail, and start from scratch, so it doesn't make sense for us to try and preserve + // the stale/multi states at this point. + mode: LocationConfigMode::AttachedSingle, + }); + + shard.generation = std::cmp::max(shard.generation, Some(new_gen)); + if let Some(observed) = shard.observed.locations.get_mut(&reattach_req.node_id) { + // Why can we update `observed` even though we're not sure our response will be received + // by the pageserver? Because the pageserver will not proceed with startup until + // it has processed response: if it loses it, we'll see another request and increment + // generation again, avoiding any uncertainty about dirtiness of tenant's state. + if let Some(conf) = observed.conf.as_mut() { + conf.generation = new_gen.into(); + } + } else { + // This node has no observed state for the shard: perhaps it was offline + // when the pageserver restarted. Insert a None, so that the Reconciler + // will be prompted to learn the location's state before it makes changes. + shard + .observed + .locations + .insert(reattach_req.node_id, ObservedStateLocation { conf: None }); } - } else { - // This node has no observed state for the shard: perhaps it was offline - // when the pageserver restarted. Insert a None, so that the Reconciler - // will be prompted to learn the location's state before it makes changes. - shard_state - .observed - .locations - .insert(reattach_req.node_id, ObservedStateLocation { conf: None }); - } + } else if shard.intent.get_secondary().contains(&reattach_req.node_id) { + // Ordering: pageserver will not accept /location_config requests until it has + // finished processing the response from re-attach. So we can update our in-memory state + // now, and be confident that we are not stamping on the result of some later location config. + // TODO: however, we are not strictly ordered wrt ReconcileResults queue, + // so we might update observed state here, and then get over-written by some racing + // ReconcileResult. The impact is low however, since we have set state on pageserver something + // that matches intent, so worst case if we race then we end up doing a spurious reconcile. - // TODO: cancel/restart any running reconciliation for this tenant, it might be trying - // to call location_conf API with an old generation. Wait for cancellation to complete - // before responding to this request. Requires well implemented CancellationToken logic - // all the way to where we call location_conf. Even then, there can still be a location_conf - // request in flight over the network: TODO handle that by making location_conf API refuse - // to go backward in generations. + response.tenants.push(ReAttachResponseTenant { + id: *tenant_shard_id, + gen: None, + mode: LocationConfigMode::Secondary, + }); + + // We must not update observed, because we have no guarantee that our + // response will be received by the pageserver. This could leave it + // falsely dirty, but the resulting reconcile should be idempotent. + } } // We consider a node Active once we have composed a re-attach response, but we @@ -2096,8 +2106,11 @@ impl Service { }) .collect::>(); for tenant_shard_id in shard_ids { - let client = - mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); @@ -2149,7 +2162,11 @@ impl Service { // Issue concurrent requests to all shards' locations let mut futs = FuturesUnordered::new(); for (tenant_shard_id, node) in targets { - let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); futs.push(async move { let result = client .tenant_secondary_download(tenant_shard_id, wait) @@ -2242,7 +2259,11 @@ impl Service { // Phase 1: delete on the pageservers let mut any_pending = false; for (tenant_shard_id, node) in targets { - let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); // TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not // surface immediately as an error to our caller. let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| { @@ -2354,7 +2375,7 @@ impl Service { tenant_shard_id, create_req.new_timeline_id, ); - let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref()); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); client .timeline_create(tenant_shard_id, &create_req) @@ -2478,7 +2499,7 @@ impl Service { "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", ); - let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref()); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); client .timeline_delete(tenant_shard_id, timeline_id) .await @@ -2519,11 +2540,11 @@ impl Service { } /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this - /// function looks it up and returns the url. If the tenant isn't found, returns Err(ApiError::NotFound) - pub(crate) fn tenant_shard0_baseurl( + /// function looks up and returns node. If the tenant isn't found, returns Err(ApiError::NotFound) + pub(crate) fn tenant_shard0_node( &self, tenant_id: TenantId, - ) -> Result<(String, TenantShardId), ApiError> { + ) -> Result<(Node, TenantShardId), ApiError> { let locked = self.inner.read().unwrap(); let Some((tenant_shard_id, shard)) = locked .tenants @@ -2555,7 +2576,7 @@ impl Service { ))); }; - Ok((node.base_url(), *tenant_shard_id)) + Ok((node.clone(), *tenant_shard_id)) } pub(crate) fn tenant_locate( @@ -2725,7 +2746,7 @@ impl Service { let detach_locations: Vec<(Node, TenantShardId)> = { let mut detach_locations = Vec::new(); let mut locked = self.inner.write().unwrap(); - let (nodes, tenants, _scheduler) = locked.parts_mut(); + let (nodes, tenants, scheduler) = locked.parts_mut(); for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(op.tenant_id)) @@ -2758,6 +2779,13 @@ impl Service { tracing::info!("Restoring parent shard {tenant_shard_id}"); shard.splitting = SplitState::Idle; + if let Err(e) = shard.schedule(scheduler) { + // If this shard can't be scheduled now (perhaps due to offline nodes or + // capacity issues), that must not prevent us rolling back a split. In this + // case it should be eventually scheduled in the background. + tracing::warn!("Failed to schedule {tenant_shard_id} during shard abort: {e}") + } + self.maybe_reconcile_shard(shard, nodes); } @@ -2849,7 +2877,7 @@ impl Service { .map(|(shard_id, _)| *shard_id) .collect::>(); - let (_nodes, tenants, scheduler) = locked.parts_mut(); + let (nodes, tenants, scheduler) = locked.parts_mut(); for parent_id in parent_ids { let child_ids = parent_id.split(new_shard_count); @@ -2916,6 +2944,8 @@ impl Service { // find a secondary (e.g. because cluster is overloaded). tracing::warn!("Failed to schedule child shard {child}: {e}"); } + // In the background, attach secondary locations for the new shards + self.maybe_reconcile_shard(&mut child_state, nodes); tenants.insert(child, child_state); response.new_shards.push(child); @@ -2980,6 +3010,7 @@ impl Service { ))); let mut policy = None; + let mut config = None; let mut shard_ident = None; // Validate input, and calculate which shards we will create let (old_shard_count, targets) = @@ -3036,6 +3067,9 @@ impl Service { if shard_ident.is_none() { shard_ident = Some(shard.shard); } + if config.is_none() { + config = Some(shard.config.clone()); + } if tenant_shard_id.shard_count.count() == split_req.new_shard_count { tracing::info!( @@ -3054,8 +3088,6 @@ impl Service { .get(&node_id) .expect("Pageservers may not be deleted while referenced"); - // TODO: if any reconciliation is currently in progress for this shard, wait for it. - targets.push(ShardSplitTarget { parent_id: *tenant_shard_id, node: node.clone(), @@ -3098,6 +3130,7 @@ impl Service { shard_ident.unwrap() }; let policy = policy.unwrap(); + let config = config.unwrap(); Ok(ShardSplitAction::Split(ShardSplitParams { old_shard_count, @@ -3105,6 +3138,7 @@ impl Service { new_stripe_size: split_req.new_stripe_size, targets, policy, + config, shard_ident, })) } @@ -3124,11 +3158,49 @@ impl Service { old_shard_count, new_shard_count, new_stripe_size, - targets, + mut targets, policy, + config, shard_ident, } = params; + // Drop any secondary locations: pageservers do not support splitting these, and in any case the + // end-state for a split tenant will usually be to have secondary locations on different nodes. + // The reconciliation calls in this block also implicitly cancel+barrier wrt any ongoing reconciliation + // at the time of split. + let waiters = { + let mut locked = self.inner.write().unwrap(); + let mut waiters = Vec::new(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + for target in &mut targets { + let Some(shard) = tenants.get_mut(&target.parent_id) else { + // Paranoia check: this shouldn't happen: we have the oplock for this tenant ID. + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Shard {} not found", + target.parent_id + ))); + }; + + if shard.intent.get_attached() != &Some(target.node.get_id()) { + // Paranoia check: this shouldn't happen: we have the oplock for this tenant ID. + return Err(ApiError::Conflict(format!( + "Shard {} unexpectedly rescheduled during split", + target.parent_id + ))); + } + + // Irrespective of PlacementPolicy, clear secondary locations from intent + shard.intent.clear_secondary(scheduler); + + // Run Reconciler to execute detach fo secondary locations. + if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + waiters.push(waiter); + } + } + waiters + }; + self.await_waiters(waiters, RECONCILE_TIMEOUT).await?; + // Before creating any new child shards in memory or on the pageservers, persist them: this // enables us to ensure that we will always be able to clean up if something goes wrong. This also // acts as the protection against two concurrent attempts to split: one of them will get a database @@ -3157,8 +3229,7 @@ impl Service { generation: None, generation_pageserver: Some(target.node.get_id().0 as i64), placement_policy: serde_json::to_string(&policy).unwrap(), - // TODO: get the config out of the map - config: serde_json::to_string(&TenantConfig::default()).unwrap(), + config: serde_json::to_string(&config).unwrap(), splitting: SplitState::Splitting, }); } @@ -3215,7 +3286,11 @@ impl Service { node, child_ids, } = target; - let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref()); + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); let response = client .tenant_shard_split( *parent_id, @@ -3343,6 +3418,11 @@ impl Service { // If we were already attached to something, demote that to a secondary if let Some(old_attached) = old_attached { if n > 0 { + // Remove other secondaries to make room for the location we'll demote + while shard.intent.get_secondary().len() >= n { + shard.intent.pop_secondary(scheduler); + } + shard.intent.push_secondary(scheduler, old_attached); } } @@ -3370,7 +3450,7 @@ impl Service { if let Some(waiter) = waiter { waiter.wait_timeout(RECONCILE_TIMEOUT).await?; } else { - tracing::warn!("Migration is a no-op"); + tracing::info!("Migration is a no-op"); } Ok(TenantShardMigrateResponse {}) diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs index 9dd368bf41..83c921dc58 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/control_plane/attachment_service/src/tenant_state.rs @@ -4,7 +4,10 @@ use std::{ time::Duration, }; -use crate::{metrics, persistence::TenantShardPersistence}; +use crate::{ + metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, + persistence::TenantShardPersistence, +}; use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::{ models::{LocationConfig, LocationConfigMode, TenantConfig}, @@ -718,7 +721,10 @@ impl TenantState { let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq, tenant_id=%reconciler.tenant_shard_id.tenant_id, shard_id=%reconciler.tenant_shard_id.shard_slug()); - metrics::RECONCILER.spawned.inc(); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_spawn + .inc(); let result_tx = result_tx.clone(); let join_handle = tokio::task::spawn( async move { @@ -736,10 +742,12 @@ impl TenantState { // TODO: wrap all remote API operations in cancellation check // as well. if reconciler.cancel.is_cancelled() { - metrics::RECONCILER - .complete - .with_label_values(&[metrics::ReconcilerMetrics::CANCEL]) - .inc(); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_complete + .inc(ReconcileCompleteLabelGroup { + status: ReconcileOutcome::Cancel, + }); return; } @@ -754,18 +762,18 @@ impl TenantState { } // Update result counter - match &result { - Ok(_) => metrics::RECONCILER - .complete - .with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]), - Err(ReconcileError::Cancel) => metrics::RECONCILER - .complete - .with_label_values(&[metrics::ReconcilerMetrics::CANCEL]), - Err(_) => metrics::RECONCILER - .complete - .with_label_values(&[metrics::ReconcilerMetrics::ERROR]), - } - .inc(); + let outcome_label = match &result { + Ok(_) => ReconcileOutcome::Success, + Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel, + Err(_) => ReconcileOutcome::Error, + }; + + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_complete + .inc(ReconcileCompleteLabelGroup { + status: outcome_label, + }); result_tx .send(ReconcileResult { diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs index 5472948091..2e88836bd0 100644 --- a/libs/pageserver_api/src/upcall_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -6,7 +6,9 @@ use serde::{Deserialize, Serialize}; use utils::id::NodeId; -use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId}; +use crate::{ + controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId, +}; /// Upcall message sent by the pageserver to the configured `control_plane_api` on /// startup. @@ -20,12 +22,20 @@ pub struct ReAttachRequest { pub register: Option, } -#[derive(Serialize, Deserialize)] -pub struct ReAttachResponseTenant { - pub id: TenantShardId, - pub gen: u32, +fn default_mode() -> LocationConfigMode { + LocationConfigMode::AttachedSingle } +#[derive(Serialize, Deserialize, Debug)] +pub struct ReAttachResponseTenant { + pub id: TenantShardId, + /// Mandatory if LocationConfigMode is None or set to an Attached* mode + pub gen: Option, + + /// Default value only for backward compat: this field should be set + #[serde(default = "default_mode")] + pub mode: LocationConfigMode, +} #[derive(Serialize, Deserialize)] pub struct ReAttachResponse { pub tenants: Vec, diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index a60971abf0..f8a5f68131 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -245,7 +245,7 @@ impl std::io::Write for ChannelWriter { } } -async fn prometheus_metrics_handler(_req: Request) -> Result, ApiError> { +pub async fn prometheus_metrics_handler(_req: Request) -> Result, ApiError> { SERVE_METRICS_COUNT.inc(); let started_at = std::time::Instant::now(); @@ -367,7 +367,6 @@ pub fn make_router() -> RouterBuilder { .middleware(Middleware::post_with_info( add_request_id_header_to_response, )) - .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .err_handler(route_error_handler) } diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index a3aee45b58..8eee1f72a6 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -245,7 +245,7 @@ impl<'a, T> Guard<'a, T> { /// /// The permit will be on a semaphore part of the new internal value, and any following /// [`OnceCell::get_or_init`] will wait on it to complete. - pub fn take_and_deinit(&mut self) -> (T, InitPermit) { + pub fn take_and_deinit(mut self) -> (T, InitPermit) { let mut swapped = Inner::default(); let sem = swapped.init_semaphore.clone(); // acquire and forget right away, moving the control over to InitPermit @@ -543,7 +543,7 @@ mod tests { target.set(42, permit); let (_answer, permit) = { - let mut guard = target + let guard = target .get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) }) .await .unwrap(); diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs index 9953b447c8..18b2af14f1 100644 --- a/libs/utils/src/vec_map.rs +++ b/libs/utils/src/vec_map.rs @@ -1,27 +1,60 @@ use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds}; +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum VecMapOrdering { + Greater, + GreaterOrEqual, +} + /// Ordered map datastructure implemented in a Vec. /// Append only - can only add keys that are larger than the /// current max key. +/// Ordering can be adjusted using [`VecMapOrdering`] +/// during `VecMap` construction. #[derive(Clone, Debug)] -pub struct VecMap(Vec<(K, V)>); +pub struct VecMap { + data: Vec<(K, V)>, + ordering: VecMapOrdering, +} impl Default for VecMap { fn default() -> Self { - VecMap(Default::default()) + VecMap { + data: Default::default(), + ordering: VecMapOrdering::Greater, + } } } -#[derive(Debug)] -pub struct InvalidKey; +#[derive(thiserror::Error, Debug)] +pub enum VecMapError { + #[error("Key violates ordering constraint")] + InvalidKey, + #[error("Mismatched ordering constraints")] + ExtendOrderingError, +} impl VecMap { + pub fn new(ordering: VecMapOrdering) -> Self { + Self { + data: Vec::new(), + ordering, + } + } + + pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self { + Self { + data: Vec::with_capacity(capacity), + ordering, + } + } + pub fn is_empty(&self) -> bool { - self.0.is_empty() + self.data.is_empty() } pub fn as_slice(&self) -> &[(K, V)] { - self.0.as_slice() + self.data.as_slice() } /// This function may panic if given a range where the lower bound is @@ -29,7 +62,7 @@ impl VecMap { pub fn slice_range>(&self, range: R) -> &[(K, V)] { use std::ops::Bound::*; - let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key); + let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key); let start_idx = match range.start_bound() { Unbounded => 0, @@ -41,7 +74,7 @@ impl VecMap { }; let end_idx = match range.end_bound() { - Unbounded => self.0.len(), + Unbounded => self.data.len(), Included(k) => match binary_search(k) { Ok(idx) => idx + 1, Err(idx) => idx, @@ -49,34 +82,30 @@ impl VecMap { Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity), }; - &self.0[start_idx..end_idx] + &self.data[start_idx..end_idx] } /// Add a key value pair to the map. - /// If `key` is less than or equal to the current maximum key - /// the pair will not be added and InvalidKey error will be returned. - pub fn append(&mut self, key: K, value: V) -> Result { - if let Some((last_key, _last_value)) = self.0.last() { - if &key <= last_key { - return Err(InvalidKey); - } - } + /// If `key` is not respective of the `self` ordering the + /// pair will not be added and `InvalidKey` error will be returned. + pub fn append(&mut self, key: K, value: V) -> Result { + self.validate_key_order(&key)?; let delta_size = self.instrument_vec_op(|vec| vec.push((key, value))); Ok(delta_size) } /// Update the maximum key value pair or add a new key value pair to the map. - /// If `key` is less than the current maximum key no updates or additions - /// will occur and InvalidKey error will be returned. + /// If `key` is not respective of the `self` ordering no updates or additions + /// will occur and `InvalidKey` error will be returned. pub fn append_or_update_last( &mut self, key: K, mut value: V, - ) -> Result<(Option, usize), InvalidKey> { - if let Some((last_key, last_value)) = self.0.last_mut() { + ) -> Result<(Option, usize), VecMapError> { + if let Some((last_key, last_value)) = self.data.last_mut() { match key.cmp(last_key) { - Ordering::Less => return Err(InvalidKey), + Ordering::Less => return Err(VecMapError::InvalidKey), Ordering::Equal => { std::mem::swap(last_value, &mut value); const DELTA_SIZE: usize = 0; @@ -100,40 +129,67 @@ impl VecMap { V: Clone, { let split_idx = self - .0 + .data .binary_search_by_key(&cutoff, extract_key) .unwrap_or_else(std::convert::identity); ( - VecMap(self.0[..split_idx].to_vec()), - VecMap(self.0[split_idx..].to_vec()), + VecMap { + data: self.data[..split_idx].to_vec(), + ordering: self.ordering, + }, + VecMap { + data: self.data[split_idx..].to_vec(), + ordering: self.ordering, + }, ) } /// Move items from `other` to the end of `self`, leaving `other` empty. - /// If any keys in `other` is less than or equal to any key in `self`, - /// `InvalidKey` error will be returned and no mutation will occur. - pub fn extend(&mut self, other: &mut Self) -> Result { - let self_last_opt = self.0.last().map(extract_key); - let other_first_opt = other.0.last().map(extract_key); + /// If the `other` ordering is different from `self` ordering + /// `ExtendOrderingError` error will be returned. + /// If any keys in `other` is not respective of the ordering defined in + /// `self`, `InvalidKey` error will be returned and no mutation will occur. + pub fn extend(&mut self, other: &mut Self) -> Result { + if self.ordering != other.ordering { + return Err(VecMapError::ExtendOrderingError); + } - if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) { - if self_last >= other_first { - return Err(InvalidKey); + let other_first_opt = other.data.last().map(extract_key); + if let Some(other_first) = other_first_opt { + self.validate_key_order(other_first)?; + } + + let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data)); + Ok(delta_size) + } + + /// Validate the current last key in `self` and key being + /// inserted against the order defined in `self`. + fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> { + if let Some(last_key) = self.data.last().map(extract_key) { + match (&self.ordering, &key.cmp(last_key)) { + (VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => { + return Err(VecMapError::InvalidKey); + } + (VecMapOrdering::Greater, Ordering::Greater) => {} + (VecMapOrdering::GreaterOrEqual, Ordering::Less) => { + return Err(VecMapError::InvalidKey); + } + (VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {} } } - let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.0)); - Ok(delta_size) + Ok(()) } /// Instrument an operation on the underlying [`Vec`]. /// Will panic if the operation decreases capacity. /// Returns the increase in memory usage caused by the op. fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize { - let old_cap = self.0.capacity(); - op(&mut self.0); - let new_cap = self.0.capacity(); + let old_cap = self.data.capacity(); + op(&mut self.data); + let new_cap = self.data.capacity(); match old_cap.cmp(&new_cap) { Ordering::Less => { @@ -145,6 +201,36 @@ impl VecMap { Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"), } } + + /// Similar to `from_iter` defined in `FromIter` trait except + /// that it accepts an [`VecMapOrdering`] + pub fn from_iter>(iter: I, ordering: VecMapOrdering) -> Self { + let iter = iter.into_iter(); + let initial_capacity = { + match iter.size_hint() { + (lower_bound, None) => lower_bound, + (_, Some(upper_bound)) => upper_bound, + } + }; + + let mut vec_map = VecMap::with_capacity(initial_capacity, ordering); + for (key, value) in iter { + vec_map + .append(key, value) + .expect("The passed collection needs to be sorted!"); + } + + vec_map + } +} + +impl IntoIterator for VecMap { + type Item = (K, V); + type IntoIter = std::vec::IntoIter<(K, V)>; + + fn into_iter(self) -> Self::IntoIter { + self.data.into_iter() + } } fn extract_key(entry: &(K, V)) -> &K { @@ -155,7 +241,7 @@ fn extract_key(entry: &(K, V)) -> &K { mod tests { use std::{collections::BTreeMap, ops::Bound}; - use super::VecMap; + use super::{VecMap, VecMapOrdering}; #[test] fn unbounded_range() { @@ -310,5 +396,59 @@ mod tests { left.extend(&mut one_map).unwrap_err(); assert_eq!(left.as_slice(), &[(0, ()), (1, ())]); assert_eq!(one_map.as_slice(), &[(1, ())]); + + let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual); + map_greater_or_equal.append(2, ()).unwrap(); + map_greater_or_equal.append(2, ()).unwrap(); + + left.extend(&mut map_greater_or_equal).unwrap_err(); + assert_eq!(left.as_slice(), &[(0, ()), (1, ())]); + assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]); + } + + #[test] + fn extend_with_ordering() { + let mut left = VecMap::new(VecMapOrdering::GreaterOrEqual); + left.append(0, ()).unwrap(); + assert_eq!(left.as_slice(), &[(0, ())]); + + let mut greater_right = VecMap::new(VecMapOrdering::Greater); + greater_right.append(0, ()).unwrap(); + left.extend(&mut greater_right).unwrap_err(); + assert_eq!(left.as_slice(), &[(0, ())]); + + let mut greater_or_equal_right = VecMap::new(VecMapOrdering::GreaterOrEqual); + greater_or_equal_right.append(2, ()).unwrap(); + greater_or_equal_right.append(2, ()).unwrap(); + left.extend(&mut greater_or_equal_right).unwrap(); + assert_eq!(left.as_slice(), &[(0, ()), (2, ()), (2, ())]); + } + + #[test] + fn vec_map_from_sorted() { + let vec = vec![(1, ()), (2, ()), (3, ()), (6, ())]; + let vec_map = VecMap::from_iter(vec, VecMapOrdering::Greater); + assert_eq!(vec_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (6, ())]); + + let vec = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())]; + let vec_map = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual); + assert_eq!( + vec_map.as_slice(), + &[(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())] + ); + } + + #[test] + #[should_panic] + fn vec_map_from_unsorted_greater() { + let vec = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())]; + let _ = VecMap::from_iter(vec, VecMapOrdering::Greater); + } + + #[test] + #[should_panic] + fn vec_map_from_unsorted_greater_or_equal() { + let vec = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())]; + let _ = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual); } } diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 1b3d76335d..42c800822b 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -5,7 +5,8 @@ use pageserver_api::{ controller_api::NodeRegisterRequest, shard::TenantShardId, upcall_api::{ - ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse, + ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, + ValidateRequestTenant, ValidateResponse, }, }; use serde::{de::DeserializeOwned, Serialize}; @@ -37,7 +38,9 @@ pub trait ControlPlaneGenerationsApi { fn re_attach( &self, conf: &PageServerConf, - ) -> impl Future, RetryForeverError>> + Send; + ) -> impl Future< + Output = Result, RetryForeverError>, + > + Send; fn validate( &self, tenants: Vec<(TenantShardId, Generation)>, @@ -118,7 +121,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { async fn re_attach( &self, conf: &PageServerConf, - ) -> Result, RetryForeverError> { + ) -> Result, RetryForeverError> { let re_attach_path = self .base_url .join("re-attach") @@ -181,7 +184,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { Ok(response .tenants .into_iter() - .map(|t| (t.id, Generation::new(t.gen))) + .map(|rart| (rart.id, rart)) .collect::>()) } diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index b6aea8fae8..e3c11cb299 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -724,8 +724,8 @@ impl DeletionQueue { mod test { use camino::Utf8Path; use hex_literal::hex; - use pageserver_api::shard::ShardIndex; - use std::io::ErrorKind; + use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant}; + use std::{io::ErrorKind, time::Duration}; use tracing::info; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; @@ -834,9 +834,10 @@ mod test { async fn re_attach( &self, _conf: &PageServerConf, - ) -> Result, RetryForeverError> { + ) -> Result, RetryForeverError> { unimplemented!() } + async fn validate( &self, tenants: Vec<(TenantShardId, Generation)>, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 229f3ae98f..26f23fb8c2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -36,6 +36,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; +use utils::http::endpoint::prometheus_metrics_handler; use utils::http::endpoint::request_span; use utils::http::json::json_request_or_empty_body; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; @@ -2266,6 +2267,7 @@ pub fn make_router( Ok(router .data(state) + .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .get("/v1/status", |r| api_handler(r, status_handler)) .put("/v1/failpoints", |r| { testing_api_handler("manage failpoints", r, failpoints_handler) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 727650a5a5..6f7d74bdee 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -34,6 +34,7 @@ use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; use tracing::{debug, trace, warn}; use utils::bin_ser::DeserializeError; +use utils::vec_map::{VecMap, VecMapOrdering}; use utils::{bin_ser::BeSer, lsn::Lsn}; const MAX_AUX_FILE_DELTAS: usize = 1024; @@ -1546,12 +1547,13 @@ impl<'a> DatadirModification<'a> { if !self.pending_updates.is_empty() { // The put_batch call below expects expects the inputs to be sorted by Lsn, // so we do that first. - let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = self - .pending_updates - .drain() - .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val))) - .kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0) - .collect(); + let lsn_ordered_batch: VecMap = VecMap::from_iter( + self.pending_updates + .drain() + .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val)))) + .kmerge_by(|lhs, rhs| lhs.0 < rhs.0), + VecMapOrdering::GreaterOrEqual, + ); writer.put_batch(lsn_ordered_batch, ctx).await?; } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7a6ddd6a4e..fe48741a89 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -202,6 +202,13 @@ pub(super) struct AttachedTenantConf { } impl AttachedTenantConf { + fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self { + Self { + tenant_conf, + location, + } + } + fn try_from(location_conf: LocationConf) -> anyhow::Result { match &location_conf.mode { LocationMode::Attached(attach_conf) => Ok(Self { @@ -678,9 +685,20 @@ impl Tenant { } // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state. + enum BrokenVerbosity { + Error, + Info + } let make_broken = - |t: &Tenant, err: anyhow::Error| { - error!("attach failed, setting tenant state to Broken: {err:?}"); + |t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| { + match verbosity { + BrokenVerbosity::Info => { + info!("attach cancelled, setting tenant state to Broken: {err}"); + }, + BrokenVerbosity::Error => { + error!("attach failed, setting tenant state to Broken: {err:?}"); + } + } t.state.send_modify(|state| { // The Stopping case is for when we have passed control on to DeleteTenantFlow: // if it errors, we will call make_broken when tenant is already in Stopping. @@ -744,7 +762,7 @@ impl Tenant { // Make the tenant broken so that set_stopping will not hang waiting for it to leave // the Attaching state. This is an over-reaction (nothing really broke, the tenant is // just shutting down), but ensures progress. - make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching")); + make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info); return Ok(()); }, ) @@ -766,7 +784,7 @@ impl Tenant { match res { Ok(p) => Some(p), Err(e) => { - make_broken(&tenant_clone, anyhow::anyhow!(e)); + make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); return Ok(()); } } @@ -790,7 +808,7 @@ impl Tenant { { Ok(should_resume_deletion) => should_resume_deletion, Err(err) => { - make_broken(&tenant_clone, anyhow::anyhow!(err)); + make_broken(&tenant_clone, anyhow::anyhow!(err), BrokenVerbosity::Error); return Ok(()); } } @@ -820,7 +838,7 @@ impl Tenant { .await; if let Err(e) = deleted { - make_broken(&tenant_clone, anyhow::anyhow!(e)); + make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); } return Ok(()); @@ -841,7 +859,7 @@ impl Tenant { tenant_clone.activate(broker_client, None, &ctx); } Err(e) => { - make_broken(&tenant_clone, anyhow::anyhow!(e)); + make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 57fc444cdd..53a8c97e23 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -196,16 +196,17 @@ impl LocationConf { /// For use when attaching/re-attaching: update the generation stored in this /// structure. If we were in a secondary state, promote to attached (posession /// of a fresh generation implies this). - pub(crate) fn attach_in_generation(&mut self, generation: Generation) { + pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) { match &mut self.mode { LocationMode::Attached(attach_conf) => { attach_conf.generation = generation; + attach_conf.attach_mode = mode; } LocationMode::Secondary(_) => { // We are promoted to attached by the control plane's re-attach response self.mode = LocationMode::Attached(AttachedLocationConfig { generation, - attach_mode: AttachmentMode::Single, + attach_mode: mode, }) } } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index f456ca3006..97a505ded9 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2,13 +2,13 @@ //! page server. use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; -use futures::stream::StreamExt; use itertools::Itertools; use pageserver_api::key::Key; -use pageserver_api::models::ShardParameters; +use pageserver_api::models::{LocationConfigMode, ShardParameters}; use pageserver_api::shard::{ ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, }; +use pageserver_api::upcall_api::ReAttachResponseTenant; use rand::{distributions::Alphanumeric, Rng}; use std::borrow::Cow; use std::cmp::Ordering; @@ -125,6 +125,46 @@ pub(crate) enum ShardSelector { Page(Key), } +/// A convenience for use with the re_attach ControlPlaneClient function: rather +/// than the serializable struct, we build this enum that encapsulates +/// the invariant that attached tenants always have generations. +/// +/// This represents the subset of a LocationConfig that we receive during re-attach. +pub(crate) enum TenantStartupMode { + Attached((AttachmentMode, Generation)), + Secondary, +} + +impl TenantStartupMode { + /// Return the generation & mode that should be used when starting + /// this tenant. + /// + /// If this returns None, the re-attach struct is in an invalid state and + /// should be ignored in the response. + fn from_reattach_tenant(rart: ReAttachResponseTenant) -> Option { + match (rart.mode, rart.gen) { + (LocationConfigMode::Detached, _) => None, + (LocationConfigMode::Secondary, _) => Some(Self::Secondary), + (LocationConfigMode::AttachedMulti, Some(g)) => { + Some(Self::Attached((AttachmentMode::Multi, Generation::new(g)))) + } + (LocationConfigMode::AttachedSingle, Some(g)) => { + Some(Self::Attached((AttachmentMode::Single, Generation::new(g)))) + } + (LocationConfigMode::AttachedStale, Some(g)) => { + Some(Self::Attached((AttachmentMode::Stale, Generation::new(g)))) + } + _ => { + tracing::warn!( + "Received invalid re-attach state for tenant {}: {rart:?}", + rart.id + ); + None + } + } + } +} + impl TenantsMap { /// Convenience function for typical usage, where we want to get a `Tenant` object, for /// working with attached tenants. If the TenantId is in the map but in Secondary state, @@ -271,7 +311,7 @@ pub struct TenantManager { fn emergency_generations( tenant_confs: &HashMap>, -) -> HashMap { +) -> HashMap { tenant_confs .iter() .filter_map(|(tid, lc)| { @@ -279,12 +319,15 @@ fn emergency_generations( Ok(lc) => lc, Err(_) => return None, }; - let gen = match &lc.mode { - LocationMode::Attached(alc) => Some(alc.generation), - LocationMode::Secondary(_) => None, - }; - - gen.map(|g| (*tid, g)) + Some(( + *tid, + match &lc.mode { + LocationMode::Attached(alc) => { + TenantStartupMode::Attached((alc.attach_mode, alc.generation)) + } + LocationMode::Secondary(_) => TenantStartupMode::Secondary, + }, + )) }) .collect() } @@ -294,7 +337,7 @@ async fn init_load_generations( tenant_confs: &HashMap>, resources: &TenantSharedResources, cancel: &CancellationToken, -) -> anyhow::Result>> { +) -> anyhow::Result>> { let generations = if conf.control_plane_emergency_mode { error!( "Emergency mode! Tenants will be attached unsafely using their last known generation" @@ -304,7 +347,12 @@ async fn init_load_generations( info!("Calling control plane API to re-attach tenants"); // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. match client.re_attach(conf).await { - Ok(tenants) => tenants, + Ok(tenants) => tenants + .into_iter() + .flat_map(|(id, rart)| { + TenantStartupMode::from_reattach_tenant(rart).map(|tsm| (id, tsm)) + }) + .collect(), Err(RetryForeverError::ShuttingDown) => { anyhow::bail!("Shut down while waiting for control plane re-attach response") } @@ -322,9 +370,17 @@ async fn init_load_generations( // Must only do this if remote storage is enabled, otherwise deletion queue // is not running and channel push will fail. if resources.remote_storage.is_some() { - resources - .deletion_queue_client - .recover(generations.clone())?; + let attached_tenants = generations + .iter() + .flat_map(|(id, start_mode)| { + match start_mode { + TenantStartupMode::Attached((_mode, generation)) => Some(generation), + TenantStartupMode::Secondary => None, + } + .map(|gen| (*id, *gen)) + }) + .collect(); + resources.deletion_queue_client.recover(attached_tenants)?; } Ok(Some(generations)) @@ -490,9 +546,8 @@ pub async fn init_tenant_mgr( // Scan local filesystem for attached tenants let tenant_configs = init_load_tenant_configs(conf).await?; - // Determine which tenants are to be attached - let tenant_generations = - init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; + // Determine which tenants are to be secondary or attached, and in which generation + let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; tracing::info!( "Attaching {} tenants at startup, warming up {} at a time", @@ -522,97 +577,102 @@ pub async fn init_tenant_mgr( } }; - let generation = if let Some(generations) = &tenant_generations { + // FIXME: if we were attached, and get demoted to secondary on re-attach, we + // don't have a place to get a config. + // (https://github.com/neondatabase/neon/issues/5377) + const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig = + SecondaryLocationConfig { warm: true }; + + // Update the location config according to the re-attach response + if let Some(tenant_modes) = &tenant_modes { // We have a generation map: treat it as the authority for whether // this tenant is really attached. - if let Some(gen) = generations.get(&tenant_shard_id) { - if let LocationMode::Attached(attached) = &location_conf.mode { - if attached.generation > *gen { + match tenant_modes.get(&tenant_shard_id) { + None => { + info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response"); + if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await { + error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}", + ); + } + + // We deleted local content: move on to next tenant, don't try and spawn this one. + continue; + } + Some(TenantStartupMode::Secondary) => { + if !matches!(location_conf.mode, LocationMode::Secondary(_)) { + location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF); + } + } + Some(TenantStartupMode::Attached((attach_mode, generation))) => { + let old_gen_higher = match &location_conf.mode { + LocationMode::Attached(AttachedLocationConfig { + generation: old_generation, + attach_mode: _attach_mode, + }) => { + if old_generation > generation { + Some(old_generation) + } else { + None + } + } + _ => None, + }; + if let Some(old_generation) = old_gen_higher { tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary", - attached.generation + "Control plane gave decreasing generation ({generation:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary", + old_generation ); // We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away // local disk content: demote to secondary rather than detaching. - tenants.insert( - tenant_shard_id, - TenantSlot::Secondary(SecondaryTenant::new( - tenant_shard_id, - location_conf.shard, - location_conf.tenant_conf.clone(), - &SecondaryLocationConfig { warm: false }, - )), - ); + location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF); + } else { + location_conf.attach_in_generation(*attach_mode, *generation); } } - *gen - } else { - match &location_conf.mode { - LocationMode::Secondary(secondary_config) => { - // We do not require the control plane's permission for secondary mode - // tenants, because they do no remote writes and hence require no - // generation number - info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode"); - tenants.insert( - tenant_shard_id, - TenantSlot::Secondary(SecondaryTenant::new( - tenant_shard_id, - location_conf.shard, - location_conf.tenant_conf, - secondary_config, - )), - ); - } - LocationMode::Attached(_) => { - // TODO: augment re-attach API to enable the control plane to - // instruct us about secondary attachments. That way, instead of throwing - // away local state, we can gracefully fall back to secondary here, if the control - // plane tells us so. - // (https://github.com/neondatabase/neon/issues/5377) - info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response"); - if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await { - error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}", - ); - } - } - }; - - continue; } } else { // Legacy mode: no generation information, any tenant present // on local disk may activate info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",); - Generation::none() }; // Presence of a generation number implies attachment: attach the tenant // if it wasn't already, and apply the generation number. - location_conf.attach_in_generation(generation); Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; let shard_identity = location_conf.shard; - match tenant_spawn( - conf, - tenant_shard_id, - &tenant_dir_path, - resources.clone(), - AttachedTenantConf::try_from(location_conf)?, - shard_identity, - Some(init_order.clone()), - &TENANTS, - SpawnMode::Lazy, - &ctx, - ) { - Ok(tenant) => { - tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant)); + let slot = match location_conf.mode { + LocationMode::Attached(attached_conf) => { + match tenant_spawn( + conf, + tenant_shard_id, + &tenant_dir_path, + resources.clone(), + AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), + shard_identity, + Some(init_order.clone()), + &TENANTS, + SpawnMode::Lazy, + &ctx, + ) { + Ok(tenant) => TenantSlot::Attached(tenant), + Err(e) => { + error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}"); + continue; + } + } } - Err(e) => { - error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}"); - } - } + LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new( + tenant_shard_id, + shard_identity, + location_conf.tenant_conf, + &secondary_conf, + )), + }; + + tenants.insert(tenant_shard_id, slot); } info!("Processed {} local tenants at startup", tenants.len()); @@ -1662,9 +1722,9 @@ impl TenantManager { .layers .read() .await - .resident_layers() - .collect::>() - .await; + .likely_resident_layers() + .collect::>(); + for layer in timeline_layers { let relative_path = layer .local_path() @@ -2143,7 +2203,7 @@ pub(crate) async fn load_tenant( let mut location_conf = Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?; - location_conf.attach_in_generation(generation); + location_conf.attach_in_generation(AttachmentMode::Single, generation); Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 82af7ed83b..40f19e3b05 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -11,6 +11,7 @@ use crate::{ disk_usage_eviction_task::{ finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, }, + is_temporary, metrics::SECONDARY_MODE, tenant::{ config::SecondaryLocationConfig, @@ -961,7 +962,10 @@ async fn init_timeline_state( // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant. warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config"); continue; - } else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) { + } else if crate::is_temporary(&file_path) + || is_temp_download_file(&file_path) + || is_temporary(&file_path) + { // Temporary files are frequently left behind from restarting during downloads tracing::info!("Cleaning up temporary file {file_path}"); if let Err(e) = tokio::fs::remove_file(&file_path) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index eed423c3e6..8ba37b5a86 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -32,6 +32,9 @@ use utils::generation::Generation; #[cfg(test)] mod tests; +#[cfg(test)] +mod failpoints; + /// A Layer contains all data in a "rectangle" consisting of a range of keys and /// range of LSNs. /// @@ -46,7 +49,41 @@ mod tests; /// An image layer is a snapshot of all the data in a key-range, at a single /// LSN. /// -/// This type models the on-disk layers, which can be evicted and on-demand downloaded. +/// This type models the on-disk layers, which can be evicted and on-demand downloaded. As a +/// general goal, read accesses should always win eviction and eviction should not wait for +/// download. +/// +/// ### State transitions +/// +/// The internal state of `Layer` is composed of most importantly the on-filesystem state and the +/// [`ResidentOrWantedEvicted`] enum. On-filesystem state can be either present (fully downloaded, +/// right size) or deleted. +/// +/// Reads will always win requests to evict until `wait_for_turn_and_evict` has acquired the +/// `heavier_once_cell::InitPermit` and has started to `evict_blocking`. Before the +/// `heavier_once_cell::InitPermit` has been acquired, any read request +/// (`get_or_maybe_download`) can "re-initialize" using the existing downloaded file and thus +/// cancelling the eviction. +/// +/// ```text +/// +-----------------+ get_or_maybe_download +--------------------------------+ +/// | not initialized |--------------------------->| Resident(Arc) | +/// | ENOENT | /->| | +/// +-----------------+ | +--------------------------------+ +/// ^ | | ^ +/// | get_or_maybe_download | | | get_or_maybe_download, either: +/// evict_blocking | /-------------------------/ | | - upgrade weak to strong +/// | | | | - re-initialize without download +/// | | evict_and_wait | | +/// +-----------------+ v | +/// | not initialized | on_downloaded_layer_drop +--------------------------------------+ +/// | file is present |<---------------------------| WantedEvicted(Weak) | +/// +-----------------+ +--------------------------------------+ +/// ``` +/// +/// ### Unsupported +/// +/// - Evicting by the operator deleting files from the filesystem /// /// [`InMemoryLayer`]: super::inmemory_layer::InMemoryLayer #[derive(Clone)] @@ -211,8 +248,7 @@ impl Layer { /// /// Timeout is mandatory, because waiting for eviction is only needed for our tests; eviction /// will happen regardless the future returned by this method completing unless there is a - /// read access (currently including [`Layer::keep_resident`]) before eviction gets to - /// complete. + /// read access before eviction gets to complete. /// /// Technically cancellation safe, but cancelling might shift the viewpoint of what generation /// of download-evict cycle on retry. @@ -307,21 +343,28 @@ impl Layer { /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction /// while the guard exists. /// - /// Returns None if the layer is currently evicted. - pub(crate) async fn keep_resident(&self) -> anyhow::Result> { - let downloaded = match self.0.get_or_maybe_download(false, None).await { - Ok(d) => d, - // technically there are a lot of possible errors, but in practice it should only be - // DownloadRequired which is tripped up. could work to improve this situation - // statically later. - Err(DownloadError::DownloadRequired) => return Ok(None), - Err(e) => return Err(e.into()), - }; + /// Returns None if the layer is currently evicted or becoming evicted. + #[cfg(test)] + pub(crate) async fn keep_resident(&self) -> Option { + let downloaded = self.0.inner.get().and_then(|rowe| rowe.get())?; - Ok(Some(ResidentLayer { + Some(ResidentLayer { downloaded, owner: self.clone(), - })) + }) + } + + /// Weak indicator of is the layer resident or not. Good enough for eviction, which can deal + /// with `EvictionError::NotFound`. + /// + /// Returns `true` if this layer might be resident, or `false`, if it most likely evicted or + /// will be unless a read happens soon. + pub(crate) fn is_likely_resident(&self) -> bool { + self.0 + .inner + .get() + .map(|rowe| rowe.is_likely_resident()) + .unwrap_or(false) } /// Downloads if necessary and creates a guard, which will keep this layer from being evicted. @@ -371,11 +414,11 @@ impl Layer { /// separatedly. #[cfg(any(feature = "testing", test))] pub(crate) fn wait_drop(&self) -> impl std::future::Future + 'static { - let mut rx = self.0.status.subscribe(); + let mut rx = self.0.status.as_ref().unwrap().subscribe(); async move { loop { - if let Err(tokio::sync::broadcast::error::RecvError::Closed) = rx.recv().await { + if rx.changed().await.is_err() { break; } } @@ -397,6 +440,32 @@ enum ResidentOrWantedEvicted { } impl ResidentOrWantedEvicted { + /// Non-mutating access to the a DownloadedLayer, if possible. + /// + /// This is not used on the read path (anything that calls + /// [`LayerInner::get_or_maybe_download`]) because it was decided that reads always win + /// evictions, and part of that winning is using [`ResidentOrWantedEvicted::get_and_upgrade`]. + #[cfg(test)] + fn get(&self) -> Option> { + match self { + ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()), + ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.upgrade(), + } + } + + /// Best-effort query for residency right now, not as strong guarantee as receiving a strong + /// reference from `ResidentOrWantedEvicted::get`. + fn is_likely_resident(&self) -> bool { + match self { + ResidentOrWantedEvicted::Resident(_) => true, + ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.strong_count() > 0, + } + } + + /// Upgrades any weak to strong if possible. + /// + /// Returns a strong reference if possible, along with a boolean telling if an upgrade + /// happened. fn get_and_upgrade(&mut self) -> Option<(Arc, bool)> { match self { ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)), @@ -417,7 +486,7 @@ impl ResidentOrWantedEvicted { /// /// Returns `Some` if this was the first time eviction was requested. Care should be taken to /// drop the possibly last strong reference outside of the mutex of - /// heavier_once_cell::OnceCell. + /// [`heavier_once_cell::OnceCell`]. fn downgrade(&mut self) -> Option> { match self { ResidentOrWantedEvicted::Resident(strong) => { @@ -445,6 +514,9 @@ struct LayerInner { desc: PersistentLayerDesc, /// Timeline access is needed for remote timeline client and metrics. + /// + /// There should not be an access to timeline for any reason without entering the + /// [`Timeline::gate`] at the same time. timeline: Weak, /// Cached knowledge of [`Timeline::remote_client`] being `Some`. @@ -453,27 +525,38 @@ struct LayerInner { access_stats: LayerAccessStats, /// This custom OnceCell is backed by std mutex, but only held for short time periods. - /// Initialization and deinitialization are done while holding a permit. + /// + /// Filesystem changes (download, evict) are only done while holding a permit which the + /// `heavier_once_cell` provides. + /// + /// A number of fields in `Layer` are meant to only be updated when holding the InitPermit, but + /// possibly read while not holding it. inner: heavier_once_cell::OnceCell, /// Do we want to delete locally and remotely this when `LayerInner` is dropped wanted_deleted: AtomicBool, - /// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses - /// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger - /// [`LayerInner::on_downloaded_layer_drop`]. - wanted_evicted: AtomicBool, - - /// Version is to make sure we will only evict a specific download of a file. + /// Version is to make sure we will only evict a specific initialization of the downloaded file. /// - /// Incremented for each download, stored in `DownloadedLayer::version` or + /// Incremented for each initialization, stored in `DownloadedLayer::version` or /// `ResidentOrWantedEvicted::WantedEvicted`. version: AtomicUsize, - /// Allow subscribing to when the layer actually gets evicted. - status: tokio::sync::broadcast::Sender, + /// Allow subscribing to when the layer actually gets evicted, a non-cancellable download + /// starts, or completes. + /// + /// Updates must only be posted while holding the InitPermit or the heavier_once_cell::Guard. + /// Holding the InitPermit is the only time we can do state transitions, but we also need to + /// cancel a pending eviction on upgrading a [`ResidentOrWantedEvicted::WantedEvicted`] back to + /// [`ResidentOrWantedEvicted::Resident`] on access. + /// + /// The sender is wrapped in an Option to facilitate moving it out on [`LayerInner::drop`]. + status: Option>, - /// Counter for exponential backoff with the download + /// Counter for exponential backoff with the download. + /// + /// This is atomic only for the purposes of having additional data only accessed while holding + /// the InitPermit. consecutive_failures: AtomicUsize, /// The generation of this Layer. @@ -491,7 +574,13 @@ struct LayerInner { /// a shard split since the layer was originally written. shard: ShardIndex, + /// When the Layer was last evicted but has not been downloaded since. + /// + /// This is used solely for updating metrics. See [`LayerImplMetrics::redownload_after`]. last_evicted_at: std::sync::Mutex>, + + #[cfg(test)] + failpoints: std::sync::Mutex>, } impl std::fmt::Display for LayerInner { @@ -508,16 +597,16 @@ impl AsLayerDesc for LayerInner { #[derive(Debug, Clone, Copy)] enum Status { + Resident, Evicted, - Downloaded, + Downloading, } impl Drop for LayerInner { fn drop(&mut self) { if !*self.wanted_deleted.get_mut() { - // should we try to evict if the last wish was for eviction? - // feels like there's some hazard of overcrowding near shutdown near by, but we don't - // run drops during shutdown (yet) + // should we try to evict if the last wish was for eviction? seems more like a hazard + // than a clear win. return; } @@ -528,9 +617,9 @@ impl Drop for LayerInner { let file_size = self.layer_desc().file_size; let timeline = self.timeline.clone(); let meta = self.metadata(); - let status = self.status.clone(); + let status = self.status.take(); - crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || { + Self::spawn_blocking(move || { let _g = span.entered(); // carry this until we are finished for [`Layer::wait_drop`] support @@ -605,12 +694,16 @@ impl LayerInner { .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id) .join(desc.filename().to_string()); - let (inner, version) = if let Some(inner) = downloaded { + let (inner, version, init_status) = if let Some(inner) = downloaded { let version = inner.version; let resident = ResidentOrWantedEvicted::Resident(inner); - (heavier_once_cell::OnceCell::new(resident), version) + ( + heavier_once_cell::OnceCell::new(resident), + version, + Status::Resident, + ) } else { - (heavier_once_cell::OnceCell::default(), 0) + (heavier_once_cell::OnceCell::default(), 0, Status::Evicted) }; LayerInner { @@ -621,14 +714,15 @@ impl LayerInner { have_remote_client: timeline.remote_client.is_some(), access_stats, wanted_deleted: AtomicBool::new(false), - wanted_evicted: AtomicBool::new(false), inner, version: AtomicUsize::new(version), - status: tokio::sync::broadcast::channel(1).0, + status: Some(tokio::sync::watch::channel(init_status).0), consecutive_failures: AtomicUsize::new(0), generation, shard, last_evicted_at: std::sync::Mutex::default(), + #[cfg(test)] + failpoints: Default::default(), } } @@ -644,20 +738,34 @@ impl LayerInner { /// Cancellation safe, however dropping the future and calling this method again might result /// in a new attempt to evict OR join the previously started attempt. + #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret, err(level = tracing::Level::DEBUG), fields(layer=%self))] pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> { - use tokio::sync::broadcast::error::RecvError; - assert!(self.have_remote_client); - let mut rx = self.status.subscribe(); + let mut rx = self.status.as_ref().unwrap().subscribe(); + + { + let current = rx.borrow_and_update(); + match &*current { + Status::Resident => { + // we might get lucky and evict this; continue + } + Status::Evicted | Status::Downloading => { + // it is already evicted + return Err(EvictionError::NotFound); + } + } + } let strong = { match self.inner.get() { - Some(mut either) => { - self.wanted_evicted.store(true, Ordering::Relaxed); - either.downgrade() + Some(mut either) => either.downgrade(), + None => { + // we already have a scheduled eviction, which just has not gotten to run yet. + // it might still race with a read access, but that could also get cancelled, + // so let's say this is not evictable. + return Err(EvictionError::NotFound); } - None => return Err(EvictionError::NotFound), } }; @@ -673,26 +781,26 @@ impl LayerInner { LAYER_IMPL_METRICS.inc_started_evictions(); } - match tokio::time::timeout(timeout, rx.recv()).await { - Ok(Ok(Status::Evicted)) => Ok(()), - Ok(Ok(Status::Downloaded)) => Err(EvictionError::Downloaded), - Ok(Err(RecvError::Closed)) => { - unreachable!("sender cannot be dropped while we are in &self method") - } - Ok(Err(RecvError::Lagged(_))) => { - // this is quite unlikely, but we are blocking a lot in the async context, so - // we might be missing this because we are stuck on a LIFO slot on a thread - // which is busy blocking for a 1TB database create_image_layers. - // - // use however late (compared to the initial expressing of wanted) as the - // "outcome" now - LAYER_IMPL_METRICS.inc_broadcast_lagged(); - match self.inner.get() { - Some(_) => Err(EvictionError::Downloaded), - None => Ok(()), - } - } - Err(_timeout) => Err(EvictionError::Timeout), + let changed = rx.changed(); + let changed = tokio::time::timeout(timeout, changed).await; + + let Ok(changed) = changed else { + return Err(EvictionError::Timeout); + }; + + let _: () = changed.expect("cannot be closed, because we are holding a strong reference"); + + let current = rx.borrow_and_update(); + + match &*current { + // the easiest case + Status::Evicted => Ok(()), + // it surely was evicted in between, but then there was a new access now; we can't know + // if it'll succeed so lets just call it evicted + Status::Downloading => Ok(()), + // either the download which was started after eviction completed already, or it was + // never evicted + Status::Resident => Err(EvictionError::Downloaded), } } @@ -703,32 +811,37 @@ impl LayerInner { ctx: Option<&RequestContext>, ) -> Result, DownloadError> { let (weak, permit) = { + // get_or_init_detached can: + // - be fast (mutex lock) OR uncontested semaphore permit acquire + // - be slow (wait for semaphore permit or closing) + let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); + let locked = self .inner .get_or_init_detached() .await .map(|mut guard| guard.get_and_upgrade().ok_or(guard)); + scopeguard::ScopeGuard::into_inner(init_cancelled); + match locked { // this path could had been a RwLock::read Ok(Ok((strong, upgraded))) if !upgraded => return Ok(strong), Ok(Ok((strong, _))) => { // when upgraded back, the Arc is still available, but - // previously a `evict_and_wait` was received. - self.wanted_evicted.store(false, Ordering::Relaxed); - - // error out any `evict_and_wait` - drop(self.status.send(Status::Downloaded)); + // previously a `evict_and_wait` was received. this is the only place when we + // send out an update without holding the InitPermit. + // + // note that we also have dropped the Guard; this is fine, because we just made + // a state change and are holding a strong reference to be returned. + self.status.as_ref().unwrap().send_replace(Status::Resident); LAYER_IMPL_METRICS .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess); return Ok(strong); } - Ok(Err(mut guard)) => { - // path to here: the evict_blocking is stuck on spawn_blocking queue. - // - // reset the contents, deactivating the eviction and causing a - // EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed. + Ok(Err(guard)) => { + // path to here: we won the eviction, the file should still be on the disk. let (weak, permit) = guard.take_and_deinit(); (Some(weak), permit) } @@ -744,86 +857,66 @@ impl LayerInner { ); } + let timeline = self + .timeline + .upgrade() + .ok_or_else(|| DownloadError::TimelineShutdown)?; + + // count cancellations, which currently remain largely unexpected + let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); + + // check if we really need to be downloaded: this can happen if a read access won the + // semaphore before eviction. + // + // if we are cancelled while doing this `stat` the `self.inner` will be uninitialized. a + // pending eviction will try to evict even upon finding an uninitialized `self.inner`. + let needs_download = self + .needs_download() + .await + .map_err(DownloadError::PreStatFailed); + + scopeguard::ScopeGuard::into_inner(init_cancelled); + + let needs_download = needs_download?; + + let Some(reason) = needs_download else { + // the file is present locally because eviction has not had a chance to run yet + + #[cfg(test)] + self.failpoint(failpoints::FailpointKind::AfterDeterminingLayerNeedsNoDownload) + .await?; + + LAYER_IMPL_METRICS.inc_init_needed_no_download(); + + return Ok(self.initialize_after_layer_is_on_disk(permit)); + }; + + // we must download; getting cancelled before spawning the download is not an issue as + // any still running eviction would not find anything to evict. + + if let NeedsDownload::NotFile(ft) = reason { + return Err(DownloadError::NotFile(ft)); + } + + if timeline.remote_client.as_ref().is_none() { + return Err(DownloadError::NoRemoteStorage); + } + + if let Some(ctx) = ctx { + self.check_expected_download(ctx)?; + } + + if !allow_download { + // this is only used from tests, but it is hard to test without the boolean + return Err(DownloadError::DownloadRequired); + } + async move { - // disable any scheduled but not yet running eviction deletions for this - let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); - - // no need to make the evict_and_wait wait for the actual download to complete - drop(self.status.send(Status::Downloaded)); - - let timeline = self - .timeline - .upgrade() - .ok_or_else(|| DownloadError::TimelineShutdown)?; - - // count cancellations, which currently remain largely unexpected - let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - - // check if we really need to be downloaded; could have been already downloaded by a - // cancelled previous attempt. - let needs_download = self - .needs_download() - .await - .map_err(DownloadError::PreStatFailed); - - let needs_download = match needs_download { - Ok(reason) => reason, - Err(e) => { - scopeguard::ScopeGuard::into_inner(init_cancelled); - return Err(e); - } - }; - - let Some(reason) = needs_download else { - scopeguard::ScopeGuard::into_inner(init_cancelled); - - // the file is present locally, probably by a previous but cancelled call to - // get_or_maybe_download. alternatively we might be running without remote storage. - LAYER_IMPL_METRICS.inc_init_needed_no_download(); - - let res = self.initialize_after_layer_is_on_disk(next_version, permit, false); - return Ok(res); - }; - - if let NeedsDownload::NotFile(ft) = reason { - scopeguard::ScopeGuard::into_inner(init_cancelled); - return Err(DownloadError::NotFile(ft)); - } - - // only reset this after we've decided we really need to download. otherwise it'd - // be impossible to mark cancelled downloads for eviction, like one could imagine - // we would like to do for prefetching which was not needed. - self.wanted_evicted.store(false, Ordering::Release); - - if timeline.remote_client.as_ref().is_none() { - scopeguard::ScopeGuard::into_inner(init_cancelled); - return Err(DownloadError::NoRemoteStorage); - } - - if let Some(ctx) = ctx { - let res = self.check_expected_download(ctx); - if let Err(e) = res { - scopeguard::ScopeGuard::into_inner(init_cancelled); - return Err(e); - } - } - - if !allow_download { - // this does look weird, but for LayerInner the "downloading" means also changing - // internal once related state ... - scopeguard::ScopeGuard::into_inner(init_cancelled); - return Err(DownloadError::DownloadRequired); - } - tracing::info!(%reason, "downloading on-demand"); - let permit = self.spawn_download_and_wait(timeline, permit).await; - + let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); + let res = self.download_init_and_wait(timeline, permit).await?; scopeguard::ScopeGuard::into_inner(init_cancelled); - - let permit = permit?; - - let res = self.initialize_after_layer_is_on_disk(next_version, permit, true); Ok(res) } .instrument(tracing::info_span!("get_or_maybe_download", layer=%self)) @@ -857,11 +950,11 @@ impl LayerInner { } /// Actual download, at most one is executed at the time. - async fn spawn_download_and_wait( + async fn download_init_and_wait( self: &Arc, timeline: Arc, permit: heavier_once_cell::InitPermit, - ) -> Result { + ) -> Result, DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); let (tx, rx) = tokio::sync::oneshot::channel(); @@ -873,66 +966,35 @@ impl LayerInner { .enter() .map_err(|_| DownloadError::DownloadCancelled)?; - tokio::task::spawn(async move { - + Self::spawn( + async move { let _guard = guard; - let client = timeline - .remote_client + // now that we have commited to downloading, send out an update to: + // - unhang any pending eviction + // - break out of evict_and_wait + this.status .as_ref() - .expect("checked above with have_remote_client"); + .unwrap() + .send_replace(Status::Downloading); - let result = client.download_layer_file( - &this.desc.filename(), - &this.metadata(), - &timeline.cancel - ) - .await; + #[cfg(test)] + this.failpoint(failpoints::FailpointKind::WaitBeforeDownloading) + .await + .unwrap(); - let result = match result { - Ok(size) => { - timeline.metrics.resident_physical_size_add(size); - Ok(()) - } - Err(e) => { - let consecutive_failures = - this.consecutive_failures.fetch_add(1, Ordering::Relaxed); + let res = this.download_and_init(timeline, permit).await; - let backoff = utils::backoff::exponential_backoff_duration_seconds( - consecutive_failures.min(u32::MAX as usize) as u32, - 1.5, - 60.0, - ); - - let backoff = std::time::Duration::from_secs_f64(backoff); - - tokio::select! { - _ = tokio::time::sleep(backoff) => {}, - _ = timeline.cancel.cancelled() => {}, - }; - - Err(e) - } - }; - - if let Err(res) = tx.send((result, permit)) { + if let Err(res) = tx.send(res) { match res { - (Ok(()), _) => { - // our caller is cancellation safe so this is fine; if someone - // else requests the layer, they'll find it already downloaded. - // - // See counter [`LayerImplMetrics::inc_init_needed_no_download`] - // - // FIXME(#6028): however, could be that we should consider marking the - // layer for eviction? alas, cannot: because only DownloadedLayer will - // handle that. - }, - (Err(e), _) => { - // our caller is cancellation safe, but we might be racing with - // another attempt to initialize. before we have cancellation - // token support: these attempts should converge regardless of - // their completion order. - tracing::error!("layer file download failed, and additionally failed to communicate this to caller: {e:?}"); + Ok(_res) => { + tracing::debug!("layer initialized, but caller has been cancelled"); + LAYER_IMPL_METRICS.inc_init_completed_without_requester(); + } + Err(e) => { + tracing::info!( + "layer file download failed, and caller has been cancelled: {e:?}" + ); LAYER_IMPL_METRICS.inc_download_failed_without_requester(); } } @@ -942,41 +1004,102 @@ impl LayerInner { ); match rx.await { - Ok((Ok(()), permit)) => { - if let Some(reason) = self - .needs_download() - .await - .map_err(DownloadError::PostStatFailed)? - { - // this is really a bug in needs_download or remote timeline client - panic!("post-condition failed: needs_download returned {reason:?}"); - } - - self.consecutive_failures.store(0, Ordering::Relaxed); - tracing::info!(size=%self.desc.file_size, "on-demand download successful"); - - Ok(permit) - } - Ok((Err(e), _permit)) => { + Ok(Ok(res)) => Ok(res), + Ok(Err(e)) => { // sleep already happened in the spawned task, if it was not cancelled - let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed); - match e.downcast_ref::() { // If the download failed due to its cancellation token, // propagate the cancellation error upstream. Some(remote_storage::DownloadError::Cancelled) => { Err(DownloadError::DownloadCancelled) } - _ => { - tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); - Err(DownloadError::DownloadFailed) - } + // FIXME: this is not embedding the error because historically it would had + // been output to compute, however that is no longer the case. + _ => Err(DownloadError::DownloadFailed), } } Err(_gone) => Err(DownloadError::DownloadCancelled), } } + async fn download_and_init( + self: &Arc, + timeline: Arc, + permit: heavier_once_cell::InitPermit, + ) -> anyhow::Result> { + let client = timeline + .remote_client + .as_ref() + .expect("checked before download_init_and_wait"); + + let result = client + .download_layer_file(&self.desc.filename(), &self.metadata(), &timeline.cancel) + .await; + + match result { + Ok(size) => { + assert_eq!(size, self.desc.file_size); + + match self.needs_download().await { + Ok(Some(reason)) => { + // this is really a bug in needs_download or remote timeline client + panic!("post-condition failed: needs_download returned {reason:?}"); + } + Ok(None) => { + // as expected + } + Err(e) => { + panic!("post-condition failed: needs_download errored: {e:?}"); + } + } + + tracing::info!(size=%self.desc.file_size, "on-demand download successful"); + timeline + .metrics + .resident_physical_size_add(self.desc.file_size); + self.consecutive_failures.store(0, Ordering::Relaxed); + + let since_last_eviction = self + .last_evicted_at + .lock() + .unwrap() + .take() + .map(|ts| ts.elapsed()); + if let Some(since_last_eviction) = since_last_eviction { + LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); + } + + self.access_stats.record_residence_event( + LayerResidenceStatus::Resident, + LayerResidenceEventReason::ResidenceChange, + ); + + Ok(self.initialize_after_layer_is_on_disk(permit)) + } + Err(e) => { + let consecutive_failures = + 1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed); + + tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); + + let backoff = utils::backoff::exponential_backoff_duration_seconds( + consecutive_failures.min(u32::MAX as usize) as u32, + 1.5, + 60.0, + ); + + let backoff = std::time::Duration::from_secs_f64(backoff); + + tokio::select! { + _ = tokio::time::sleep(backoff) => {}, + _ = timeline.cancel.cancelled() => {}, + }; + + Err(e) + } + } + } + /// Initializes the `Self::inner` to a "resident" state. /// /// Callers are assumed to ensure that the file is actually on disk with `Self::needs_download` @@ -986,25 +1109,13 @@ impl LayerInner { /// changes are made before we can write to the OnceCell in non-cancellable fashion. fn initialize_after_layer_is_on_disk( self: &Arc, - next_version: usize, permit: heavier_once_cell::InitPermit, - downloaded: bool, ) -> Arc { debug_assert_current_span_has_tenant_and_timeline_id(); - if downloaded { - let since_last_eviction = self - .last_evicted_at - .lock() - .unwrap() - .take() - .map(|ts| ts.elapsed()); - if let Some(since_last_eviction) = since_last_eviction { - // FIXME: this will not always be recorded correctly until #6028 (the no - // download needed branch above) - LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); - } - } + // disable any scheduled but not yet running eviction deletions for this initialization + let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); + self.status.as_ref().unwrap().send_replace(Status::Resident); let res = Arc::new(DownloadedLayer { owner: Arc::downgrade(self), @@ -1012,15 +1123,9 @@ impl LayerInner { version: next_version, }); - // FIXME: this might now be double-accounted for !downloaded - self.access_stats.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::ResidenceChange, - ); - let waiters = self.inner.initializer_count(); if waiters > 0 { - tracing::info!(waiters, "completing the on-demand download for other tasks"); + tracing::info!(waiters, "completing layer init for other tasks"); } let value = ResidentOrWantedEvicted::Resident(res.clone()); @@ -1063,9 +1168,11 @@ impl LayerInner { fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { let layer_file_name = self.desc.filename().file_name(); - // this is not accurate: we could have the file locally but there was a cancellation - // and now we are not in sync, or we are currently downloading it. - let remote = self.inner.get().is_none(); + let resident = self + .inner + .get() + .map(|rowe| rowe.is_likely_resident()) + .unwrap_or(false); let access_stats = self.access_stats.as_api_model(reset); @@ -1077,7 +1184,7 @@ impl LayerInner { layer_file_size: self.desc.file_size, lsn_start: lsn_range.start, lsn_end: lsn_range.end, - remote, + remote: !resident, access_stats, } } else { @@ -1087,94 +1194,195 @@ impl LayerInner { layer_file_name, layer_file_size: self.desc.file_size, lsn_start: lsn, - remote, + remote: !resident, access_stats, } } } /// `DownloadedLayer` is being dropped, so it calls this method. - fn on_downloaded_layer_drop(self: Arc, version: usize) { - let evict = self.wanted_evicted.load(Ordering::Acquire); + fn on_downloaded_layer_drop(self: Arc, only_version: usize) { let can_evict = self.have_remote_client; - if can_evict && evict { - let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version); + // we cannot know without inspecting LayerInner::inner if we should evict or not, even + // though here it is very likely + let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, version=%only_version); - // downgrade for queueing, in case there's a tear down already ongoing we should not - // hold it alive. - let this = Arc::downgrade(&self); - drop(self); - - // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might - // drop while the `self.inner` is being locked, leading to a deadlock. - - crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || { - let _g = span.entered(); - - // if LayerInner is already dropped here, do nothing because the delete on drop - // has already ran while we were in queue - let Some(this) = this.upgrade() else { - LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); - return; - }; - match this.evict_blocking(version) { - Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(), - Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason), - } + if !can_evict { + // it would be nice to assert this case out, but we are in drop + span.in_scope(|| { + tracing::error!("bug in struct Layer: ResidentOrWantedEvicted has been downgraded while we have no remote storage"); }); + return; } + + // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might + // drop while the `self.inner` is being locked, leading to a deadlock. + + let start_evicting = async move { + #[cfg(test)] + self.failpoint(failpoints::FailpointKind::WaitBeforeStartingEvicting) + .await + .expect("failpoint should not have errored"); + + tracing::debug!("eviction started"); + + let res = self.wait_for_turn_and_evict(only_version).await; + // metrics: ignore the Ok branch, it is not done yet + if let Err(e) = res { + tracing::debug!(res=?Err::<(), _>(&e), "eviction completed"); + LAYER_IMPL_METRICS.inc_eviction_cancelled(e); + } + }; + + Self::spawn(start_evicting.instrument(span)); } - fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> { - // deleted or detached timeline, don't do anything. - let Some(timeline) = self.timeline.upgrade() else { - return Err(EvictionCancelled::TimelineGone); - }; + async fn wait_for_turn_and_evict( + self: Arc, + only_version: usize, + ) -> Result<(), EvictionCancelled> { + fn is_good_to_continue(status: &Status) -> Result<(), EvictionCancelled> { + use Status::*; + match status { + Resident => Ok(()), + Evicted => Err(EvictionCancelled::UnexpectedEvictedState), + Downloading => Err(EvictionCancelled::LostToDownload), + } + } + + let timeline = self + .timeline + .upgrade() + .ok_or(EvictionCancelled::TimelineGone)?; + + let mut rx = self + .status + .as_ref() + .expect("LayerInner cannot be dropped, holding strong ref") + .subscribe(); + + is_good_to_continue(&rx.borrow_and_update())?; let Ok(_gate) = timeline.gate.enter() else { return Err(EvictionCancelled::TimelineGone); }; - // to avoid starting a new download while we evict, keep holding on to the - // permit. - let _permit = { - let maybe_downloaded = self.inner.get(); + let permit = { + // we cannot just `std::fs::remove_file` because there might already be an + // get_or_maybe_download which will inspect filesystem and reinitialize. filesystem + // operations must be done while holding the heavier_once_cell::InitPermit + let mut wait = std::pin::pin!(self.inner.get_or_init_detached()); - let (_weak, permit) = match maybe_downloaded { - Some(mut guard) => { - if let ResidentOrWantedEvicted::WantedEvicted(_weak, version) = &*guard { - if *version == only_version { - guard.take_and_deinit() - } else { - // this was not for us; maybe there's another eviction job - // TODO: does it make any sense to stall here? unique versions do not - // matter, we only want to make sure not to evict a resident, which we - // are not doing. - return Err(EvictionCancelled::VersionCheckFailed); - } - } else { - return Err(EvictionCancelled::AlreadyReinitialized); + let waited = loop { + // we must race to the Downloading starting, otherwise we would have to wait until the + // completion of the download. waiting for download could be long and hinder our + // efforts to alert on "hanging" evictions. + tokio::select! { + res = &mut wait => break res, + _ = rx.changed() => { + is_good_to_continue(&rx.borrow_and_update())?; + // two possibilities for Status::Resident: + // - the layer was found locally from disk by a read + // - we missed a bunch of updates and now the layer is + // again downloaded -- assume we'll fail later on with + // version check or AlreadyReinitialized } } - None => { - // already deinitialized, perhaps get_or_maybe_download did this and is - // currently waiting to reinitialize it - return Err(EvictionCancelled::LostToDownload); + }; + + // re-check now that we have the guard or permit; all updates should have happened + // while holding the permit. + is_good_to_continue(&rx.borrow_and_update())?; + + // the term deinitialize is used here, because we clearing out the Weak will eventually + // lead to deallocating the reference counted value, and the value we + // `Guard::take_and_deinit` is likely to be the last because the Weak is never cloned. + let (_weak, permit) = match waited { + Ok(guard) => { + match &*guard { + ResidentOrWantedEvicted::WantedEvicted(_weak, version) + if *version == only_version => + { + tracing::debug!(version, "deinitializing matching WantedEvicted"); + let (weak, permit) = guard.take_and_deinit(); + (Some(weak), permit) + } + ResidentOrWantedEvicted::WantedEvicted(_, version) => { + // if we were not doing the version check, we would need to try to + // upgrade the weak here to see if it really is dropped. version check + // is done instead assuming that it is cheaper. + tracing::debug!( + version, + only_version, + "version mismatch, not deinitializing" + ); + return Err(EvictionCancelled::VersionCheckFailed); + } + ResidentOrWantedEvicted::Resident(_) => { + return Err(EvictionCancelled::AlreadyReinitialized); + } + } + } + Err(permit) => { + tracing::debug!("continuing after cancelled get_or_maybe_download or eviction"); + (None, permit) } }; permit }; - // now accesses to inner.get_or_init wait on the semaphore or the `_permit` + let span = tracing::Span::current(); - self.access_stats.record_residence_event( - LayerResidenceStatus::Evicted, - LayerResidenceEventReason::ResidenceChange, - ); + let spawned_at = std::time::Instant::now(); - let res = match capture_mtime_and_remove(&self.path) { + // this is on purpose a detached spawn; we don't need to wait for it + // + // eviction completion reporting is the only thing hinging on this, and it can be just as + // well from a spawn_blocking thread. + // + // important to note that now that we've acquired the permit we have made sure the evicted + // file is either the exact `WantedEvicted` we wanted to evict, or uninitialized in case + // there are multiple evictions. The rest is not cancellable, and we've now commited to + // evicting. + // + // If spawn_blocking has a queue and maximum number of threads are in use, we could stall + // reads. We will need to add cancellation for that if necessary. + Self::spawn_blocking(move || { + let _span = span.entered(); + + let res = self.evict_blocking(&timeline, &permit); + + let waiters = self.inner.initializer_count(); + + if waiters > 0 { + LAYER_IMPL_METRICS.inc_evicted_with_waiters(); + } + + let completed_in = spawned_at.elapsed(); + LAYER_IMPL_METRICS.record_time_to_evict(completed_in); + + match res { + Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(), + Err(e) => LAYER_IMPL_METRICS.inc_eviction_cancelled(e), + } + + tracing::debug!(?res, elapsed_ms=%completed_in.as_millis(), %waiters, "eviction completed"); + }); + + Ok(()) + } + + /// This is blocking only to do just one spawn_blocking hop compared to multiple via tokio::fs. + fn evict_blocking( + &self, + timeline: &Timeline, + _permit: &heavier_once_cell::InitPermit, + ) -> Result<(), EvictionCancelled> { + // now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit` + + match capture_mtime_and_remove(&self.path) { Ok(local_layer_mtime) => { let duration = SystemTime::now().duration_since(local_layer_mtime); match duration { @@ -1198,33 +1406,60 @@ impl LayerInner { timeline .metrics .resident_physical_size_sub(self.desc.file_size); - - Ok(()) } Err(e) if e.kind() == std::io::ErrorKind::NotFound => { tracing::error!( layer_size = %self.desc.file_size, - "failed to evict layer from disk, it was already gone (metrics will be inaccurate)" + "failed to evict layer from disk, it was already gone" ); - Err(EvictionCancelled::FileNotFound) + return Err(EvictionCancelled::FileNotFound); } Err(e) => { + // FIXME: this should probably be an abort tracing::error!("failed to evict file from disk: {e:#}"); - Err(EvictionCancelled::RemoveFailed) + return Err(EvictionCancelled::RemoveFailed); } - }; + } - // we are still holding the permit, so no new spawn_download_and_wait can happen - drop(self.status.send(Status::Evicted)); + self.access_stats.record_residence_event( + LayerResidenceStatus::Evicted, + LayerResidenceEventReason::ResidenceChange, + ); + + self.status.as_ref().unwrap().send_replace(Status::Evicted); *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now()); - res + Ok(()) } fn metadata(&self) -> LayerFileMetadata { LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard) } + + /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME. + /// + /// Synchronizing with spawned tasks is very complicated otherwise. + fn spawn(fut: F) + where + F: std::future::Future + Send + 'static, + { + #[cfg(test)] + tokio::task::spawn(fut); + #[cfg(not(test))] + crate::task_mgr::BACKGROUND_RUNTIME.spawn(fut); + } + + /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME. + fn spawn_blocking(f: F) + where + F: FnOnce() + Send + 'static, + { + #[cfg(test)] + tokio::task::spawn_blocking(f); + #[cfg(not(test))] + crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(f); + } } fn capture_mtime_and_remove(path: &Utf8Path) -> Result { @@ -1268,8 +1503,10 @@ pub(crate) enum DownloadError { DownloadCancelled, #[error("pre-condition: stat before download failed")] PreStatFailed(#[source] std::io::Error), - #[error("post-condition: stat after download failed")] - PostStatFailed(#[source] std::io::Error), + + #[cfg(test)] + #[error("failpoint: {0:?}")] + Failpoint(failpoints::FailpointKind), } #[derive(Debug, PartialEq)] @@ -1316,6 +1553,7 @@ impl Drop for DownloadedLayer { owner.on_downloaded_layer_drop(self.version); } else { // no need to do anything, we are shutting down + LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); } } } @@ -1556,6 +1794,7 @@ pub(crate) struct LayerImplMetrics { rare_counters: enum_map::EnumMap, inits_cancelled: metrics::core::GenericCounter, redownload_after: metrics::Histogram, + time_to_evict: metrics::Histogram, } impl Default for LayerImplMetrics { @@ -1651,6 +1890,13 @@ impl Default for LayerImplMetrics { .unwrap() }; + let time_to_evict = metrics::register_histogram!( + "pageserver_layer_eviction_held_permit_seconds", + "Time eviction held the permit.", + vec![0.001, 0.010, 0.100, 0.500, 1.000, 5.000] + ) + .unwrap(); + Self { started_evictions, completed_evictions, @@ -1663,6 +1909,7 @@ impl Default for LayerImplMetrics { rare_counters, inits_cancelled, redownload_after, + time_to_evict, } } } @@ -1694,6 +1941,12 @@ impl LayerImplMetrics { self.rare_counters[RareEvent::RemoveOnDropFailed].inc(); } + /// Expected rare just as cancellations are rare, but we could have cancellations separate from + /// the single caller which can start the download, so use this counter to separte them. + fn inc_init_completed_without_requester(&self) { + self.rare_counters[RareEvent::InitCompletedWithoutRequester].inc(); + } + /// Expected rare because cancellations are unexpected, and failures are unexpected fn inc_download_failed_without_requester(&self) { self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc(); @@ -1718,10 +1971,6 @@ impl LayerImplMetrics { self.rare_counters[RareEvent::PermanentLoadingFailure].inc(); } - fn inc_broadcast_lagged(&self) { - self.rare_counters[RareEvent::EvictAndWaitLagged].inc(); - } - fn inc_init_cancelled(&self) { self.inits_cancelled.inc() } @@ -1729,9 +1978,22 @@ impl LayerImplMetrics { fn record_redownloaded_after(&self, duration: std::time::Duration) { self.redownload_after.observe(duration.as_secs_f64()) } + + /// This would be bad if it ever happened, or mean extreme disk pressure. We should probably + /// instead cancel eviction if we would have read waiters. We cannot however separate reads + /// from other evictions, so this could have noise as well. + fn inc_evicted_with_waiters(&self) { + self.rare_counters[RareEvent::EvictedWithWaiters].inc(); + } + + /// Recorded at least initially as the permit is now acquired in async context before + /// spawn_blocking action. + fn record_time_to_evict(&self, duration: std::time::Duration) { + self.time_to_evict.observe(duration.as_secs_f64()) + } } -#[derive(enum_map::Enum)] +#[derive(Debug, Clone, Copy, enum_map::Enum)] enum EvictionCancelled { LayerGone, TimelineGone, @@ -1743,6 +2005,7 @@ enum EvictionCancelled { LostToDownload, /// After eviction, there was a new layer access which cancelled the eviction. UpgradedBackOnAccess, + UnexpectedEvictedState, } impl EvictionCancelled { @@ -1756,6 +2019,7 @@ impl EvictionCancelled { EvictionCancelled::AlreadyReinitialized => "already_reinitialized", EvictionCancelled::LostToDownload => "lost_to_download", EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access", + EvictionCancelled::UnexpectedEvictedState => "unexpected_evicted_state", } } } @@ -1778,11 +2042,12 @@ impl DeleteFailed { #[derive(enum_map::Enum)] enum RareEvent { RemoveOnDropFailed, + InitCompletedWithoutRequester, DownloadFailedWithoutRequester, UpgradedWantedEvicted, InitWithoutDownload, PermanentLoadingFailure, - EvictAndWaitLagged, + EvictedWithWaiters, } impl RareEvent { @@ -1791,11 +2056,12 @@ impl RareEvent { match self { RemoveOnDropFailed => "remove_on_drop_failed", + InitCompletedWithoutRequester => "init_completed_without", DownloadFailedWithoutRequester => "download_failed_without", UpgradedWantedEvicted => "raced_wanted_evicted", InitWithoutDownload => "init_needed_no_download", PermanentLoadingFailure => "permanent_loading_failure", - EvictAndWaitLagged => "broadcast_lagged", + EvictedWithWaiters => "evicted_with_waiters", } } } diff --git a/pageserver/src/tenant/storage_layer/layer/failpoints.rs b/pageserver/src/tenant/storage_layer/layer/failpoints.rs new file mode 100644 index 0000000000..6cedc41d98 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/layer/failpoints.rs @@ -0,0 +1,119 @@ +//! failpoints for unit tests, implying `#[cfg(test)]`. +//! +//! These are not accessible over http. + +use super::*; + +impl Layer { + /// Enable a failpoint from a unit test. + pub(super) fn enable_failpoint(&self, failpoint: Failpoint) { + self.0.failpoints.lock().unwrap().push(failpoint); + } +} + +impl LayerInner { + /// Query if this failpoint is enabled, as in, arrive at a failpoint. + /// + /// Calls to this method need to be `#[cfg(test)]` guarded. + pub(super) async fn failpoint(&self, kind: FailpointKind) -> Result<(), FailpointHit> { + let fut = { + let mut fps = self.failpoints.lock().unwrap(); + // find the *last* failpoint for cases in which we need to use multiple for the same + // thing (two blocked evictions) + let fp = fps.iter_mut().rfind(|x| x.kind() == kind); + + let Some(fp) = fp else { + return Ok(()); + }; + + fp.hit() + }; + + fut.await + } +} + +#[derive(Debug, PartialEq, Eq)] +pub(crate) enum FailpointKind { + /// Failpoint acts as an accurate cancelled by drop here; see the only site of use. + AfterDeterminingLayerNeedsNoDownload, + /// Failpoint for stalling eviction starting + WaitBeforeStartingEvicting, + /// Failpoint hit in the spawned task + WaitBeforeDownloading, +} + +pub(crate) enum Failpoint { + AfterDeterminingLayerNeedsNoDownload, + WaitBeforeStartingEvicting( + Option, + utils::completion::Barrier, + ), + WaitBeforeDownloading( + Option, + utils::completion::Barrier, + ), +} + +impl Failpoint { + fn kind(&self) -> FailpointKind { + match self { + Failpoint::AfterDeterminingLayerNeedsNoDownload => { + FailpointKind::AfterDeterminingLayerNeedsNoDownload + } + Failpoint::WaitBeforeStartingEvicting(..) => FailpointKind::WaitBeforeStartingEvicting, + Failpoint::WaitBeforeDownloading(..) => FailpointKind::WaitBeforeDownloading, + } + } + + fn hit(&mut self) -> impl std::future::Future> + 'static { + use futures::future::FutureExt; + + // use boxed futures to avoid Either hurdles + match self { + Failpoint::AfterDeterminingLayerNeedsNoDownload => { + let kind = self.kind(); + + async move { Err(FailpointHit(kind)) }.boxed() + } + Failpoint::WaitBeforeStartingEvicting(arrival, b) + | Failpoint::WaitBeforeDownloading(arrival, b) => { + // first one signals arrival + drop(arrival.take()); + + let b = b.clone(); + + async move { + tracing::trace!("waiting on a failpoint barrier"); + b.wait().await; + tracing::trace!("done waiting on a failpoint barrier"); + Ok(()) + } + .boxed() + } + } + } +} + +impl std::fmt::Display for FailpointKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Debug::fmt(self, f) + } +} + +#[derive(Debug)] +pub(crate) struct FailpointHit(FailpointKind); + +impl std::fmt::Display for FailpointHit { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Debug::fmt(self, f) + } +} + +impl std::error::Error for FailpointHit {} + +impl From for DownloadError { + fn from(value: FailpointHit) -> Self { + DownloadError::Failpoint(value.0) + } +} diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index e7b2eb025a..247ff123b5 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,14 +1,13 @@ -use futures::StreamExt; use pageserver_api::key::CONTROLFILE_KEY; use tokio::task::JoinSet; -use tracing::Instrument; use utils::{ completion::{self, Completion}, id::TimelineId, }; +use super::failpoints::{Failpoint, FailpointKind}; use super::*; -use crate::{context::DownloadBehavior, task_mgr::BACKGROUND_RUNTIME}; +use crate::context::DownloadBehavior; use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; /// Used in tests to advance a future to wanted await point, and not futher. @@ -21,7 +20,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s /// Demonstrate the API and resident -> evicted -> resident -> deleted transitions. #[tokio::test] async fn smoke_test() { - let handle = BACKGROUND_RUNTIME.handle(); + let handle = tokio::runtime::Handle::current(); let h = TenantHarness::create("smoke_test").unwrap(); let span = h.span(); @@ -38,7 +37,7 @@ async fn smoke_test() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.resident_layers().collect::>().await + layers.likely_resident_layers().collect::>() }; assert_eq!(layers.len(), 1); @@ -88,7 +87,7 @@ async fn smoke_test() { // // ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to // artificially slow it down. - let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await; + let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(&handle).await; match layer .evict_and_wait(std::time::Duration::ZERO) @@ -99,7 +98,7 @@ async fn smoke_test() { // expected, but note that the eviction is "still ongoing" helper.release().await; // exhaust spawn_blocking pool to ensure it is now complete - SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle) + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle) .await; } other => unreachable!("{other:?}"), @@ -108,7 +107,7 @@ async fn smoke_test() { // only way to query if a layer is resident is to acquire a ResidentLayer instance. // Layer::keep_resident never downloads, but it might initialize if the layer file is found // downloaded locally. - let none = layer.keep_resident().await.unwrap(); + let none = layer.keep_resident().await; assert!( none.is_none(), "Expected none, because eviction removed the local file, found: {none:?}" @@ -167,6 +166,7 @@ async fn smoke_test() { rtc.wait_completion().await.unwrap(); assert_eq!(rtc.get_remote_physical_size(), 0); + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) } /// This test demonstrates a previous hang when a eviction and deletion were requested at the same @@ -174,7 +174,7 @@ async fn smoke_test() { #[tokio::test(start_paused = true)] async fn evict_and_wait_on_wanted_deleted() { // this is the runtime on which Layer spawns the blocking tasks on - let handle = BACKGROUND_RUNTIME.handle(); + let handle = tokio::runtime::Handle::current(); let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap(); utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); @@ -188,7 +188,7 @@ async fn evict_and_wait_on_wanted_deleted() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.resident_layers().collect::>().await + layers.likely_resident_layers().collect::>() }; assert_eq!(layers.len(), 1); @@ -213,11 +213,11 @@ async fn evict_and_wait_on_wanted_deleted() { drop(resident); // make sure the eviction task gets to run - SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; let resident = layer.keep_resident().await; assert!( - matches!(resident, Ok(None)), + resident.is_none(), "keep_resident should not have re-initialized: {resident:?}" ); @@ -235,24 +235,408 @@ async fn evict_and_wait_on_wanted_deleted() { layers.finish_gc_timeline(&[layer]); } - SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get()); assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get()); assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get()); + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) } -/// This test shows that ensures we are able to read the layer while the layer eviction has been -/// started but not completed due to spawn_blocking pool being blocked. -/// -/// Here `Layer::keep_resident` is used to "simulate" reads, because it cannot download. -#[tokio::test(start_paused = true)] -async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() { - // this is the runtime on which Layer spawns the blocking tasks on - let handle = BACKGROUND_RUNTIME.handle(); - let h = TenantHarness::create("residency_check_while_evict_and_wait_on_clogged_spawn_blocking") +/// This test ensures we are able to read the layer while the layer eviction has been +/// started but not completed. +#[test] +fn read_wins_pending_eviction() { + let rt = tokio::runtime::Builder::new_current_thread() + .max_blocking_threads(1) + .enable_all() + .start_paused(true) + .build() .unwrap(); + + rt.block_on(async move { + // this is the runtime on which Layer spawns the blocking tasks on + let handle = tokio::runtime::Handle::current(); + let h = TenantHarness::create("read_wins_pending_eviction").unwrap(); + let (tenant, ctx) = h.load().await; + let span = h.span(); + let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.likely_resident_layers().collect::>() + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // setup done + + let resident = layer.keep_resident().await.unwrap(); + + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // drive the future to await on the status channel + tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + + let (completion, barrier) = utils::completion::channel(); + let (arrival, arrived_at_barrier) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); + + // now the eviction cannot proceed because the threads are consumed while completion exists + drop(resident); + arrived_at_barrier.wait().await; + assert!(!layer.is_likely_resident()); + + // because no actual eviction happened, we get to just reinitialize the DownloadedLayer + layer + .0 + .get_or_maybe_download(false, None) + .instrument(download_span) + .await + .expect("should had reinitialized without downloading"); + + assert!(layer.is_likely_resident()); + + // reinitialization notifies of new resident status, which should error out all evict_and_wait + let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect("no timeout, because get_or_maybe_download re-initialized") + .expect_err("eviction should not have succeeded because re-initialized"); + + // works as intended: evictions lose to "downloads" + assert!(matches!(e, EvictionError::Downloaded), "{e:?}"); + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // this is not wrong: the eviction is technically still "on the way" as it's still queued + // because of a failpoint + assert_eq!( + 0, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + drop(completion); + + tokio::time::sleep(ADVANCE).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1) + .await; + + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // now we finally can observe the original eviction failing + // it would had been possible to observe it earlier, but here it is guaranteed to have + // happened. + assert_eq!( + 1, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + assert_eq!( + 1, + LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::AlreadyReinitialized].get() + ); + + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) + }); +} + +/// Use failpoint to delay an eviction starting to get a VersionCheckFailed. +#[test] +fn multiple_pending_evictions_in_order() { + let name = "multiple_pending_evictions_in_order"; + let in_order = true; + multiple_pending_evictions_scenario(name, in_order); +} + +/// Use failpoint to reorder later eviction before first to get a UnexpectedEvictedState. +#[test] +fn multiple_pending_evictions_out_of_order() { + let name = "multiple_pending_evictions_out_of_order"; + let in_order = false; + multiple_pending_evictions_scenario(name, in_order); +} + +fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { + let rt = tokio::runtime::Builder::new_current_thread() + .max_blocking_threads(1) + .enable_all() + .start_paused(true) + .build() + .unwrap(); + + rt.block_on(async move { + // this is the runtime on which Layer spawns the blocking tasks on + let handle = tokio::runtime::Handle::current(); + let h = TenantHarness::create(name).unwrap(); + let (tenant, ctx) = h.load().await; + let span = h.span(); + let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.likely_resident_layers().collect::>() + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // setup done + + let resident = layer.keep_resident().await.unwrap(); + + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // drive the future to await on the status channel + tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + + let (completion1, barrier) = utils::completion::channel(); + let mut completion1 = Some(completion1); + let (arrival, arrived_at_barrier) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); + + // now the eviction cannot proceed because we are simulating arbitrary long delay for the + // eviction task start. + drop(resident); + assert!(!layer.is_likely_resident()); + + arrived_at_barrier.wait().await; + + // because no actual eviction happened, we get to just reinitialize the DownloadedLayer + layer + .0 + .get_or_maybe_download(false, None) + .instrument(download_span) + .await + .expect("should had reinitialized without downloading"); + + assert!(layer.is_likely_resident()); + + // reinitialization notifies of new resident status, which should error out all evict_and_wait + let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect("no timeout, because get_or_maybe_download re-initialized") + .expect_err("eviction should not have succeeded because re-initialized"); + + // works as intended: evictions lose to "downloads" + assert!(matches!(e, EvictionError::Downloaded), "{e:?}"); + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // this is not wrong: the eviction is technically still "on the way" as it's still queued + // because of a failpoint + assert_eq!( + 0, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // configure another failpoint for the second eviction -- evictions are per initialization, + // so now that we've reinitialized the inner, we get to run two of them at the same time. + let (completion2, barrier) = utils::completion::channel(); + let (arrival, arrived_at_barrier) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); + + let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // advance to the wait on the queue + tokio::time::timeout(ADVANCE, &mut second_eviction) + .await + .expect_err("timeout because failpoint is blocking"); + + arrived_at_barrier.wait().await; + + assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get()); + + let mut release_earlier_eviction = |expected_reason| { + assert_eq!( + 0, + LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(), + ); + + drop(completion1.take().unwrap()); + + let handle = &handle; + + async move { + tokio::time::sleep(ADVANCE).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0( + handle, 1, + ) + .await; + + assert_eq!( + 1, + LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(), + ); + } + }; + + if in_order { + release_earlier_eviction(EvictionCancelled::VersionCheckFailed).await; + } + + // release the later eviction which is for the current version + drop(completion2); + tokio::time::sleep(ADVANCE).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1) + .await; + + if !in_order { + release_earlier_eviction(EvictionCancelled::UnexpectedEvictedState).await; + } + + tokio::time::timeout(ADVANCE, &mut second_eviction) + .await + .expect("eviction goes through now that spawn_blocking is unclogged") + .expect("eviction should succeed, because version matches"); + + assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get()); + + // ensure the cancelled are unchanged + assert_eq!( + 1, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) + }); +} + +/// The test ensures with a failpoint that a pending eviction is not cancelled by what is currently +/// a `Layer::keep_resident` call. +/// +/// This matters because cancelling the eviction would leave us in a state where the file is on +/// disk but the layer internal state says it has not been initialized. Futhermore, it allows us to +/// have non-repairing `Layer::is_likely_resident`. +#[tokio::test(start_paused = true)] +async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { + let handle = tokio::runtime::Handle::current(); + let h = + TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap(); + let (tenant, ctx) = h.load().await; + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.likely_resident_layers().collect::>() + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // this failpoint will simulate the `get_or_maybe_download` becoming cancelled (by returning an + // Err) at the right time as in "during" the `LayerInner::needs_download`. + layer.enable_failpoint(Failpoint::AfterDeterminingLayerNeedsNoDownload); + + let (completion, barrier) = utils::completion::channel(); + let (arrival, arrived_at_barrier) = utils::completion::channel(); + + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); + + tokio::time::timeout(ADVANCE, layer.evict_and_wait(FOREVER)) + .await + .expect_err("should had advanced to waiting on channel"); + + arrived_at_barrier.wait().await; + + // simulate a cancelled read which is cancelled before it gets to re-initialize + let e = layer + .0 + .get_or_maybe_download(false, None) + .await + .unwrap_err(); + assert!( + matches!( + e, + DownloadError::Failpoint(FailpointKind::AfterDeterminingLayerNeedsNoDownload) + ), + "{e:?}" + ); + + assert!( + layer.0.needs_download().await.unwrap().is_none(), + "file is still on disk" + ); + + // release the eviction task + drop(completion); + tokio::time::sleep(ADVANCE).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; + + // failpoint is still enabled, but it is not hit + let e = layer + .0 + .get_or_maybe_download(false, None) + .await + .unwrap_err(); + assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}"); + + // failpoint is not counted as cancellation either + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) +} + +#[tokio::test(start_paused = true)] +async fn evict_and_wait_does_not_wait_for_download() { + // let handle = tokio::runtime::Handle::current(); + let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap(); let (tenant, ctx) = h.load().await; let span = h.span(); let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); @@ -265,7 +649,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() { let layer = { let mut layers = { let layers = timeline.layers.read().await; - layers.resident_layers().collect::>().await + layers.likely_resident_layers().collect::>() }; assert_eq!(layers.len(), 1); @@ -273,91 +657,76 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() { layers.swap_remove(0) }; - // setup done - - let resident = layer.keep_resident().await.unwrap(); + // kind of forced setup: start an eviction but do not allow it progress until we are + // downloading + let (eviction_can_continue, barrier) = utils::completion::channel(); + let (arrival, eviction_arrived) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); - // drive the future to await on the status channel + // use this once-awaited other_evict to synchronize with the eviction + let other_evict = layer.evict_and_wait(FOREVER); + tokio::time::timeout(ADVANCE, &mut evict_and_wait) .await - .expect_err("should had been a timeout since we are holding the layer resident"); - assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + .expect_err("should had advanced"); + eviction_arrived.wait().await; + drop(eviction_can_continue); + other_evict.await.unwrap(); - // clog up BACKGROUND_RUNTIME spawn_blocking - let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await; + // now the layer is evicted, and the "evict_and_wait" is waiting on the receiver + assert!(!layer.is_likely_resident()); - // now the eviction cannot proceed because the threads are consumed while completion exists - drop(resident); + // following new evict_and_wait will fail until we've completed the download + let e = layer.evict_and_wait(FOREVER).await.unwrap_err(); + assert!(matches!(e, EvictionError::NotFound), "{e:?}"); - // because no actual eviction happened, we get to just reinitialize the DownloadedLayer - layer - .keep_resident() - .instrument(download_span) - .await - .expect("keep_resident should had reinitialized without downloading") - .expect("ResidentLayer"); + let (download_can_continue, barrier) = utils::completion::channel(); + let (arrival, _download_arrived) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeDownloading(Some(arrival), barrier)); - // because the keep_resident check alters wanted evicted without sending a message, we will - // never get completed - let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait) - .await - .expect("no timeout, because keep_resident re-initialized") - .expect_err("eviction should not have succeeded because re-initialized"); + let mut download = std::pin::pin!(layer + .0 + .get_or_maybe_download(true, None) + .instrument(download_span)); - // works as intended: evictions lose to "downloads" - assert!(matches!(e, EvictionError::Downloaded), "{e:?}"); - assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); - - // this is not wrong: the eviction is technically still "on the way" as it's still queued - // because spawn_blocking is clogged up - assert_eq!( - 0, - LAYER_IMPL_METRICS - .cancelled_evictions - .values() - .map(|ctr| ctr.get()) - .sum::() + assert!( + !layer.is_likely_resident(), + "during download layer is evicted" ); - let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER)); - - // advance to the wait on the queue - tokio::time::timeout(ADVANCE, &mut second_eviction) + tokio::time::timeout(ADVANCE, &mut download) .await - .expect_err("timeout because spawn_blocking is clogged"); + .expect_err("should had timed out because of failpoint"); - // in this case we don't leak started evictions, but I think there is still a chance of that - // happening, because we could have upgrades race multiple evictions while only one of them - // happens? - assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get()); + // now we finally get to continue, and because the latest state is downloading, we deduce that + // original eviction succeeded + evict_and_wait.await.unwrap(); - helper.release().await; + // however a new evict_and_wait will fail + let e = layer.evict_and_wait(FOREVER).await.unwrap_err(); + assert!(matches!(e, EvictionError::NotFound), "{e:?}"); - // the second_eviction gets to run here - // - // synchronize to be *strictly* after the second_eviction spawn_blocking run - SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await; + assert!(!layer.is_likely_resident()); - tokio::time::timeout(ADVANCE, &mut second_eviction) - .await - .expect("eviction goes through now that spawn_blocking is unclogged") - .expect("eviction should succeed, because version matches"); + drop(download_can_continue); + download.await.expect("download should had succeeded"); + assert!(layer.is_likely_resident()); - assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get()); + // only now can we evict + layer.evict_and_wait(FOREVER).await.unwrap(); +} - // now we finally can observe the original spawn_blocking failing - // it would had been possible to observe it earlier, but here it is guaranteed to have - // happened. - assert_eq!( - 1, - LAYER_IMPL_METRICS - .cancelled_evictions - .values() - .map(|ctr| ctr.get()) - .sum::() - ); +#[test] +fn layer_size() { + assert_eq!(std::mem::size_of::(), 2040); + assert_eq!(std::mem::size_of::(), 104); + assert_eq!(std::mem::size_of::(), 2328); + // it also has the utf8 path } struct SpawnBlockingPoolHelper { @@ -374,31 +743,41 @@ impl SpawnBlockingPoolHelper { /// /// This should be no issue nowdays, because nextest runs each test in it's own process. async fn consume_all_spawn_blocking_threads(handle: &tokio::runtime::Handle) -> Self { - let (completion, barrier) = completion::channel(); - let (tx, mut rx) = tokio::sync::mpsc::channel(8); + let default_max_blocking_threads = 512; - let assumed_max_blocking_threads = 512; + Self::consume_all_spawn_blocking_threads0(handle, default_max_blocking_threads).await + } + + async fn consume_all_spawn_blocking_threads0( + handle: &tokio::runtime::Handle, + threads: usize, + ) -> Self { + assert_ne!(threads, 0); + + let (completion, barrier) = completion::channel(); + let (started, starts_completed) = completion::channel(); let mut blocking_tasks = JoinSet::new(); - for _ in 0..assumed_max_blocking_threads { + for _ in 0..threads { let barrier = barrier.clone(); - let tx = tx.clone(); + let started = started.clone(); blocking_tasks.spawn_blocking_on( move || { - tx.blocking_send(()).unwrap(); - drop(tx); + drop(started); tokio::runtime::Handle::current().block_on(barrier.wait()); }, handle, ); } + drop(started); + + starts_completed.wait().await; + drop(barrier); - for _ in 0..assumed_max_blocking_threads { - rx.recv().await.unwrap(); - } + tracing::trace!("consumed all threads"); SpawnBlockingPoolHelper { awaited_by_spawn_blocking_tasks: completion, @@ -418,13 +797,22 @@ impl SpawnBlockingPoolHelper { while let Some(res) = blocking_tasks.join_next().await { res.expect("none of the tasks should had panicked"); } + + tracing::trace!("released all threads"); } /// In the tests it is used as an easy way of making sure something scheduled on the target /// runtimes `spawn_blocking` has completed, because it must've been scheduled and completed /// before our tasks have a chance to schedule and complete. async fn consume_and_release_all_of_spawn_blocking_threads(handle: &tokio::runtime::Handle) { - Self::consume_all_spawn_blocking_threads(handle) + Self::consume_and_release_all_of_spawn_blocking_threads0(handle, 512).await + } + + async fn consume_and_release_all_of_spawn_blocking_threads0( + handle: &tokio::runtime::Handle, + threads: usize, + ) { + Self::consume_all_spawn_blocking_threads0(handle, threads) .await .release() .await @@ -438,7 +826,7 @@ fn spawn_blocking_pool_helper_actually_works() { // because the amount is not configurable for our helper, expect the same amount as // BACKGROUND_RUNTIME using the tokio defaults would have. let rt = tokio::runtime::Builder::new_current_thread() - .max_blocking_threads(512) + .max_blocking_threads(1) .enable_all() .build() .unwrap(); @@ -448,7 +836,8 @@ fn spawn_blocking_pool_helper_actually_works() { rt.block_on(async move { // this will not return until all threads are spun up and actually executing the code // waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d. - let consumed = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await; + let consumed = + SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads0(handle, 1).await; println!("consumed"); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2ab7301cce..7523130f23 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -13,7 +13,6 @@ use bytes::Bytes; use camino::Utf8Path; use enumset::EnumSet; use fail::fail_point; -use futures::stream::StreamExt; use once_cell::sync::Lazy; use pageserver_api::{ key::AUX_FILES_KEY, @@ -37,6 +36,7 @@ use tracing::*; use utils::{ bin_ser::BeSer, sync::gate::{Gate, GateGuard}, + vec_map::VecMap, }; use std::ops::{Deref, Range}; @@ -2442,7 +2442,7 @@ impl Timeline { let guard = self.layers.read().await; - let resident = guard.resident_layers().map(|layer| { + let resident = guard.likely_resident_layers().map(|layer| { let last_activity_ts = layer.access_stats().latest_activity_or_now(); HeatMapLayer::new( @@ -2452,7 +2452,7 @@ impl Timeline { ) }); - let layers = resident.collect().await; + let layers = resident.collect(); Some(HeatMapTimeline::new(self.timeline_id, layers)) } @@ -4302,7 +4302,7 @@ impl Timeline { let mut max_layer_size: Option = None; let resident_layers = guard - .resident_layers() + .likely_resident_layers() .map(|layer| { let file_size = layer.layer_desc().file_size; max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); @@ -4315,8 +4315,7 @@ impl Timeline { relative_last_activity: finite_f32::FiniteF32::ZERO, } }) - .collect() - .await; + .collect(); DiskUsageEvictionInfo { max_layer_size, @@ -4618,16 +4617,15 @@ impl<'a> TimelineWriter<'a> { } } - /// Put a batch keys at the specified Lsns. + /// Put a batch of keys at the specified Lsns. /// - /// The batch should be sorted by Lsn such that it's safe - /// to roll the open layer mid batch. + /// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`]. pub(crate) async fn put_batch( &mut self, - batch: Vec<(Key, Lsn, Value)>, + batch: VecMap, ctx: &RequestContext, ) -> anyhow::Result<()> { - for (key, lsn, val) in batch { + for (lsn, (key, val)) in batch { self.put(key, lsn, &val, ctx).await? } @@ -4713,7 +4711,6 @@ mod tests { .keep_resident() .await .expect("no download => no downloading errors") - .expect("should had been resident") .drop_eviction_guard(); let forever = std::time::Duration::from_secs(120); @@ -4724,7 +4721,7 @@ mod tests { let (first, second) = tokio::join!(first, second); let res = layer.keep_resident().await; - assert!(matches!(res, Ok(None)), "{res:?}"); + assert!(res.is_none(), "{res:?}"); match (first, second) { (Ok(()), Ok(())) => { diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index dd603135d2..dd769d4121 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -225,24 +225,18 @@ impl Timeline { { let guard = self.layers.read().await; let layers = guard.layer_map(); - for hist_layer in layers.iter_historic_layers() { - let hist_layer = guard.get_from_desc(&hist_layer); + for layer in layers.iter_historic_layers() { + let layer = guard.get_from_desc(&layer); // guard against eviction while we inspect it; it might be that eviction_task and // disk_usage_eviction_task both select the same layers to be evicted, and // seemingly free up double the space. both succeeding is of no consequence. - let guard = match hist_layer.keep_resident().await { - Ok(Some(l)) => l, - Ok(None) => continue, - Err(e) => { - // these should not happen, but we cannot make them statically impossible right - // now. - tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}"); - continue; - } - }; - let last_activity_ts = hist_layer.access_stats().latest_activity_or_now(); + if !layer.is_likely_resident() { + continue; + } + + let last_activity_ts = layer.access_stats().latest_activity_or_now(); let no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, @@ -265,9 +259,8 @@ impl Timeline { continue; } }; - let layer = guard.drop_eviction_guard(); + if no_activity_for > p.threshold { - // this could cause a lot of allocations in some cases js.spawn(async move { layer .evict_and_wait(std::time::Duration::from_secs(5)) diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index ebcdcfdb4d..d54dc1642c 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,5 +1,4 @@ use anyhow::{bail, ensure, Context, Result}; -use futures::StreamExt; use pageserver_api::shard::TenantShardId; use std::{collections::HashMap, sync::Arc}; use tracing::trace; @@ -241,29 +240,16 @@ impl LayerManager { layer.delete_on_drop(); } - pub(crate) fn resident_layers(&self) -> impl futures::stream::Stream + '_ { + pub(crate) fn likely_resident_layers(&self) -> impl Iterator + '_ { // for small layer maps, we most likely have all resident, but for larger more are likely // to be evicted assuming lots of layers correlated with longer lifespan. - let layers = self - .layer_map() - .iter_historic_layers() - .map(|desc| self.get_from_desc(&desc)); - - let layers = futures::stream::iter(layers); - - layers.filter_map(|layer| async move { - // TODO(#6028): this query does not really need to see the ResidentLayer - match layer.keep_resident().await { - Ok(Some(layer)) => Some(layer.drop_eviction_guard()), - Ok(None) => None, - Err(e) => { - // these should not happen, but we cannot make them statically impossible right - // now. - tracing::warn!(%layer, "failed to keep the layer resident: {e:#}"); - None - } - } + self.layer_map().iter_historic_layers().filter_map(|desc| { + self.layer_fmgr + .0 + .get(&desc.key()) + .filter(|l| l.is_likely_resident()) + .cloned() }) } diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index c46fd9b3ec..002bf4e2ce 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -549,9 +549,10 @@ walprop_pg_init_standalone_sync_safekeepers(void) static void walprop_sigusr2(SIGNAL_ARGS) { + int save_errno = errno; got_SIGUSR2 = true; - SetLatch(MyLatch); + errno = save_errno; } static void diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index b3a5bf873e..93a1fe85db 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -63,6 +63,7 @@ sha2 = { workspace = true, features = ["asm"] } smol_str.workspace = true smallvec.workspace = true socket2.workspace = true +subtle.workspace = true sync_wrapper.workspace = true task-local-extensions.workspace = true thiserror.workspace = true diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index f26dcb7c9a..45bbad8cb2 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -194,14 +194,7 @@ pub(crate) async fn validate_password_and_exchange( } // perform scram authentication as both client and server to validate the keys AuthSecret::Scram(scram_secret) => { - use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256}; - let sasl_client = ScramSha256::new(password, ChannelBinding::unsupported()); - let outcome = crate::scram::exchange( - &scram_secret, - sasl_client, - crate::config::TlsServerEndPoint::Undefined, - ) - .await?; + let outcome = crate::scram::exchange(&scram_secret, password).await?; let client_key = match outcome { sasl::Outcome::Success(client_key) => client_key, diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index 6186ddde0d..cbb17ebcb7 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -2,14 +2,21 @@ use anyhow::{anyhow, bail}; use hyper::{Body, Request, Response, StatusCode}; use std::{convert::Infallible, net::TcpListener}; use tracing::info; -use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService}; +use utils::http::{ + endpoint::{self, prometheus_metrics_handler, request_span}, + error::ApiError, + json::json_response, + RouterBuilder, RouterService, +}; async fn status_handler(_: Request) -> Result, ApiError> { json_response(StatusCode::OK, "") } fn make_router() -> RouterBuilder { - endpoint::make_router().get("/v1/status", status_handler) + endpoint::make_router() + .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) + .get("/v1/status", status_handler) } pub async fn task_main(http_listener: TcpListener) -> anyhow::Result { diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 5d0340e852..9c3be73612 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -135,9 +135,10 @@ impl TestAuth for NoAuth {} struct Scram(scram::ServerSecret); impl Scram { - fn new(password: &str) -> anyhow::Result { - let secret = - scram::ServerSecret::build(password).context("failed to generate scram secret")?; + async fn new(password: &str) -> anyhow::Result { + let secret = scram::ServerSecret::build(password) + .await + .context("failed to generate scram secret")?; Ok(Scram(secret)) } @@ -284,7 +285,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new(password)?, + Scram::new(password).await?, )); let (_client, _conn) = tokio_postgres::Config::new() @@ -308,7 +309,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new("password")?, + Scram::new("password").await?, )); let (_client, _conn) = tokio_postgres::Config::new() diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index e0c2d836f4..3b760e5dab 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -148,7 +148,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new("password")?, + Scram::new("password").await?, )); let _client_err = tokio_postgres::Config::new() @@ -231,7 +231,7 @@ async fn connect_failure( let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new("password")?, + Scram::new("password").await?, )); let _client_err = tokio_postgres::Config::new() diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs index 1cf8b53e11..0811416ca2 100644 --- a/proxy/src/sasl.rs +++ b/proxy/src/sasl.rs @@ -33,6 +33,9 @@ pub enum Error { #[error("Internal error: missing digest")] MissingBinding, + #[error("could not decode salt: {0}")] + Base64(#[from] base64::DecodeError), + #[error(transparent)] Io(#[from] io::Error), } @@ -55,6 +58,7 @@ impl ReportableError for Error { Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User, Error::BadClientMessage(_) => crate::error::ErrorKind::User, Error::MissingBinding => crate::error::ErrorKind::Service, + Error::Base64(_) => crate::error::ErrorKind::ControlPlane, Error::Io(_) => crate::error::ErrorKind::ClientDisconnect, } } diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index df4b3ec8d7..ed80675f8a 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -56,8 +56,6 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { #[cfg(test)] mod tests { - use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256}; - use crate::sasl::{Mechanism, Step}; use super::{Exchange, ServerSecret}; @@ -114,17 +112,10 @@ mod tests { } async fn run_round_trip_test(server_password: &str, client_password: &str) { - let scram_secret = ServerSecret::build(server_password).unwrap(); - let sasl_client = - ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported()); - - let outcome = super::exchange( - &scram_secret, - sasl_client, - crate::config::TlsServerEndPoint::Undefined, - ) - .await - .unwrap(); + let scram_secret = ServerSecret::build(server_password).await.unwrap(); + let outcome = super::exchange(&scram_secret, client_password.as_bytes()) + .await + .unwrap(); match outcome { crate::sasl::Outcome::Success(_) => {} diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 16575d5d98..682cbe795f 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -2,7 +2,11 @@ use std::convert::Infallible; -use postgres_protocol::authentication::sasl::ScramSha256; +use hmac::{Hmac, Mac}; +use sha2::digest::FixedOutput; +use sha2::{Digest, Sha256}; +use subtle::{Choice, ConstantTimeEq}; +use tokio::task::yield_now; use super::messages::{ ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN, @@ -71,47 +75,71 @@ impl<'a> Exchange<'a> { } } +// copied from +async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] { + let hmac = Hmac::::new_from_slice(str).expect("HMAC is able to accept all key sizes"); + let mut prev = hmac + .clone() + .chain_update(salt) + .chain_update(1u32.to_be_bytes()) + .finalize() + .into_bytes(); + + let mut hi = prev; + + for i in 1..iterations { + prev = hmac.clone().chain_update(prev).finalize().into_bytes(); + + for (hi, prev) in hi.iter_mut().zip(prev) { + *hi ^= prev; + } + // yield every ~250us + // hopefully reduces tail latencies + if i % 1024 == 0 { + yield_now().await + } + } + + hi.into() +} + +// copied from +async fn derive_keys(password: &[u8], salt: &[u8], iterations: u32) -> ([u8; 32], [u8; 32]) { + let salted_password = pbkdf2(password, salt, iterations).await; + + let make_key = |name| { + let key = Hmac::::new_from_slice(&salted_password) + .expect("HMAC is able to accept all key sizes") + .chain_update(name) + .finalize(); + + <[u8; 32]>::from(key.into_bytes()) + }; + + (make_key(b"Client Key"), make_key(b"Server Key")) +} + pub async fn exchange( secret: &ServerSecret, - mut client: ScramSha256, - tls_server_end_point: config::TlsServerEndPoint, + password: &[u8], ) -> sasl::Result> { - use sasl::Step::*; + let salt = base64::decode(&secret.salt_base64)?; + let (client_key, server_key) = derive_keys(password, &salt, secret.iterations).await; + let stored_key: [u8; 32] = Sha256::default() + .chain_update(client_key) + .finalize_fixed() + .into(); - let init = SaslInitial { - nonce: rand::random, - }; + // constant time to not leak partial key match + let valid = stored_key.ct_eq(&secret.stored_key.as_bytes()) + | server_key.ct_eq(&secret.server_key.as_bytes()) + | Choice::from(secret.doomed as u8); - let client_first = std::str::from_utf8(client.message()) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; - let sent = match init.transition(secret, &tls_server_end_point, client_first)? { - Continue(sent, server_first) => { - // `client.update` might perform `pbkdf2(pw)`, best to spawn it in a blocking thread. - // TODO(conrad): take this code from tokio-postgres and make an async-aware pbkdf2 impl - client = tokio::task::spawn_blocking(move || { - client.update(server_first.as_bytes())?; - Ok::(client) - }) - .await - .expect("should not panic while performing password hash")?; - sent - } - Success(x, _) => match x {}, - Failure(msg) => return Ok(sasl::Outcome::Failure(msg)), - }; - - let client_final = std::str::from_utf8(client.message()) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; - let keys = match sent.transition(secret, &tls_server_end_point, client_final)? { - Success(keys, server_final) => { - client.finish(server_final.as_bytes())?; - keys - } - Continue(x, _) => match x {}, - Failure(msg) => return Ok(sasl::Outcome::Failure(msg)), - }; - - Ok(sasl::Outcome::Success(keys)) + if valid.into() { + Ok(sasl::Outcome::Success(super::ScramKey::from(client_key))) + } else { + Ok(sasl::Outcome::Failure("password doesn't match")) + } } impl SaslInitial { diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index fb3c45816e..b46d8c3ab5 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -59,10 +59,8 @@ impl ServerSecret { /// Build a new server secret from the prerequisites. /// XXX: We only use this function in tests. #[cfg(test)] - pub fn build(password: &str) -> Option { - Self::parse(&postgres_protocol::password::scram_sha_256( - password.as_bytes(), - )) + pub async fn build(password: &str) -> Option { + Self::parse(&postgres_protocol::password::scram_sha_256(password.as_bytes()).await) } } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index a0c0c7ca4c..9ce26e6c5d 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -20,7 +20,7 @@ use std::io::Write as _; use tokio::sync::mpsc; use tokio_stream::wrappers::ReceiverStream; use tracing::{info_span, Instrument}; -use utils::http::endpoint::{request_span, ChannelWriter}; +use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWriter}; use crate::debug_dump::TimelineDigestRequest; use crate::receive_wal::WalReceiverState; @@ -515,6 +515,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder router .data(Arc::new(conf)) .data(auth) + .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .get("/v1/status", |r| request_span(r, status_handler)) .put("/v1/failpoints", |r| { request_span(r, move |r| async { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1d30c45278..f8994a8dcc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2150,6 +2150,18 @@ class NeonStorageController(MetricsGetter): shards: list[dict[str, Any]] = body["shards"] return shards + def tenant_describe(self, tenant_id: TenantId): + """ + :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int} + """ + response = self.request( + "GET", + f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}", + headers=self.headers(TokenScope.ADMIN), + ) + response.raise_for_status() + return response.json() + def tenant_shard_split( self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None ) -> list[TenantShardId]: diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 99ec894106..6aebfbc99c 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -626,6 +626,17 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res_json = res.json() return res_json + def timeline_layer_map_info( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ): + log.info(f"Requesting layer map info of tenant {tenant_id}, timeline {timeline_id}") + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer", + ) + self.verbose_error(res) + res_json = res.json() + return res_json + def timeline_checkpoint( self, tenant_id: Union[TenantId, TenantShardId], diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index 48dd84fb06..be56203b26 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -1,3 +1,5 @@ +import json + import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.log_helper import log @@ -79,3 +81,8 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma zenbenchmark.record( "physical/logical ratio", physical_size / logical_size, "", MetricReport.LOWER_IS_BETTER ) + + layer_map_path = env.repo_dir / "layer-map.json" + log.info(f"Writing layer map to {layer_map_path}") + with layer_map_path.open("w") as f: + f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id))) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index cb58c640c3..e6318aff68 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1,5 +1,6 @@ import os import time +from collections import defaultdict from typing import Dict, List, Optional, Union import pytest @@ -13,7 +14,7 @@ from fixtures.neon_fixtures import ( tenant_get_shards, ) from fixtures.remote_storage import s3_storage -from fixtures.types import Lsn, TenantShardId, TimelineId +from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import wait_until from fixtures.workload import Workload from pytest_httpserver import HTTPServer @@ -159,11 +160,20 @@ def test_sharding_split_smoke( neon_env_builder.preserve_database_files = True - env = neon_env_builder.init_start( - initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size + non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024} + + env = neon_env_builder.init_configs(True) + neon_env_builder.start() + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=shard_count, + shard_stripe_size=stripe_size, + placement_policy='{"Attached": 1}', + conf=non_default_tenant_config, ) - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline workload = Workload(env, tenant_id, timeline_id, branch_name="main") workload.init() @@ -223,6 +233,14 @@ def test_sharding_split_smoke( # Before split, old shards exist assert shards_on_disk(old_shard_ids) + # Before split, we have done one reconcile for each shard + assert ( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + == shard_count + ) + env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count) post_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)] @@ -268,39 +286,62 @@ def test_sharding_split_smoke( workload.validate() - # Check that we didn't do any spurious reconciliations. - # Total number of reconciles should have been one per original shard, plus - # one for each shard that was migrated. + # Assert on how many reconciles happened during the process. This is something of an + # implementation detail, but it is useful to detect any bugs that might generate spurious + # extra reconcile iterations. + # + # We'll have: + # - shard_count reconciles for the original setup of the tenant + # - shard_count reconciles for detaching the original secondary locations during split + # - split_shard_count reconciles during shard splitting, for setting up secondaries. + # - shard_count reconciles for the migrations we did to move child shards away from their split location + expect_reconciles = shard_count * 2 + split_shard_count + shard_count reconcile_ok = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} ) - assert reconcile_ok == shard_count + split_shard_count // 2 + assert reconcile_ok == expect_reconciles # Check that no cancelled or errored reconciliations occurred: this test does no # failure injection and should run clean. - assert ( - env.storage_controller.get_metric_value( - "storage_controller_reconcile_complete_total", filter={"status": "cancel"} - ) - is None + cancelled_reconciles = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "cancel"} ) - assert ( - env.storage_controller.get_metric_value( - "storage_controller_reconcile_complete_total", filter={"status": "error"} - ) - is None + errored_reconciles = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "error"} ) + assert cancelled_reconciles is not None and int(cancelled_reconciles) == 0 + assert errored_reconciles is not None and int(errored_reconciles) == 0 env.storage_controller.consistency_check() - # Validate pageserver state - shards_exist: list[TenantShardId] = [] - for pageserver in env.pageservers: - locations = pageserver.http_client().tenant_list_locations() - shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"]) + def get_node_shard_counts(env: NeonEnv, tenant_ids): + total: defaultdict[int, int] = defaultdict(int) + attached: defaultdict[int, int] = defaultdict(int) + for tid in tenant_ids: + for shard in env.storage_controller.tenant_describe(tid)["shards"]: + log.info( + f"{shard['tenant_shard_id']}: attached={shard['node_attached']}, secondary={shard['node_secondary']} " + ) + for node in shard["node_secondary"]: + total[int(node)] += 1 + attached[int(shard["node_attached"])] += 1 + total[int(shard["node_attached"])] += 1 - log.info(f"Shards after split: {shards_exist}") - assert len(shards_exist) == split_shard_count + return total, attached + + def check_effective_tenant_config(): + # Expect our custom tenant configs to have survived the split + for shard in env.storage_controller.tenant_describe(tenant_id)["shards"]: + node = env.get_pageserver(int(shard["node_attached"])) + config = node.http_client().tenant_config(TenantShardId.parse(shard["tenant_shard_id"])) + for k, v in non_default_tenant_config.items(): + assert config.effective_config[k] == v + + # Validate pageserver state: expect every child shard to have an attached and secondary location + (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id]) + assert sum(attached.values()) == split_shard_count + assert sum(total.values()) == split_shard_count * 2 + check_effective_tenant_config() # Ensure post-split pageserver locations survive a restart (i.e. the child shards # correctly wrote config to disk, and the storage controller responds correctly @@ -309,13 +350,11 @@ def test_sharding_split_smoke( pageserver.stop() pageserver.start() - shards_exist = [] - for pageserver in env.pageservers: - locations = pageserver.http_client().tenant_list_locations() - shards_exist.extend(TenantShardId.parse(s[0]) for s in locations["tenant_shards"]) - - log.info("Shards after restart: {shards_exist}") - assert len(shards_exist) == split_shard_count + # Validate pageserver state: expect every child shard to have an attached and secondary location + (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id]) + assert sum(attached.values()) == split_shard_count + assert sum(total.values()) == split_shard_count * 2 + check_effective_tenant_config() workload.validate() @@ -721,9 +760,16 @@ def test_sharding_split_failures( initial_shard_count = 2 split_shard_count = 4 - env = neon_env_builder.init_start(initial_tenant_shard_count=initial_shard_count) - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + + # Create a tenant with secondary locations enabled + env.neon_cli.create_tenant( + tenant_id, timeline_id, shard_count=initial_shard_count, placement_policy='{"Attached":1}' + ) env.storage_controller.allowed_errors.extend( [ @@ -736,6 +782,8 @@ def test_sharding_split_failures( ".*failpoint.*", # Node offline cases will fail to send requests ".*Reconcile error: receive body: error sending request for url.*", + # Node offline cases will fail inside reconciler when detaching secondaries + ".*Reconcile error on shard.*: receive body: error sending request for url.*", ] ) @@ -773,7 +821,8 @@ def test_sharding_split_failures( # will have succeeded: the net result should be to return to a clean state, including # detaching any child shards. def assert_rolled_back(exclude_ps_id=None) -> None: - count = 0 + secondary_count = 0 + attached_count = 0 for ps in env.pageservers: if exclude_ps_id is not None and ps.id == exclude_ps_id: continue @@ -781,13 +830,25 @@ def test_sharding_split_failures( locations = ps.http_client().tenant_list_locations()["tenant_shards"] for loc in locations: tenant_shard_id = TenantShardId.parse(loc[0]) - log.info(f"Shard {tenant_shard_id} seen on node {ps.id}") + log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") assert tenant_shard_id.shard_count == initial_shard_count - count += 1 - assert count == initial_shard_count + if loc[1]["mode"] == "Secondary": + secondary_count += 1 + else: + attached_count += 1 + + if exclude_ps_id is not None: + # For a node failure case, we expect there to be a secondary location + # scheduled on the offline node, so expect one fewer secondary in total + assert secondary_count == initial_shard_count - 1 + else: + assert secondary_count == initial_shard_count + + assert attached_count == initial_shard_count def assert_split_done(exclude_ps_id=None) -> None: - count = 0 + secondary_count = 0 + attached_count = 0 for ps in env.pageservers: if exclude_ps_id is not None and ps.id == exclude_ps_id: continue @@ -795,10 +856,14 @@ def test_sharding_split_failures( locations = ps.http_client().tenant_list_locations()["tenant_shards"] for loc in locations: tenant_shard_id = TenantShardId.parse(loc[0]) - log.info(f"Shard {tenant_shard_id} seen on node {ps.id}") + log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") assert tenant_shard_id.shard_count == split_shard_count - count += 1 - assert count == split_shard_count + if loc[1]["mode"] == "Secondary": + secondary_count += 1 + else: + attached_count += 1 + assert attached_count == split_shard_count + assert secondary_count == split_shard_count def finish_split(): # Having failed+rolled back, we should be able to split again diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index a6b0f76c96..b7488cadd6 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -23,7 +23,7 @@ from fixtures.pageserver.utils import ( ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, s3_storage -from fixtures.types import TenantId, TimelineId +from fixtures.types import TenantId, TenantShardId, TimelineId from fixtures.utils import run_pg_bench_small, wait_until from mypy_boto3_s3.type_defs import ( ObjectTypeDef, @@ -948,3 +948,65 @@ def test_sharding_service_heartbeats( env.storage_controller.consistency_check() wait_until(10, 1, storage_controller_consistent) + + +def test_sharding_service_re_attach(neon_env_builder: NeonEnvBuilder): + """ + Exercise the behavior of the /re-attach endpoint on pageserver startup when + pageservers have a mixture of attached and secondary locations + """ + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + # We'll have two tenants. + tenant_a = TenantId.generate() + env.neon_cli.create_tenant(tenant_a, placement_policy='{"Attached":1}') + tenant_b = TenantId.generate() + env.neon_cli.create_tenant(tenant_b, placement_policy='{"Attached":1}') + + # Each pageserver will have one attached and one secondary location + env.storage_controller.tenant_shard_migrate( + TenantShardId(tenant_a, 0, 0), env.pageservers[0].id + ) + env.storage_controller.tenant_shard_migrate( + TenantShardId(tenant_b, 0, 0), env.pageservers[1].id + ) + + # Hard-fail a pageserver + victim_ps = env.pageservers[1] + survivor_ps = env.pageservers[0] + victim_ps.stop(immediate=True) + + # Heatbeater will notice it's offline, and consequently attachments move to the other pageserver + def failed_over(): + locations = survivor_ps.http_client().tenant_list_locations()["tenant_shards"] + log.info(f"locations: {locations}") + assert len(locations) == 2 + assert all(loc[1]["mode"] == "AttachedSingle" for loc in locations) + + # We could pre-empty this by configuring the node to Offline, but it's preferable to test + # the realistic path we would take when a node restarts uncleanly. + # The delay here will be ~NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL in neon_local + wait_until(30, 1, failed_over) + + reconciles_before_restart = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + + # Restart the failed pageserver + victim_ps.start() + + # We expect that the re-attach call correctly tipped off the pageserver that its locations + # are all secondaries now. + locations = victim_ps.http_client().tenant_list_locations()["tenant_shards"] + assert len(locations) == 2 + assert all(loc[1]["mode"] == "Secondary" for loc in locations) + + # We expect that this situation resulted from the re_attach call, and not any explicit + # Reconciler runs: assert that the reconciliation count has not gone up since we restarted. + reconciles_after_restart = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + assert reconciles_after_restart == reconciles_before_restart