From b719d58863f90beb46218d1167e4236b05910924 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 17 Sep 2024 09:25:42 +0100 Subject: [PATCH] storcon: forward requests from stepped down instance to the current leader (#8954) ## Problem It turns out that we can't rely on external orchestration to promptly route trafic to the new leader. This is downtime inducing. Forwarding provides a safe way out. ## Safety We forward when: 1. Request is not one of ["/control/v1/step_down", "/status", "/ready", "/metrics"] 2. Current instance is in [`LeadershipStatus::SteppedDown`] state 3. There is a leader in the database to forward to 4. Leader from step (3) is not the current instance If a storcon instance is persisted in the database, then we know that it is the current leader. There's one exception: time between handling step-down request and the new leader updating the database. Let's treat the happy case first. The stepped down node does not produce any side effects, since all request handling happens on the leader. As for the edge case, we are guaranteed to always have a maximum of two running instances. Hence, if we are in the edge case scenario the leader persisted in the database is the stepped down instance that received the request. Condition (4) above covers this scenario. ## Summary of changes * Conversion utilities for reqwest <-> hyper. I'm not happy with these, but I don't see a better way. Open to suggestions. * Add request forwarding logic * Update each request handler. Again, not happy with this. If anyone knows a nice to wrap the handlers, lmk. Me and Joonas tried :/ * Update each handler to maybe forward * Tweak tests to showcase new behaviour --- storage_controller/src/http.rs | 620 +++++++++++++++++- .../regress/test_storage_controller.py | 22 +- 2 files changed, 607 insertions(+), 35 deletions(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index a6638f5191..1745bf5575 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1,10 +1,11 @@ +use crate::http; use crate::metrics::{ HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup, METRICS_REGISTRY, }; use crate::persistence::SafekeeperPersistence; use crate::reconciler::ReconcileError; -use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT}; +use crate::service::{LeadershipStatus, Service, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT}; use anyhow::Context; use futures::Future; use hyper::header::CONTENT_TYPE; @@ -22,6 +23,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::TenantShardId; use pageserver_client::{mgmt_api, BlockUnblock}; +use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; @@ -87,9 +89,16 @@ fn get_state(request: &Request) -> &HttpState { } /// Pageserver calls into this on startup, to learn which tenants it should attach -async fn handle_re_attach(mut req: Request) -> Result, ApiError> { +async fn handle_re_attach(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::GenerationsApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let reattach_req = json_request::(&mut req).await?; let state = get_state(&req); json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?) @@ -97,9 +106,16 @@ async fn handle_re_attach(mut req: Request) -> Result, ApiE /// Pageserver calls into this before doing deletions, to confirm that it still /// holds the latest generation for the tenants with deletions enqueued -async fn handle_validate(mut req: Request) -> Result, ApiError> { +async fn handle_validate(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::GenerationsApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let validate_req = json_request::(&mut req).await?; let state = get_state(&req); json_response(StatusCode::OK, state.service.validate(validate_req).await?) @@ -108,9 +124,16 @@ async fn handle_validate(mut req: Request) -> Result, ApiEr /// Call into this before attaching a tenant to a pageserver, to acquire a generation number /// (in the real control plane this is unnecessary, because the same program is managing /// generation numbers and doing attachments). -async fn handle_attach_hook(mut req: Request) -> Result, ApiError> { +async fn handle_attach_hook(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let attach_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -124,9 +147,16 @@ async fn handle_attach_hook(mut req: Request) -> Result, Ap ) } -async fn handle_inspect(mut req: Request) -> Result, ApiError> { +async fn handle_inspect(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let inspect_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -136,10 +166,17 @@ async fn handle_inspect(mut req: Request) -> Result, ApiErr async fn handle_tenant_create( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let create_req = json_request::(&mut req).await?; json_response( @@ -150,11 +187,18 @@ async fn handle_tenant_create( async fn handle_tenant_location_config( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let config_req = json_request::(&mut req).await?; json_response( StatusCode::OK, @@ -166,10 +210,17 @@ async fn handle_tenant_location_config( async fn handle_tenant_config_set( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let config_req = json_request::(&mut req).await?; json_response(StatusCode::OK, service.tenant_config_set(config_req).await?) @@ -182,16 +233,30 @@ async fn handle_tenant_config_get( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?) } async fn handle_tenant_time_travel_remote_storage( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let time_travel_req = json_request::(&mut req).await?; let timestamp_raw = must_get_query_param(&req, "travel_to")?; @@ -232,6 +297,13 @@ async fn handle_tenant_secondary_download( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis); + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?; json_response(map_reqwest_hyper_status(status)?, progress) } @@ -243,6 +315,13 @@ async fn handle_tenant_delete( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + let status_code = service .tenant_delete(tenant_id) .await @@ -258,11 +337,18 @@ async fn handle_tenant_delete( async fn handle_tenant_timeline_create( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let create_req = json_request::(&mut req).await?; json_response( StatusCode::CREATED, @@ -277,9 +363,16 @@ async fn handle_tenant_timeline_delete( req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + check_permissions(&req, Scope::PageServerApi)?; - let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; // For timeline deletions, which both implement an "initially return 202, then 404 once // we're done" semantic, we wrap with a retry loop to expose a simpler API upstream. @@ -337,12 +430,19 @@ async fn handle_tenant_timeline_delete( async fn handle_tenant_timeline_archival_config( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + check_permissions(&req, Scope::PageServerApi)?; - let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; let create_req = json_request::(&mut req).await?; @@ -358,9 +458,16 @@ async fn handle_tenant_timeline_detach_ancestor( req: Request, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + check_permissions(&req, Scope::PageServerApi)?; - let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; let res = service .tenant_timeline_detach_ancestor(tenant_id, timeline_id) @@ -393,6 +500,13 @@ async fn handle_tenant_timeline_passthrough( let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let Some(path) = req.uri().path_and_query() else { // This should never happen, our request router only calls us if there is a path return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path"))); @@ -460,9 +574,17 @@ async fn handle_tenant_locate( service: Arc, req: Request, ) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::Admin)?; - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, service.tenant_locate(tenant_id)?) } @@ -473,6 +595,14 @@ async fn handle_tenant_describe( check_permissions(&req, Scope::Scrubber)?; let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) } @@ -482,12 +612,26 @@ async fn handle_tenant_list( ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, service.tenant_list()) } -async fn handle_node_register(mut req: Request) -> Result, ApiError> { +async fn handle_node_register(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let register_req = json_request::(&mut req).await?; let state = get_state(&req); state.service.node_register(register_req).await?; @@ -497,6 +641,13 @@ async fn handle_node_register(mut req: Request) -> Result, async fn handle_node_list(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let nodes = state.service.node_list().await?; let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::>(); @@ -507,6 +658,13 @@ async fn handle_node_list(req: Request) -> Result, ApiError async fn handle_node_drop(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; json_response(StatusCode::OK, state.service.node_drop(node_id).await?) @@ -515,14 +673,28 @@ async fn handle_node_drop(req: Request) -> Result, ApiError async fn handle_node_delete(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; json_response(StatusCode::OK, state.service.node_delete(node_id).await?) } -async fn handle_node_configure(mut req: Request) -> Result, ApiError> { +async fn handle_node_configure(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let node_id: NodeId = parse_request_param(&req, "node_id")?; let config_req = json_request::(&mut req).await?; if node_id != config_req.node_id { @@ -548,6 +720,13 @@ async fn handle_node_configure(mut req: Request) -> Result, async fn handle_node_status(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ -570,6 +749,13 @@ async fn handle_node_shards(req: Request) -> Result, ApiErr async fn handle_get_leader(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let leader = state.service.get_leader().await.map_err(|err| { ApiError::InternalServerError(anyhow::anyhow!( @@ -583,6 +769,13 @@ async fn handle_get_leader(req: Request) -> Result, ApiErro async fn handle_node_drain(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ -594,6 +787,13 @@ async fn handle_node_drain(req: Request) -> Result, ApiErro async fn handle_cancel_node_drain(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ -605,6 +805,13 @@ async fn handle_cancel_node_drain(req: Request) -> Result, async fn handle_node_fill(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ -616,6 +823,13 @@ async fn handle_node_fill(req: Request) -> Result, ApiError async fn handle_cancel_node_fill(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let node_id: NodeId = parse_request_param(&req, "node_id")?; @@ -624,9 +838,16 @@ async fn handle_cancel_node_fill(req: Request) -> Result, A json_response(StatusCode::ACCEPTED, ()) } -async fn handle_metadata_health_update(mut req: Request) -> Result, ApiError> { +async fn handle_metadata_health_update(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Scrubber)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let update_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -640,6 +861,13 @@ async fn handle_metadata_health_list_unhealthy( ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?; @@ -652,10 +880,17 @@ async fn handle_metadata_health_list_unhealthy( } async fn handle_metadata_health_list_outdated( - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let list_outdated_req = json_request::(&mut req).await?; let state = get_state(&req); let health_records = state @@ -671,10 +906,17 @@ async fn handle_metadata_health_list_outdated( async fn handle_tenant_shard_split( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let split_req = json_request::(&mut req).await?; @@ -686,10 +928,17 @@ async fn handle_tenant_shard_split( async fn handle_tenant_shard_migrate( service: Arc, - mut req: Request, + req: Request, ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; let migrate_req = json_request::(&mut req).await?; json_response( @@ -700,9 +949,16 @@ async fn handle_tenant_shard_migrate( ) } -async fn handle_tenant_update_policy(mut req: Request) -> Result, ApiError> { +async fn handle_tenant_update_policy(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let update_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -716,9 +972,16 @@ async fn handle_tenant_update_policy(mut req: Request) -> Result) -> Result, ApiError> { +async fn handle_update_preferred_azs(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let azs_req = json_request::(&mut req).await?; let state = get_state(&req); @@ -731,23 +994,46 @@ async fn handle_update_preferred_azs(mut req: Request) -> Result) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); json_response(StatusCode::OK, state.service.step_down().await) } async fn handle_tenant_drop(req: Request) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?) } async fn handle_tenant_import(req: Request) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; check_permissions(&req, Scope::PageServerApi)?; + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); json_response( @@ -759,6 +1045,13 @@ async fn handle_tenant_import(req: Request) -> Result, ApiE async fn handle_tenants_dump(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); state.service.tenants_dump() } @@ -766,6 +1059,13 @@ async fn handle_tenants_dump(req: Request) -> Result, ApiEr async fn handle_scheduler_dump(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); state.service.scheduler_dump() } @@ -773,6 +1073,13 @@ async fn handle_scheduler_dump(req: Request) -> Result, Api async fn handle_consistency_check(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); json_response(StatusCode::OK, state.service.consistency_check().await?) @@ -781,19 +1088,40 @@ async fn handle_consistency_check(req: Request) -> Result, async fn handle_reconcile_all(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); json_response(StatusCode::OK, state.service.reconcile_all_now().await?) } /// Status endpoint is just used for checking that our HTTP listener is up -async fn handle_status(_req: Request) -> Result, ApiError> { +async fn handle_status(req: Request) -> Result, ApiError> { + match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(_req) => {} + }; + json_response(StatusCode::OK, ()) } /// Readiness endpoint indicates when we're done doing startup I/O (e.g. reconciling /// with remote pageserver nodes). This is intended for use as a kubernetes readiness probe. async fn handle_ready(req: Request) -> Result, ApiError> { + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); if state.service.startup_complete.is_ready() { json_response(StatusCode::OK, ()) @@ -816,6 +1144,13 @@ async fn handle_get_safekeeper(req: Request) -> Result, Api let id = parse_request_param::(&req, "id")?; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let res = state.service.get_safekeeper(id).await; @@ -847,6 +1182,13 @@ async fn handle_upsert_safekeeper(mut req: Request) -> Result { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); state.service.upsert_safekeeper(body).await?; @@ -925,10 +1267,7 @@ pub fn prologue_leadership_status_check_middleware< let allowed_routes = match leadership_status { LeadershipStatus::Leader => AllowedRoutes::All, - LeadershipStatus::SteppedDown => { - // TODO: does it make sense to allow /status here? - AllowedRoutes::Some(["/control/v1/step_down", "/status", "/metrics"].to_vec()) - } + LeadershipStatus::SteppedDown => AllowedRoutes::All, LeadershipStatus::Candidate => { AllowedRoutes::Some(["/ready", "/status", "/metrics"].to_vec()) } @@ -1005,6 +1344,13 @@ fn epilogue_metrics_middleware pub async fn measured_metrics_handler(req: Request) -> Result, ApiError> { pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4"; + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + let state = get_state(&req); let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics); let response = Response::builder() @@ -1032,6 +1378,220 @@ where request_span(request, handler).await } +enum ForwardOutcome { + Forwarded(Result, ApiError>), + NotForwarded(Request), +} + +/// Potentially forward the request to the current storage controler leader. +/// More specifically we forward when: +/// 1. Request is not one of ["/control/v1/step_down", "/status", "/ready", "/metrics"] +/// 2. Current instance is in [`LeadershipStatus::SteppedDown`] state +/// 3. There is a leader in the database to forward to +/// 4. Leader from step (3) is not the current instance +/// +/// Why forward? +/// It turns out that we can't rely on external orchestration to promptly route trafic to the +/// new leader. This is downtime inducing. Forwarding provides a safe way out. +/// +/// Why is it safe? +/// If a storcon instance is persisted in the database, then we know that it is the current leader. +/// There's one exception: time between handling step-down request and the new leader updating the +/// database. +/// +/// Let's treat the happy case first. The stepped down node does not produce any side effects, +/// since all request handling happens on the leader. +/// +/// As for the edge case, we are guaranteed to always have a maximum of two running instances. +/// Hence, if we are in the edge case scenario the leader persisted in the database is the +/// stepped down instance that received the request. Condition (4) above covers this scenario. +async fn maybe_forward(req: Request) -> ForwardOutcome { + const NOT_FOR_FORWARD: [&str; 4] = ["/control/v1/step_down", "/status", "/ready", "/metrics"]; + + let uri = req.uri().to_string(); + let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.as_str()); + + let state = get_state(&req); + let leadership_status = state.service.get_leadership_status(); + + if leadership_status != LeadershipStatus::SteppedDown || !uri_for_forward { + return ForwardOutcome::NotForwarded(req); + } + + let leader = state.service.get_leader().await; + let leader = { + match leader { + Ok(Some(leader)) => leader, + Ok(None) => { + return ForwardOutcome::Forwarded(Err(ApiError::ResourceUnavailable( + "No leader to forward to while in stepped down state".into(), + ))); + } + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError( + anyhow::anyhow!( + "Failed to get leader for forwarding while in stepped down state: {err}" + ), + ))); + } + } + }; + + let cfg = state.service.get_config(); + if let Some(ref self_addr) = cfg.address_for_peers { + let leader_addr = match Uri::from_str(leader.address.as_str()) { + Ok(uri) => uri, + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError( + anyhow::anyhow!( + "Failed to parse leader uri for forwarding while in stepped down state: {err}" + ), + ))); + } + }; + + if *self_addr == leader_addr { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( + "Leader is stepped down instance" + )))); + } + } + + tracing::info!("Forwarding {} to leader at {}", uri, leader.address); + + // Use [`RECONCILE_TIMEOUT`] as the max amount of time a request should block for and + // include some leeway to get the timeout for proxied requests. + const PROXIED_REQUEST_TIMEOUT: Duration = Duration::from_secs(RECONCILE_TIMEOUT.as_secs() + 10); + let client = reqwest::ClientBuilder::new() + .timeout(PROXIED_REQUEST_TIMEOUT) + .build(); + let client = match client { + Ok(client) => client, + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( + "Failed to build leader client for forwarding while in stepped down state: {err}" + )))); + } + }; + + let request: reqwest::Request = match convert_request(req, &client, leader.address).await { + Ok(r) => r, + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( + "Failed to convert request for forwarding while in stepped down state: {err}" + )))); + } + }; + + let response = match client.execute(request).await { + Ok(r) => r, + Err(err) => { + return ForwardOutcome::Forwarded(Err(ApiError::InternalServerError(anyhow::anyhow!( + "Failed to forward while in stepped down state: {err}" + )))); + } + }; + + ForwardOutcome::Forwarded(convert_response(response).await) +} + +/// Convert a [`reqwest::Response`] to a [hyper::Response`] by passing through +/// a stable representation (string, bytes or integer) +/// +/// Ideally, we would not have to do this since both types use the http crate +/// under the hood. However, they use different versions of the crate and keeping +/// second order dependencies in sync is difficult. +async fn convert_response(resp: reqwest::Response) -> Result, ApiError> { + use std::str::FromStr; + + let mut builder = hyper::Response::builder().status(resp.status().as_u16()); + for (key, value) in resp.headers().into_iter() { + let key = hyper::header::HeaderName::from_str(key.as_str()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}")) + })?; + + let value = hyper::header::HeaderValue::from_bytes(value.as_bytes()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}")) + })?; + + builder = builder.header(key, value); + } + + let body = http::Body::wrap_stream(resp.bytes_stream()); + + builder.body(body).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Response conversion failed: {err}")) + }) +} + +/// Convert a [`reqwest::Request`] to a [hyper::Request`] by passing through +/// a stable representation (string, bytes or integer) +/// +/// See [`convert_response`] for why we are doing it this way. +async fn convert_request( + req: hyper::Request, + client: &reqwest::Client, + to_address: String, +) -> Result { + use std::str::FromStr; + + let (parts, body) = req.into_parts(); + let method = reqwest::Method::from_str(parts.method.as_str()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + let path_and_query = parts.uri.path_and_query().ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!( + "Request conversion failed: no path and query" + )) + })?; + + let uri = reqwest::Url::from_str( + format!( + "{}{}", + to_address.trim_end_matches("/"), + path_and_query.as_str() + ) + .as_str(), + ) + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + let mut headers = reqwest::header::HeaderMap::new(); + for (key, value) in parts.headers.into_iter() { + let key = match key { + Some(k) => k, + None => { + continue; + } + }; + + let key = reqwest::header::HeaderName::from_str(key.as_str()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + let value = reqwest::header::HeaderValue::from_bytes(value.as_bytes()).map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + headers.insert(key, value); + } + + let body = hyper::body::to_bytes(body).await.map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + })?; + + client + .request(method, uri) + .headers(headers) + .body(body) + .build() + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!("Request conversion failed: {err}")) + }) +} + pub fn make_router( service: Arc, auth: Option>, diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 2e21f8fb46..2d72dbb2df 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2048,8 +2048,11 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): # Make a change to the tenant config to trigger a slow reconcile virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) virtual_ps_http.patch_tenant_config_client_side(tid, {"compaction_threshold": 5}, None) - env.storage_controller.allowed_errors.append( - ".*Accepted configuration update but reconciliation failed.*" + env.storage_controller.allowed_errors.extend( + [ + ".*Accepted configuration update but reconciliation failed.*", + ".*Leader is stepped down instance", + ] ) observed_state = env.storage_controller.step_down() @@ -2072,9 +2075,9 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): assert "compaction_threshold" in ps_tenant_conf.effective_config assert ps_tenant_conf.effective_config["compaction_threshold"] == 5 - # Validate that the storcon is not replying to the usual requests - # once it has stepped down. - with pytest.raises(StorageControllerApiException, match="stepped_down"): + # Validate that the storcon attempts to forward the request, but stops. + # when it realises it is still the current leader. + with pytest.raises(StorageControllerApiException, match="Leader is stepped down instance"): env.storage_controller.tenant_list() # Validate that we can step down multiple times and the observed state @@ -2221,6 +2224,15 @@ def test_storage_controller_leadership_transfer( env.storage_controller.wait_until_ready() env.storage_controller.consistency_check() + if not step_down_times_out: + # Check that the stepped down instance forwards requests + # to the new leader while it's still running. + storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") + env.storage_controller.tenant_list() + env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"}) + status = env.storage_controller.node_status(env.pageservers[0].id) + assert status["scheduling"] == "Pause" + if step_down_times_out: env.storage_controller.allowed_errors.extend( [