From 9787227c35d6e79b8b8328dc39b1a5592441fcc4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 2 Jun 2023 08:28:13 -0400 Subject: [PATCH] Shield HTTP request handlers from async cancellations. (#4314) We now spawn a new task for every HTTP request, and wait on the JoinHandle. If Hyper drops the Future, the spawned task will keep running. This protects the rest of the pageserver code from unexpected async cancellations. This creates a CancellationToken for each request and passes it to the handler function. If the HTTP request is dropped by the client, the CancellationToken is signaled. None of the handler functions make use for the CancellationToken currently, but they now they could. The CancellationToken arguments also work like documentation. When you're looking at a function signature and you see that it takes a CancellationToken as argument, it's a nice hint that the function might run for a long time, and won't be async cancelled. The default assumption in the pageserver is now that async functions are not cancellation-safe anyway, unless explictly marked as such, but this is a nice extra reminder. Spawning a task for each request is OK from a performance point of view because spawning is very cheap in Tokio, and none of our HTTP requests are very performance critical anyway. Fixes issue #3478 --- docs/pageserver-thread-mgmt.md | 4 +- libs/utils/src/http/endpoint.rs | 6 + pageserver/src/http/routes.rs | 319 ++++++++++++++------ pageserver/src/tenant/mgr.rs | 1 - test_runner/regress/test_timeline_delete.py | 24 +- 5 files changed, 250 insertions(+), 104 deletions(-) diff --git a/docs/pageserver-thread-mgmt.md b/docs/pageserver-thread-mgmt.md index 0cc897f154..b911933528 100644 --- a/docs/pageserver-thread-mgmt.md +++ b/docs/pageserver-thread-mgmt.md @@ -52,9 +52,7 @@ completion, or shield the rest of the code from surprise cancellations by spawning a separate task. The code that handles incoming HTTP requests, for example, spawns a separate task for each request, because Hyper will drop the request-handling Future if the HTTP -connection is lost. (FIXME: our HTTP handlers do not do that -currently, but we should fix that. See [issue -3478](https://github.com/neondatabase/neon/issues/3478)). +connection is lost. #### How to cancel, then? diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index db3642b507..7cb96d9094 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -40,6 +40,12 @@ struct RequestId(String); /// /// This also handles errors, logging them and converting them to an HTTP error response. /// +/// NB: If the client disconnects, Hyper will drop the Future, without polling it to +/// completion. In other words, the handler must be async cancellation safe! request_span +/// prints a warning to the log when that happens, so that you have some trace of it in +/// the log. +/// +/// /// There could be other ways to implement similar functionality: /// /// * procmacros placed on top of all handler methods diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 61028e23fe..22dedbe5b2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,3 +1,6 @@ +//! +//! Management HTTP API +//! use std::collections::HashMap; use std::sync::Arc; @@ -46,7 +49,6 @@ use utils::{ }; // Imports only used for testing APIs -#[cfg(feature = "testing")] use super::models::ConfigureFailpointsRequest; struct State { @@ -290,13 +292,19 @@ fn build_timeline_info_common( } // healthcheck handler -async fn status_handler(request: Request) -> Result, ApiError> { +async fn status_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { check_permission(&request, None)?; let config = get_config(&request); json_response(StatusCode::OK, StatusResponse { id: config.id }) } -async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { +async fn timeline_create_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let request_data: TimelineCreateRequest = json_request(&mut request).await?; check_permission(&request, Some(tenant_id))?; @@ -332,7 +340,10 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, ApiError> { +async fn timeline_list_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let include_non_incremental_logical_size: Option = parse_query_param(&request, "include-non-incremental-logical-size")?; @@ -366,7 +377,10 @@ async fn timeline_list_handler(request: Request) -> Result, json_response(StatusCode::OK, response_data) } -async fn timeline_detail_handler(request: Request) -> Result, ApiError> { +async fn timeline_detail_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let include_non_incremental_logical_size: Option = @@ -400,7 +414,10 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result, ApiError> { +async fn get_lsn_by_timestamp_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -424,7 +441,10 @@ async fn get_lsn_by_timestamp_handler(request: Request) -> Result) -> Result, ApiError> { +async fn tenant_attach_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -460,7 +480,10 @@ async fn tenant_attach_handler(mut request: Request) -> Result) -> Result, ApiError> { +async fn timeline_delete_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; @@ -474,7 +497,10 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, ApiError> { +async fn tenant_detach_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let detach_ignored: Option = parse_query_param(&request, "detach_ignored")?; @@ -488,7 +514,10 @@ async fn tenant_detach_handler(request: Request) -> Result, json_response(StatusCode::OK, ()) } -async fn tenant_load_handler(request: Request) -> Result, ApiError> { +async fn tenant_load_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -508,7 +537,10 @@ async fn tenant_load_handler(request: Request) -> Result, A json_response(StatusCode::ACCEPTED, ()) } -async fn tenant_ignore_handler(request: Request) -> Result, ApiError> { +async fn tenant_ignore_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -521,7 +553,10 @@ async fn tenant_ignore_handler(request: Request) -> Result, json_response(StatusCode::OK, ()) } -async fn tenant_list_handler(request: Request) -> Result, ApiError> { +async fn tenant_list_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { check_permission(&request, None)?; let response_data = mgr::list_tenants() @@ -541,7 +576,10 @@ async fn tenant_list_handler(request: Request) -> Result, A json_response(StatusCode::OK, response_data) } -async fn tenant_status(request: Request) -> Result, ApiError> { +async fn tenant_status( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -581,7 +619,10 @@ async fn tenant_status(request: Request) -> Result, ApiErro /// Note: we don't update the cached size and prometheus metric here. /// The retention period might be different, and it's nice to have a method to just calculate it /// without modifying anything anyway. -async fn tenant_size_handler(request: Request) -> Result, ApiError> { +async fn tenant_size_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let inputs_only: Option = parse_query_param(&request, "inputs_only")?; @@ -646,7 +687,10 @@ async fn tenant_size_handler(request: Request) -> Result, A ) } -async fn layer_map_info_handler(request: Request) -> Result, ApiError> { +async fn layer_map_info_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let reset: LayerAccessStatsReset = @@ -660,7 +704,10 @@ async fn layer_map_info_handler(request: Request) -> Result json_response(StatusCode::OK, layer_map_info) } -async fn layer_download_handler(request: Request) -> Result, ApiError> { +async fn layer_download_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -683,7 +730,10 @@ async fn layer_download_handler(request: Request) -> Result } } -async fn evict_timeline_layer_handler(request: Request) -> Result, ApiError> { +async fn evict_timeline_layer_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -761,7 +811,10 @@ pub fn html_response(status: StatusCode, data: String) -> Result, Ok(response) } -async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { +async fn tenant_create_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let request_data: TenantCreateRequest = json_request(&mut request).await?; let target_tenant_id = request_data.new_tenant_id; check_permission(&request, None)?; @@ -808,7 +861,10 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result, ApiError> { +async fn get_tenant_config_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -834,6 +890,7 @@ async fn get_tenant_config_handler(request: Request) -> Result, + _cancel: CancellationToken, ) -> Result, ApiError> { let request_data: TenantConfigRequest = json_request(&mut request).await?; let tenant_id = request_data.tenant_id; @@ -851,8 +908,10 @@ async fn update_tenant_config_handler( } /// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`]. -#[cfg(feature = "testing")] -async fn handle_tenant_break(r: Request) -> Result, ApiError> { +async fn handle_tenant_break( + r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?; let tenant = crate::tenant::mgr::get_tenant(tenant_id, true) @@ -864,8 +923,10 @@ async fn handle_tenant_break(r: Request) -> Result, ApiErro json_response(StatusCode::OK, ()) } -#[cfg(feature = "testing")] -async fn failpoints_handler(mut request: Request) -> Result, ApiError> { +async fn failpoints_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { if !fail::has_failpoints() { return Err(ApiError::BadRequest(anyhow!( "Cannot manage failpoints because pageserver was compiled without failpoints support" @@ -898,7 +959,10 @@ async fn failpoints_handler(mut request: Request) -> Result } // Run GC immediately on given timeline. -async fn timeline_gc_handler(mut request: Request) -> Result, ApiError> { +async fn timeline_gc_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; @@ -917,8 +981,10 @@ async fn timeline_gc_handler(mut request: Request) -> Result) -> Result, ApiError> { +async fn timeline_compact_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; @@ -939,8 +1005,10 @@ async fn timeline_compact_handler(request: Request) -> Result) -> Result, ApiError> { +async fn timeline_checkpoint_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; @@ -964,6 +1032,7 @@ async fn timeline_checkpoint_handler(request: Request) -> Result, + _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -979,6 +1048,7 @@ async fn timeline_download_remote_layers_handler_post( async fn timeline_download_remote_layers_handler_get( request: Request, + _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -1002,7 +1072,10 @@ async fn active_timeline_of_active_tenant( .map_err(ApiError::NotFound) } -async fn always_panic_handler(req: Request) -> Result, ApiError> { +async fn always_panic_handler( + req: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { // Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook(). // For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it. // Use catch_unwind to ensure that tokio nor hyper are distracted by our panic. @@ -1013,7 +1086,10 @@ async fn always_panic_handler(req: Request) -> Result, ApiE json_response(StatusCode::NO_CONTENT, ()) } -async fn disk_usage_eviction_run(mut r: Request) -> Result, ApiError> { +async fn disk_usage_eviction_run( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { check_permission(&r, None)?; #[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)] @@ -1103,8 +1179,10 @@ async fn handler_404(_: Request) -> Result, ApiError> { ) } -#[cfg(feature = "testing")] -async fn post_tracing_event_handler(mut r: Request) -> Result, ApiError> { +async fn post_tracing_event_handler( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { #[derive(Debug, serde::Deserialize)] #[serde(rename_all = "lowercase")] enum Level { @@ -1134,6 +1212,85 @@ async fn post_tracing_event_handler(mut r: Request) -> Result(request: Request, handler: H) -> Result, ApiError> +where + R: std::future::Future, ApiError>> + Send + 'static, + H: FnOnce(Request, CancellationToken) -> R + Send + Sync + 'static, +{ + // Spawn a new task to handle the request, to protect the handler from unexpected + // async cancellations. Most pageserver functions are not async cancellation safe. + // We arm a drop-guard, so that if Hyper drops the Future, we signal the task + // with the cancellation token. + let token = CancellationToken::new(); + let cancel_guard = token.clone().drop_guard(); + let result = request_span(request, move |r| async { + let handle = tokio::spawn( + async { + let token_cloned = token.clone(); + let result = handler(r, token).await; + if token_cloned.is_cancelled() { + info!("Cancelled request finished"); + } + result + } + .in_current_span(), + ); + + match handle.await { + Ok(result) => result, + Err(e) => { + // The handler task panicked. We have a global panic handler that logs the + // panic with its backtrace, so no need to log that here. Only log a brief + // message to make it clear that we returned the error to the client. + error!("HTTP request handler task panicked: {e:#}"); + + // Don't return an Error here, because then fallback error handler that was + // installed in make_router() will print the error. Instead, construct the + // HTTP error response and return that. + Ok( + ApiError::InternalServerError(anyhow!("HTTP request handler task panicked")) + .into_response(), + ) + } + } + }) + .await; + + cancel_guard.disarm(); + + result +} + +/// Like api_handler, but returns an error response if the server is built without +/// the 'testing' feature. +async fn testing_api_handler( + desc: &str, + request: Request, + handler: H, +) -> Result, ApiError> +where + R: std::future::Future, ApiError>> + Send + 'static, + H: FnOnce(Request, CancellationToken) -> R + Send + Sync + 'static, +{ + if cfg!(feature = "testing") { + api_handler(request, handler).await + } else { + std::future::ready(Err(ApiError::BadRequest(anyhow!( + "Cannot {desc} because pageserver was compiled without testing APIs", + )))) + .await + } +} + pub fn make_router( conf: &'static PageServerConf, launch_ts: &'static LaunchTimestamp, @@ -1163,26 +1320,6 @@ pub fn make_router( .expect("construct launch timestamp header middleware"), ); - macro_rules! testing_api { - ($handler_desc:literal, $handler:path $(,)?) => {{ - #[cfg(not(feature = "testing"))] - async fn cfg_disabled(_req: Request) -> Result, ApiError> { - Err(ApiError::BadRequest(anyhow!(concat!( - "Cannot ", - $handler_desc, - " because pageserver was compiled without testing APIs", - )))) - } - - #[cfg(feature = "testing")] - let handler = $handler; - #[cfg(not(feature = "testing"))] - let handler = cfg_disabled; - - move |r| request_span(r, handler) - }}; - } - Ok(router .data(Arc::new( State::new( @@ -1194,92 +1331,88 @@ pub fn make_router( ) .context("Failed to initialize router state")?, )) - .get("/v1/status", |r| request_span(r, status_handler)) - .put( - "/v1/failpoints", - testing_api!("manage failpoints", failpoints_handler), - ) - .get("/v1/tenant", |r| request_span(r, tenant_list_handler)) - .post("/v1/tenant", |r| request_span(r, tenant_create_handler)) - .get("/v1/tenant/:tenant_id", |r| request_span(r, tenant_status)) + .get("/v1/status", |r| api_handler(r, status_handler)) + .put("/v1/failpoints", |r| { + testing_api_handler("manage failpoints", r, failpoints_handler) + }) + .get("/v1/tenant", |r| api_handler(r, tenant_list_handler)) + .post("/v1/tenant", |r| api_handler(r, tenant_create_handler)) + .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status)) .get("/v1/tenant/:tenant_id/synthetic_size", |r| { - request_span(r, tenant_size_handler) + api_handler(r, tenant_size_handler) }) .put("/v1/tenant/config", |r| { - request_span(r, update_tenant_config_handler) + api_handler(r, update_tenant_config_handler) }) .get("/v1/tenant/:tenant_id/config", |r| { - request_span(r, get_tenant_config_handler) + api_handler(r, get_tenant_config_handler) }) .get("/v1/tenant/:tenant_id/timeline", |r| { - request_span(r, timeline_list_handler) + api_handler(r, timeline_list_handler) }) .post("/v1/tenant/:tenant_id/timeline", |r| { - request_span(r, timeline_create_handler) + api_handler(r, timeline_create_handler) }) .post("/v1/tenant/:tenant_id/attach", |r| { - request_span(r, tenant_attach_handler) + api_handler(r, tenant_attach_handler) }) .post("/v1/tenant/:tenant_id/detach", |r| { - request_span(r, tenant_detach_handler) + api_handler(r, tenant_detach_handler) }) .post("/v1/tenant/:tenant_id/load", |r| { - request_span(r, tenant_load_handler) + api_handler(r, tenant_load_handler) }) .post("/v1/tenant/:tenant_id/ignore", |r| { - request_span(r, tenant_ignore_handler) + api_handler(r, tenant_ignore_handler) }) .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { - request_span(r, timeline_detail_handler) + api_handler(r, timeline_detail_handler) }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp", - |r| request_span(r, get_lsn_by_timestamp_handler), + |r| api_handler(r, get_lsn_by_timestamp_handler), ) .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| { - request_span(r, timeline_gc_handler) + api_handler(r, timeline_gc_handler) + }) + .put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| { + testing_api_handler("run timeline compaction", r, timeline_compact_handler) }) - .put( - "/v1/tenant/:tenant_id/timeline/:timeline_id/compact", - testing_api!("run timeline compaction", timeline_compact_handler), - ) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", - testing_api!("run timeline checkpoint", timeline_checkpoint_handler), + |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler), ) .post( "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", - |r| request_span(r, timeline_download_remote_layers_handler_post), + |r| api_handler(r, timeline_download_remote_layers_handler_post), ) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", - |r| request_span(r, timeline_download_remote_layers_handler_get), + |r| api_handler(r, timeline_download_remote_layers_handler_get), ) .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { - request_span(r, timeline_delete_handler) + api_handler(r, timeline_delete_handler) }) .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| { - request_span(r, layer_map_info_handler) + api_handler(r, layer_map_info_handler) }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", - |r| request_span(r, layer_download_handler), + |r| api_handler(r, layer_download_handler), ) .delete( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", - |r| request_span(r, evict_timeline_layer_handler), + |r| api_handler(r, evict_timeline_layer_handler), ) .put("/v1/disk_usage_eviction/run", |r| { - request_span(r, disk_usage_eviction_run) + api_handler(r, disk_usage_eviction_run) + }) + .put("/v1/tenant/:tenant_id/break", |r| { + testing_api_handler("set tenant state to broken", r, handle_tenant_break) + }) + .get("/v1/panic", |r| api_handler(r, always_panic_handler)) + .post("/v1/tracing/event", |r| { + testing_api_handler("emit a tracing event", r, post_tracing_event_handler) }) - .put( - "/v1/tenant/:tenant_id/break", - testing_api!("set tenant state to broken", handle_tenant_break), - ) - .get("/v1/panic", |r| request_span(r, always_panic_handler)) - .post( - "/v1/tracing/event", - testing_api!("emit a tracing event", post_tracing_event_handler), - ) .any(handler_404)) } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index d3cd914037..4318749777 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -779,7 +779,6 @@ pub async fn immediate_gc( Ok(wait_task_done) } -#[cfg(feature = "testing")] pub async fn immediate_compact( tenant_id: TenantId, timeline_id: TimelineId, diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 99bf400207..1e15a8e7cb 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -437,12 +437,22 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): wait_until(50, 0.1, got_hangup_log_message) - # ok, retry without failpoint, it should succeed + # check that the timeline is still present + ps_http.timeline_detail(env.initial_tenant, child_timeline_id) + + # ok, disable the failpoint to let the deletion finish ps_http.configure_failpoints((failpoint_name, "off")) - # this should succeed - ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2) - # the second call will try to transition the timeline into Stopping state, but it's already in that state - env.pageserver.allowed_errors.append( - f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping" - ) + def first_request_finished(): + message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished" + assert env.pageserver.log_contains(message) + + wait_until(50, 0.1, first_request_finished) + + # check that the timeline is gone + notfound_message = f"Timeline {env.initial_tenant}/{child_timeline_id} was not found" + env.pageserver.allowed_errors.append(".*" + notfound_message) + with pytest.raises(PageserverApiException, match=notfound_message) as exc: + ps_http.timeline_detail(env.initial_tenant, child_timeline_id) + + assert exc.value.status_code == 404