From e1273acdb1e018352d97ea11bf65adf90db69c78 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 30 Jan 2025 22:43:36 +0000 Subject: [PATCH] pageserver: handle shutdown cleanly in layer download API (#10598) ## Problem This API is used in tests and occasionally for support. It cast all errors to 500. That can cause a failure on the log checks: https://neon-github-public-dev.s3.amazonaws.com/reports/main/13056992876/index.html#suites/ad9c266207b45eafe19909d1020dd987/683a7031d877f3db/ ## Summary of changes - Avoid using generic anyhow::Error for layer downloads - Map shutdown cases to 503 in http route --- pageserver/src/http/routes.rs | 8 +++++++- pageserver/src/tenant/storage_layer/layer.rs | 2 +- pageserver/src/tenant/timeline.rs | 12 ++++++++++-- test_runner/regress/test_ondemand_download.py | 19 +++++++------------ 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2548a11b2e..eb9cb4da0c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1472,7 +1472,13 @@ async fn layer_download_handler( let downloaded = timeline .download_layer(&layer_name) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| match e { + tenant::storage_layer::layer::DownloadError::TimelineShutdown + | tenant::storage_layer::layer::DownloadError::DownloadCancelled => { + ApiError::ShuttingDown + } + other => ApiError::InternalServerError(other.into()), + })?; match downloaded { Some(true) => json_response(StatusCode::OK, ()), diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 99e0ff1aa5..92313afba7 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -340,7 +340,7 @@ impl Layer { /// Download the layer if evicted. /// /// Will not error when the layer is already downloaded. - pub(crate) async fn download(&self) -> anyhow::Result<()> { + pub(crate) async fn download(&self) -> Result<(), DownloadError> { self.0.get_or_maybe_download(true, None).await?; Ok(()) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f387c81c29..827601fa8b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2028,8 +2028,16 @@ impl Timeline { pub(crate) async fn download_layer( &self, layer_file_name: &LayerName, - ) -> anyhow::Result> { - let Some(layer) = self.find_layer(layer_file_name).await? else { + ) -> Result, super::storage_layer::layer::DownloadError> { + let Some(layer) = self + .find_layer(layer_file_name) + .await + .map_err(|e| match e { + layer_manager::Shutdown => { + super::storage_layer::layer::DownloadError::TimelineShutdown + } + })? + else { return Ok(None); }; diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 028d1c2e49..c344f30f4d 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -27,6 +27,7 @@ from fixtures.pageserver.utils import ( ) from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import query_scalar, wait_until +from urllib3 import Retry if TYPE_CHECKING: from typing import Any @@ -676,16 +677,14 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu "compaction_period": "0s", } ) - client = env.pageserver.http_client() + + # Disable retries, because we'll hit code paths that can give us + # 503 and want to see that directly + client = env.pageserver.http_client(retries=Retry(status=0)) + failpoint = "before-downloading-layer-stream-pausable" client.configure_failpoints((failpoint, "pause")) - env.pageserver.allowed_errors.extend( - [ - ".*downloading failed, possibly for shutdown.*", - ] - ) - info = client.layer_map_info(env.initial_tenant, env.initial_timeline) assert len(info.delta_layers()) == 1 @@ -720,13 +719,9 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu client.configure_failpoints((failpoint, "off")) - with pytest.raises( - PageserverApiException, match="downloading failed, possibly for shutdown" - ): + with pytest.raises(PageserverApiException, match="Shutting down"): download.result() - env.pageserver.assert_log_contains(".*downloading failed, possibly for shutdown.*") - detach.result() client.configure_failpoints((failpoint, "pause"))