pageserver: handle shutdown cleanly in layer download API (#10598)

## Problem

This API is used in tests and occasionally for support. It casts all
errors to HTTP 500 (Internal Server Error), including ordinary shutdown.

That can cause a failure on the log checks:
https://neon-github-public-dev.s3.amazonaws.com/reports/main/13056992876/index.html#suites/ad9c266207b45eafe19909d1020dd987/683a7031d877f3db/

## Summary of changes

- Avoid using generic anyhow::Error for layer downloads; use a concrete error type
- Map shutdown/cancellation cases to HTTP 503 in the HTTP route
This commit is contained in:
John Spray
2025-01-30 22:43:36 +00:00
committed by GitHub
parent d18f6198e1
commit e1273acdb1
4 changed files with 25 additions and 16 deletions

View File

@@ -1472,7 +1472,13 @@ async fn layer_download_handler(
let downloaded = timeline
.download_layer(&layer_name)
.await
.map_err(ApiError::InternalServerError)?;
.map_err(|e| match e {
tenant::storage_layer::layer::DownloadError::TimelineShutdown
| tenant::storage_layer::layer::DownloadError::DownloadCancelled => {
ApiError::ShuttingDown
}
other => ApiError::InternalServerError(other.into()),
})?;
match downloaded {
Some(true) => json_response(StatusCode::OK, ()),

View File

@@ -340,7 +340,7 @@ impl Layer {
/// Download the layer if evicted.
///
/// Will not error when the layer is already downloaded.
pub(crate) async fn download(&self) -> anyhow::Result<()> {
pub(crate) async fn download(&self) -> Result<(), DownloadError> {
self.0.get_or_maybe_download(true, None).await?;
Ok(())
}

View File

@@ -2028,8 +2028,16 @@ impl Timeline {
pub(crate) async fn download_layer(
&self,
layer_file_name: &LayerName,
) -> anyhow::Result<Option<bool>> {
let Some(layer) = self.find_layer(layer_file_name).await? else {
) -> Result<Option<bool>, super::storage_layer::layer::DownloadError> {
let Some(layer) = self
.find_layer(layer_file_name)
.await
.map_err(|e| match e {
layer_manager::Shutdown => {
super::storage_layer::layer::DownloadError::TimelineShutdown
}
})?
else {
return Ok(None);
};

View File

@@ -27,6 +27,7 @@ from fixtures.pageserver.utils import (
)
from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage
from fixtures.utils import query_scalar, wait_until
from urllib3 import Retry
if TYPE_CHECKING:
from typing import Any
@@ -676,16 +677,14 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu
"compaction_period": "0s",
}
)
client = env.pageserver.http_client()
# Disable retries, because we'll hit code paths that can give us
# 503 and want to see that directly
client = env.pageserver.http_client(retries=Retry(status=0))
failpoint = "before-downloading-layer-stream-pausable"
client.configure_failpoints((failpoint, "pause"))
env.pageserver.allowed_errors.extend(
[
".*downloading failed, possibly for shutdown.*",
]
)
info = client.layer_map_info(env.initial_tenant, env.initial_timeline)
assert len(info.delta_layers()) == 1
@@ -720,13 +719,9 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu
client.configure_failpoints((failpoint, "off"))
with pytest.raises(
PageserverApiException, match="downloading failed, possibly for shutdown"
):
with pytest.raises(PageserverApiException, match="Shutting down"):
download.result()
env.pageserver.assert_log_contains(".*downloading failed, possibly for shutdown.*")
detach.result()
client.configure_failpoints((failpoint, "pause"))