diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 2e79698087..ea8d9858c3 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -6,11 +6,13 @@ use std::collections::HashSet; use std::future::Future; use std::path::Path; +use std::time::Duration; use anyhow::{anyhow, Context}; use tokio::fs; use tokio::io::AsyncWriteExt; -use tracing::{error, info, warn}; + +use tracing::{info, warn}; use crate::config::PageServerConf; use crate::tenant::storage_layer::LayerFileName; @@ -26,6 +28,8 @@ async fn fsync_path(path: impl AsRef) -> Result<(), std::io::Er fs::File::open(path).await?.sync_all().await } +static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120); + /// /// If 'metadata' is given, we will validate that the downloaded file's size matches that /// in the metadata. (In the future, we might do more cross-checks, like CRC validation) @@ -64,22 +68,28 @@ pub async fn download_layer_file<'a>( // TODO: this doesn't use the cached fd for some reason? let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| { format!( - "Failed to create a destination file for layer '{}'", + "create a destination file for layer '{}'", temp_file_path.display() ) }) .map_err(DownloadError::Other)?; let mut download = storage.download(&remote_path).await.with_context(|| { format!( - "Failed to open a download stream for layer with remote storage path '{remote_path:?}'" + "open a download stream for layer with remote storage path '{remote_path:?}'" ) }) .map_err(DownloadError::Other)?; - let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| { - format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}") - }) - .map_err(DownloadError::Other)?; + + let bytes_amount = tokio::time::timeout(MAX_DOWNLOAD_DURATION, tokio::io::copy(&mut download.download_stream, &mut destination_file)) + .await + .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))? + .with_context(|| { + format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}") + }) + .map_err(DownloadError::Other)?; + Ok((destination_file, bytes_amount)) + }, &format!("download {remote_path:?}"), ).await?; @@ -300,7 +310,7 @@ where } Err(DownloadError::Other(ref err)) => { // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up. - error!("{description} still failed after {attempts} retries, giving up: {err:?}"); + warn!("{description} still failed after {attempts} retries, giving up: {err:?}"); return result; } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8b24fd6ecd..c304791ee2 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3819,7 +3819,7 @@ impl Timeline { remote_layer.ongoing_download.close(); } else { // Keep semaphore open. We'll drop the permit at the end of the function. - info!("on-demand download failed: {:?}", result.as_ref().unwrap_err()); + error!("on-demand download failed: {:?}", result.as_ref().unwrap_err()); } // Don't treat it as an error if the task that triggered the download