From 7b9057ad0115c721fcdcd2585062576e993fb5a6 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Mon, 6 Mar 2023 18:52:59 +0200 Subject: [PATCH] Add timeout to download copy (#3675) ## Describe your changes Adding a timeout handling for the remote download of layers of 120 seconds for each operation Note that these downloads are being retried for N times ## Issue ticket number and link Fixes: #3672 ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. --------- Co-authored-by: Joonas Koivunen --- .../tenant/remote_timeline_client/download.rs | 26 +++++++++++++------ pageserver/src/tenant/timeline.rs | 2 +- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 2e79698087..ea8d9858c3 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -6,11 +6,13 @@ use std::collections::HashSet; use std::future::Future; use std::path::Path; +use std::time::Duration; use anyhow::{anyhow, Context}; use tokio::fs; use tokio::io::AsyncWriteExt; -use tracing::{error, info, warn}; + +use tracing::{info, warn}; use crate::config::PageServerConf; use crate::tenant::storage_layer::LayerFileName; @@ -26,6 +28,8 @@ async fn fsync_path(path: impl AsRef) -> Result<(), std::io::Er fs::File::open(path).await?.sync_all().await } +static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120); + /// /// If 'metadata' is given, we will validate that the downloaded file's size matches that /// in the metadata. (In the future, we might do more cross-checks, like CRC validation) @@ -64,22 +68,28 @@ pub async fn download_layer_file<'a>( // TODO: this doesn't use the cached fd for some reason? let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| { format!( - "Failed to create a destination file for layer '{}'", + "create a destination file for layer '{}'", temp_file_path.display() ) }) .map_err(DownloadError::Other)?; let mut download = storage.download(&remote_path).await.with_context(|| { format!( - "Failed to open a download stream for layer with remote storage path '{remote_path:?}'" + "open a download stream for layer with remote storage path '{remote_path:?}'" ) }) .map_err(DownloadError::Other)?; - let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| { - format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}") - }) - .map_err(DownloadError::Other)?; + + let bytes_amount = tokio::time::timeout(MAX_DOWNLOAD_DURATION, tokio::io::copy(&mut download.download_stream, &mut destination_file)) + .await + .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))? + .with_context(|| { + format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}") + }) + .map_err(DownloadError::Other)?; + Ok((destination_file, bytes_amount)) + }, &format!("download {remote_path:?}"), ).await?; @@ -300,7 +310,7 @@ where } Err(DownloadError::Other(ref err)) => { // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up. - error!("{description} still failed after {attempts} retries, giving up: {err:?}"); + warn!("{description} still failed after {attempts} retries, giving up: {err:?}"); return result; } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8b24fd6ecd..c304791ee2 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3819,7 +3819,7 @@ impl Timeline { remote_layer.ongoing_download.close(); } else { // Keep semaphore open. We'll drop the permit at the end of the function. - info!("on-demand download failed: {:?}", result.as_ref().unwrap_err()); + error!("on-demand download failed: {:?}", result.as_ref().unwrap_err()); } // Don't treat it as an error if the task that triggered the download