Compare commits

...

6 Commits

Author SHA1 Message Date
Joonas Koivunen
57a9d7d1e2 wip 2023-09-27 09:29:00 +00:00
Joonas Koivunen
fcc3da7642 refactor: unify error contexts not to have failed to 2023-09-27 09:29:00 +00:00
Joonas Koivunen
8449572c0f fix: switch to formatting just a string
cloning all of the paths twice seems a bit too much.
2023-09-27 09:29:00 +00:00
Joonas Koivunen
45582a0ec4 DownloadError: align Display with anyhow impl 2023-09-27 09:29:00 +00:00
Joonas Koivunen
c3a027b8b4 detect simulated errors, never print them full 2023-09-27 09:29:00 +00:00
Joonas Koivunen
4feb12548b use a pub type for simulated errors 2023-09-27 09:29:00 +00:00
4 changed files with 77 additions and 19 deletions

View File

@@ -25,7 +25,11 @@ use tokio::io;
use toml_edit::Item;
use tracing::info;
pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket, simulate_failures::UnreliableWrapper};
pub use self::{
local_fs::LocalFs,
s3_bucket::S3Bucket,
simulate_failures::{SimulatedError, UnreliableWrapper},
};
/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
@@ -190,26 +194,44 @@ impl Debug for Download {
#[derive(Debug)]
pub enum DownloadError {
/// Validation or other error happened due to user input.
///
/// This is only used by LOCAL_FS.
BadInput(anyhow::Error),
/// The file was not found in the remote storage.
///
/// This can only happen during download, never during delete.
NotFound,
/// The file was found in the remote storage, but the download failed.
/// The file was found in the remote storage, but the operation failed.
///
/// The error should have context already describing the real failed operation.
Other(anyhow::Error),
}
impl std::fmt::Display for DownloadError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use DownloadError::*;
match self {
DownloadError::BadInput(e) => {
write!(f, "Failed to download a remote file due to user input: {e}")
}
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
NotFound => write!(f, "No file found for the remote object id given"),
// this is same as thiserror error(transparent); it handles {} and {:#}
Other(e) | BadInput(e) => std::fmt::Display::fmt(e, f),
}
}
}
impl std::error::Error for DownloadError {}
impl std::error::Error for DownloadError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
use DownloadError::*;
match self {
NotFound => None,
Other(_) | BadInput(_) => {
// TODO: these are anyhow, cannot return here
None
}
}
}
}
/// Every storage, currently supported.
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.

View File

@@ -345,7 +345,7 @@ impl RemoteStorage for S3Bucket {
.set_max_keys(self.max_keys_per_list_response)
.send()
.await
.context("Failed to list S3 prefixes")
.context("list S3 prefixes")
.map_err(DownloadError::Other);
let started_at = ScopeGuard::into_inner(started_at);
@@ -397,7 +397,7 @@ impl RemoteStorage for S3Bucket {
.set_max_keys(self.max_keys_per_list_response)
.send()
.await
.context("Failed to list files in S3 bucket");
.context("list files in S3 bucket");
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
@@ -521,10 +521,7 @@ impl RemoteStorage for S3Bucket {
.deleted_objects_total
.inc_by(chunk.len() as u64);
if let Some(errors) = resp.errors {
return Err(anyhow::format_err!(
"Failed to delete {} objects",
errors.len()
));
return Err(anyhow::anyhow!("delete {} objects", errors.len()));
}
}
Err(e) => {

View File

@@ -18,7 +18,7 @@ pub struct UnreliableWrapper {
}
/// Used to identify retries of different unique operation.
#[derive(Debug, Hash, Eq, PartialEq)]
#[derive(Hash, Eq, PartialEq)]
enum RemoteOp {
ListPrefixes(Option<RemotePath>),
Upload(RemotePath),
@@ -27,6 +27,22 @@ enum RemoteOp {
DeleteObjects(Vec<RemotePath>),
}
impl std::fmt::Debug for RemoteOp {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use RemoteOp::*;
match self {
ListPrefixes(arg0) => f.debug_tuple("ListPrefixes").field(arg0).finish(),
Upload(arg0) => f.debug_tuple("Upload").field(arg0).finish(),
Download(arg0) => f.debug_tuple("Download").field(arg0).finish(),
Delete(arg0) => f.debug_tuple("Delete").field(arg0).finish(),
DeleteObjects(many) if many.len() > 3 => {
write!(f, "DeleteObjects({} paths)", many.len())
}
DeleteObjects(few) => f.debug_tuple("DeleteObjects").field(few).finish(),
}
}
}
impl UnreliableWrapper {
pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
assert!(attempts_to_fail > 0);
@@ -59,13 +75,12 @@ impl UnreliableWrapper {
e.remove();
Ok(attempts_before_this)
} else {
let error =
anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
let error = anyhow::anyhow!(SimulatedError::from(e.key()));
Err(DownloadError::Other(error))
}
}
Entry::Vacant(e) => {
let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
let error = anyhow::anyhow!(SimulatedError::from(e.key()));
e.insert(1);
Err(DownloadError::Other(error))
}
@@ -80,6 +95,26 @@ impl UnreliableWrapper {
}
}
/// `pub` type for checking if this is the root cause around logging.
///
/// This is just a string to avoid cloning a huge number of paths a second time.
#[derive(Debug)]
pub struct SimulatedError(String);
impl<'a> From<&'a RemoteOp> for SimulatedError {
fn from(value: &'_ RemoteOp) -> Self {
SimulatedError(format!("simulated failure of remote operation {:?}", value))
}
}
impl std::fmt::Display for SimulatedError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(&self.0)
}
}
impl std::error::Error for SimulatedError {}
#[async_trait::async_trait]
impl RemoteStorage for UnreliableWrapper {
async fn list_prefixes(

View File

@@ -1161,7 +1161,11 @@ impl RemoteTimelineClient {
// at info level at first, and only WARN if the operation fails repeatedly.
//
// (See similar logic for downloads in `download::download_retry`)
if retries < FAILED_UPLOAD_WARN_THRESHOLD {
let is_simulated = cfg!(feature = "testing")
&& e.root_cause().is::<remote_storage::SimulatedError>();
if retries < FAILED_UPLOAD_WARN_THRESHOLD || is_simulated {
info!(
"failed to perform remote task {}, will retry (attempt {}): {:#}",
task.op, retries, e