From 8154e887325ede131e0fdb4500269f6b51884ef8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 26 Jul 2024 16:48:44 +0200 Subject: [PATCH] refactor(layer load API): all errors are permanent (#8527) I am not aware of a case of "transient" VirtualFile errors as mentioned in https://github.com/neondatabase/neon/pull/5880 Private DM with Joonas discussing this: https://neondb.slack.com/archives/D049K7HJ9JM/p1721836424615799 --- .../src/tenant/storage_layer/delta_layer.rs | 31 ++++++------- .../src/tenant/storage_layer/image_layer.rs | 26 +++++------ pageserver/src/tenant/storage_layer/layer.rs | 44 ++++++++++--------- test_runner/regress/test_broken_timeline.py | 2 +- 4 files changed, 49 insertions(+), 54 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 586a7b7836..229d1e3608 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -307,12 +307,10 @@ impl DeltaLayer { .with_context(|| format!("Failed to load delta layer {}", self.path())) } - async fn load_inner(&self, ctx: &RequestContext) -> Result> { + async fn load_inner(&self, ctx: &RequestContext) -> anyhow::Result> { let path = self.path(); - let loaded = DeltaLayerInner::load(&path, None, None, ctx) - .await - .and_then(|res| res)?; + let loaded = DeltaLayerInner::load(&path, None, None, ctx).await?; // not production code let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); @@ -760,27 +758,24 @@ impl DeltaLayerInner { &self.layer_lsn_range } - /// Returns nested result following Result, Critical>: - /// - inner has the success or transient failure - /// - outer has the permanent failure pub(super) async fn load( path: &Utf8Path, summary: Option, max_vectored_read_bytes: Option, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - let file = match VirtualFile::open(path, ctx).await { - Ok(file) => file, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), - }; + ) -> anyhow::Result { + let file = VirtualFile::open(path, ctx) + .await + .context("open layer file")?; + let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); - let summary_blk = match block_reader.read_blk(0, ctx).await { - Ok(blk) => blk, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), - }; + let summary_blk = block_reader + .read_blk(0, ctx) + .await + .context("read first block")?; // TODO: this should be an assertion instead; see ImageLayerInner::load let actual_summary = @@ -802,7 +797,7 @@ impl DeltaLayerInner { } } - Ok(Ok(DeltaLayerInner { + Ok(DeltaLayerInner { file, file_id, index_start_blk: actual_summary.index_start_blk, @@ -810,7 +805,7 @@ impl DeltaLayerInner { max_vectored_read_bytes, layer_key_range: actual_summary.key_range, layer_lsn_range: actual_summary.lsn_range, - })) + }) } pub(super) async fn get_value_reconstruct_data( diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index e5e7f71928..44ba685490 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -265,9 +265,8 @@ impl ImageLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result { let path = self.path(); - let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx) - .await - .and_then(|res| res)?; + let loaded = + ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx).await?; // not production code let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); @@ -385,17 +384,16 @@ impl ImageLayerInner { summary: Option, max_vectored_read_bytes: Option, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - let file = match VirtualFile::open(path, ctx).await { - Ok(file) => file, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), - }; + ) -> anyhow::Result { + let file = VirtualFile::open(path, ctx) + .await + .context("open layer file")?; let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); - let summary_blk = match block_reader.read_blk(0, ctx).await { - Ok(blk) => blk, - Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), - }; + let summary_blk = block_reader + .read_blk(0, ctx) + .await + .context("read first block")?; // length is the only way how this could fail, so it's not actually likely at all unless // read_blk returns wrong sized block. @@ -420,7 +418,7 @@ impl ImageLayerInner { } } - Ok(Ok(ImageLayerInner { + Ok(ImageLayerInner { index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, lsn, @@ -428,7 +426,7 @@ impl ImageLayerInner { file_id, max_vectored_read_bytes, key_range: actual_summary.key_range, - })) + }) } pub(super) async fn get_value_reconstruct_data( diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 619c4d044d..1075feb1d1 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1651,8 +1651,9 @@ impl Drop for DownloadedLayer { } impl DownloadedLayer { - /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`], or fails to - /// initialize it permanently. + /// Initializes the `DeltaLayerInner` or `ImageLayerInner` within [`LayerKind`]. + /// Failure to load the layer is sticky, i.e., future `get()` calls will return + /// the initial load failure immediately. /// /// `owner` parameter is a strong reference at the same `LayerInner` as the /// `DownloadedLayer::owner` would be when upgraded. Given how this method ends up called, @@ -1683,7 +1684,7 @@ impl DownloadedLayer { ctx, ) .await - .map(|res| res.map(LayerKind::Delta)) + .map(LayerKind::Delta) } else { let lsn = owner.desc.image_layer_lsn(); let summary = Some(image_layer::Summary::expected( @@ -1700,32 +1701,29 @@ impl DownloadedLayer { ctx, ) .await - .map(|res| res.map(LayerKind::Image)) + .map(LayerKind::Image) }; match res { - Ok(Ok(layer)) => Ok(Ok(layer)), - Ok(Err(transient)) => Err(transient), - Err(permanent) => { + Ok(layer) => Ok(layer), + Err(err) => { LAYER_IMPL_METRICS.inc_permanent_loading_failures(); - // TODO(#5815): we are not logging all errors, so temporarily log them **once** - // here as well - let permanent = permanent.context("load layer"); - tracing::error!("layer loading failed permanently: {permanent:#}"); - Ok(Err(permanent)) + // We log this message once over the lifetime of `Self` + // => Ok and good to log backtrace and path here. + tracing::error!( + "layer load failed, assuming permanent failure: {}: {err:?}", + owner.path + ); + Err(err) } } }; self.kind - .get_or_try_init(init) - // return transient errors using `?` - .await? + .get_or_init(init) + .await .as_ref() - .map_err(|e| { - // errors are not clonabled, cannot but stringify - // test_broken_timeline matches this string - anyhow::anyhow!("layer loading failed: {e:#}") - }) + // We already logged the full backtrace above, once. Don't repeat that here. + .map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}")) } async fn get_value_reconstruct_data( @@ -1760,7 +1758,11 @@ impl DownloadedLayer { ) -> Result<(), GetVectoredError> { use LayerKind::*; - match self.get(owner, ctx).await.map_err(GetVectoredError::from)? { + match self + .get(owner, ctx) + .await + .map_err(GetVectoredError::Other)? + { Delta(d) => { d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx) .await diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 976ac09335..5ec9a22ba1 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -27,7 +27,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): ".*will not become active. Current state: Broken.*", ".*failed to load metadata.*", ".*load failed.*load local timeline.*", - ".*layer loading failed permanently: load layer: .*", + ".*: layer load failed, assuming permanent failure:.*", ] )