From 8d27a9c54e16967103c3aeeb261933718ec5f9b1 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 20 Jul 2023 17:45:10 +0300 Subject: [PATCH] Less verbose eviction failures (#4737) As seen in staging logs with some massive compactions (create_image_layer), in addition to racing with compaction or gc or even between two invocations to `evict_layer_batch`. Cc: #4745 Fixes: #3851 (organic tech debt reduction) Solution is not to log the Not Found in such cases; it is perfectly natural to happen. Route to this is quite long, but implemented two cases of "race between two eviction processes" which are like our disk usage based eviction and eviction_task, both have the separate "lets figure out what to evict" and "lets evict" phases. --- libs/utils/src/error.rs | 111 +++++++ libs/utils/src/lib.rs | 3 + pageserver/src/disk_usage_eviction_task.rs | 25 +- pageserver/src/tenant.rs | 20 +- pageserver/src/tenant/timeline.rs | 304 ++++++++++++++---- .../src/tenant/timeline/eviction_task.rs | 23 +- 6 files changed, 406 insertions(+), 80 deletions(-) create mode 100644 libs/utils/src/error.rs diff --git a/libs/utils/src/error.rs b/libs/utils/src/error.rs new file mode 100644 index 0000000000..7ce203e918 --- /dev/null +++ b/libs/utils/src/error.rs @@ -0,0 +1,111 @@ +/// Create a reporter for an error that outputs similar to [`anyhow::Error`] with Display with alternative setting. +/// +/// It can be used with `anyhow::Error` as well. +/// +/// Why would one use this instead of converting to `anyhow::Error` on the spot? Because +/// anyhow::Error would also capture a stacktrace on the spot, which you would later discard after +/// formatting. +/// +/// ## Usage +/// +/// ```rust +/// #[derive(Debug, thiserror::Error)] +/// enum MyCoolError { +/// #[error("should never happen")] +/// Bad(#[source] std::io::Error), +/// } +/// +/// # fn failing_call() -> Result<(), MyCoolError> { Err(MyCoolError::Bad(std::io::ErrorKind::PermissionDenied.into())) } +/// +/// # fn main() { +/// use utils::error::report_compact_sources; +/// +/// if let Err(e) = failing_call() { +/// let e = report_compact_sources(&e); +/// assert_eq!(format!("{e}"), "should never happen: permission denied"); +/// } +/// # } +/// ``` +/// +/// ## TODO +/// +/// When we are able to describe return position impl trait in traits, this should of course be an +/// extension trait. Until then avoid boxing with this more ackward interface. +pub fn report_compact_sources(e: &E) -> impl std::fmt::Display + '_ { + struct AnyhowDisplayAlternateAlike<'a, E>(&'a E); + + impl std::fmt::Display for AnyhowDisplayAlternateAlike<'_, E> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0)?; + + // why is E a generic parameter here? hope that rustc will see through a default + // Error::source implementation and leave the following out if there cannot be any + // sources: + Sources(self.0.source()).try_for_each(|src| write!(f, ": {}", src)) + } + } + + struct Sources<'a>(Option<&'a (dyn std::error::Error + 'static)>); + + impl<'a> Iterator for Sources<'a> { + type Item = &'a (dyn std::error::Error + 'static); + + fn next(&mut self) -> Option { + let rem = self.0; + + let next = self.0.and_then(|x| x.source()); + self.0 = next; + rem + } + } + + AnyhowDisplayAlternateAlike(e) +} + +#[cfg(test)] +mod tests { + use super::report_compact_sources; + + #[test] + fn report_compact_sources_examples() { + use std::fmt::Write; + + #[derive(Debug, thiserror::Error)] + enum EvictionError { + #[error("cannot evict a remote layer")] + CannotEvictRemoteLayer, + #[error("stat failed")] + StatFailed(#[source] std::io::Error), + #[error("layer was no longer part of LayerMap")] + LayerNotFound(#[source] anyhow::Error), + } + + let examples = [ + ( + line!(), + EvictionError::CannotEvictRemoteLayer, + "cannot evict a remote layer", + ), + ( + line!(), + EvictionError::StatFailed(std::io::ErrorKind::PermissionDenied.into()), + "stat failed: permission denied", + ), + ( + line!(), + EvictionError::LayerNotFound(anyhow::anyhow!("foobar")), + "layer was no longer part of LayerMap: foobar", + ), + ]; + + let mut s = String::new(); + + for (line, example, expected) in examples { + s.clear(); + + write!(s, "{}", report_compact_sources(&example)).expect("string grows"); + + assert_eq!(s, expected, "example on line {line}"); + } + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 3bcb092ba7..b591cc611a 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -63,6 +63,9 @@ pub mod rate_limit; /// Simple once-barrier and a guard which keeps barrier awaiting. pub mod completion; +/// Reporting utilities +pub mod error; + mod failpoint_macro_helpers { /// use with fail::cfg("$name", "return(2000)") diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 37c52c5423..042d4c6d06 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -60,7 +60,7 @@ use utils::serde_percent::Percent; use crate::{ config::PageServerConf, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, - tenant::{self, storage_layer::PersistentLayer, Timeline}, + tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline}, }; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -390,13 +390,22 @@ pub async fn disk_usage_eviction_task_iteration_impl( assert_eq!(results.len(), batch.len()); for (result, layer) in results.into_iter().zip(batch.iter()) { match result { - Some(Ok(true)) => { + Some(Ok(())) => { usage_assumed.add_available_bytes(layer.file_size()); } - Some(Ok(false)) => { - // this is: - // - Replacement::{NotFound, Unexpected} - // - it cannot be is_remote_layer, filtered already + Some(Err(EvictionError::CannotEvictRemoteLayer)) => { + unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers") + } + Some(Err(EvictionError::FileNotFound)) => { + evictions_failed.file_sizes += layer.file_size(); + evictions_failed.count += 1; + } + Some(Err( + e @ EvictionError::LayerNotFound(_) + | e @ EvictionError::StatFailed(_), + )) => { + let e = utils::error::report_compact_sources(&e); + warn!(%layer, "failed to evict layer: {e}"); evictions_failed.file_sizes += layer.file_size(); evictions_failed.count += 1; } @@ -404,10 +413,6 @@ pub async fn disk_usage_eviction_task_iteration_impl( assert!(cancel.is_cancelled()); return; } - Some(Err(e)) => { - // we really shouldn't be getting this, precondition failure - error!("failed to evict layer: {:#}", e); - } } } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 379db0720f..44f1a6cd65 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -121,7 +121,7 @@ pub mod mgr; pub mod tasks; pub mod upload_queue; -mod timeline; +pub(crate) mod timeline; pub mod size; @@ -1168,7 +1168,7 @@ impl Tenant { ) } - /// Helper for unit tests to create an emtpy timeline. + /// Helper for unit tests to create an empty timeline. /// /// The timeline is has state value `Active` but its background loops are not running. // This makes the various functions which anyhow::ensure! for Active state work in tests. @@ -3359,14 +3359,18 @@ pub mod harness { pub async fn load(&self) -> (Arc, RequestContext) { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); ( - self.try_load(&ctx) + self.try_load(&ctx, None) .await .expect("failed to load test tenant"), ctx, ) } - pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result> { + pub async fn try_load( + &self, + ctx: &RequestContext, + remote_storage: Option, + ) -> anyhow::Result> { let walredo_mgr = Arc::new(TestRedoManager); let tenant = Arc::new(Tenant::new( @@ -3375,7 +3379,7 @@ pub mod harness { TenantConfOpt::from(self.tenant_conf), walredo_mgr, self.tenant_id, - None, + remote_storage, )); tenant .load(None, ctx) @@ -3913,7 +3917,11 @@ mod tests { metadata_bytes[8] ^= 1; std::fs::write(metadata_path, metadata_bytes)?; - let err = harness.try_load(&ctx).await.err().expect("should fail"); + let err = harness + .try_load(&ctx, None) + .await + .err() + .expect("should fail"); // get all the stack with all .context, not tonly the last one let message = format!("{err:#}"); let expected = "Failed to parse metadata bytes from path"; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 58144d9050..887deeedf0 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1011,11 +1011,11 @@ impl Timeline { .evict_layer_batch(remote_client, &[local_layer], cancel) .await?; assert_eq!(results.len(), 1); - let result: Option> = results.into_iter().next().unwrap(); + let result: Option> = results.into_iter().next().unwrap(); match result { None => anyhow::bail!("task_mgr shutdown requested"), - Some(Ok(b)) => Ok(Some(b)), - Some(Err(e)) => Err(e), + Some(Ok(())) => Ok(Some(true)), + Some(Err(e)) => Err(anyhow::Error::new(e)), } } @@ -1024,12 +1024,12 @@ impl Timeline { /// GenericRemoteStorage reference is required as a (witness)[witness_article] for "remote storage is configured." /// /// [witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html - pub async fn evict_layers( + pub(crate) async fn evict_layers( &self, _: &GenericRemoteStorage, layers_to_evict: &[Arc], cancel: CancellationToken, - ) -> anyhow::Result>>> { + ) -> anyhow::Result>>> { let remote_client = self.remote_client.clone().expect( "GenericRemoteStorage is configured, so timeline must have RemoteTimelineClient", ); @@ -1064,7 +1064,7 @@ impl Timeline { remote_client: &Arc, layers_to_evict: &[Arc], cancel: CancellationToken, - ) -> anyhow::Result>>> { + ) -> anyhow::Result>>> { // ensure that the layers have finished uploading // (don't hold the layer_removal_cs while we do it, we're not removing anything yet) remote_client @@ -1110,11 +1110,9 @@ impl Timeline { _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, local_layer: &Arc, layer_mgr: &mut LayerManager, - ) -> anyhow::Result { + ) -> Result<(), EvictionError> { if local_layer.is_remote_layer() { - // TODO(issue #3851): consider returning an err here instead of false, - // which is the same out the match later - return Ok(false); + return Err(EvictionError::CannotEvictRemoteLayer); } let layer_file_size = local_layer.file_size(); @@ -1123,13 +1121,22 @@ impl Timeline { .local_path() .expect("local layer should have a local path") .metadata() - .context("get local layer file stat")? + // when the eviction fails because we have already deleted the layer in compaction for + // example, a NotFound error bubbles up from here. + .map_err(|e| { + if e.kind() == std::io::ErrorKind::NotFound { + EvictionError::FileNotFound + } else { + EvictionError::StatFailed(e) + } + })? .modified() - .context("get mtime of layer file")?; + .map_err(EvictionError::StatFailed)?; + let local_layer_residence_duration = match SystemTime::now().duration_since(local_layer_mtime) { Err(e) => { - warn!("layer mtime is in the future: {}", e); + warn!(layer = %local_layer, "layer mtime is in the future: {}", e); None } Ok(delta) => Some(delta), @@ -1160,54 +1167,65 @@ impl Timeline { assert_eq!(local_layer.layer_desc(), new_remote_layer.layer_desc()); - let succeed = match layer_mgr.replace_and_verify(local_layer.clone(), new_remote_layer) { - Ok(()) => { - if let Err(e) = local_layer.delete_resident_layer_file() { - error!("failed to remove layer file on evict after replacement: {e:#?}"); - } - // Always decrement the physical size gauge, even if we failed to delete the file. - // Rationale: we already replaced the layer with a remote layer in the layer map, - // and any subsequent download_remote_layer will - // 1. overwrite the file on disk and - // 2. add the downloaded size to the resident size gauge. - // - // If there is no re-download, and we restart the pageserver, then load_layer_map - // will treat the file as a local layer again, count it towards resident size, - // and it'll be like the layer removal never happened. - // The bump in resident size is perhaps unexpected but overall a robust behavior. - self.metrics - .resident_physical_size_gauge - .sub(layer_file_size); + layer_mgr + .replace_and_verify(local_layer.clone(), new_remote_layer) + .map_err(EvictionError::LayerNotFound)?; - self.metrics.evictions.inc(); + if let Err(e) = local_layer.delete_resident_layer_file() { + // this should never happen, because of layer_removal_cs usage and above stat + // access for mtime + error!("failed to remove layer file on evict after replacement: {e:#?}"); + } + // Always decrement the physical size gauge, even if we failed to delete the file. + // Rationale: we already replaced the layer with a remote layer in the layer map, + // and any subsequent download_remote_layer will + // 1. overwrite the file on disk and + // 2. add the downloaded size to the resident size gauge. + // + // If there is no re-download, and we restart the pageserver, then load_layer_map + // will treat the file as a local layer again, count it towards resident size, + // and it'll be like the layer removal never happened. + // The bump in resident size is perhaps unexpected but overall a robust behavior. + self.metrics + .resident_physical_size_gauge + .sub(layer_file_size); - if let Some(delta) = local_layer_residence_duration { - self.metrics - .evictions_with_low_residence_duration - .read() - .unwrap() - .observe(delta); - info!(layer=%local_layer, residence_millis=delta.as_millis(), "evicted layer after known residence period"); - } else { - info!(layer=%local_layer, "evicted layer after unknown residence period"); - } + self.metrics.evictions.inc(); - true - } - Err(err) => { - if cfg!(debug_assertions) { - panic!("failed to replace: {err}, evicted: {local_layer:?}"); - } else { - error!(evicted=?local_layer, "failed to replace: {err}"); - } - false - } - }; + if let Some(delta) = local_layer_residence_duration { + self.metrics + .evictions_with_low_residence_duration + .read() + .unwrap() + .observe(delta); + info!(layer=%local_layer, residence_millis=delta.as_millis(), "evicted layer after known residence period"); + } else { + info!(layer=%local_layer, "evicted layer after unknown residence period"); + } - Ok(succeed) + Ok(()) } } +#[derive(Debug, thiserror::Error)] +pub(crate) enum EvictionError { + #[error("cannot evict a remote layer")] + CannotEvictRemoteLayer, + /// Most likely the to-be evicted layer has been deleted by compaction or gc which use the same + /// locks, so they got to execute before the eviction. + #[error("file backing the layer has been removed already")] + FileNotFound, + #[error("stat failed")] + StatFailed(#[source] std::io::Error), + /// In practice, this can be a number of things, but lets assume it means only this. + /// + /// This case includes situations such as the Layer was evicted and redownloaded in between, + /// because the file existed before an replacement attempt was made but now the Layers are + /// different objects in memory. + #[error("layer was no longer part of LayerMap")] + LayerNotFound(#[source] anyhow::Error), +} + /// Number of times we will compute partition within a checkpoint distance. const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; @@ -4572,6 +4590,7 @@ impl LocalLayerInfoForDiskUsageEviction { } impl Timeline { + /// Returns non-remote layers for eviction. pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { let guard = self.layers.read().await; let layers = guard.layer_map(); @@ -4741,3 +4760,180 @@ pub fn compare_arced_layers(left: &Arc, right: &Arc) -> bool { left == right } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use utils::{id::TimelineId, lsn::Lsn}; + + use crate::tenant::{harness::TenantHarness, storage_layer::PersistentLayer}; + + use super::{EvictionError, Timeline}; + + #[tokio::test] + async fn two_layer_eviction_attempts_at_the_same_time() { + let harness = + TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap(); + + let remote_storage = { + // this is never used for anything, because of how the create_test_timeline works, but + // it is with us in spirit and a Some. + use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind}; + let path = harness.conf.workdir.join("localfs"); + std::fs::create_dir_all(&path).unwrap(); + let config = RemoteStorageConfig { + max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(), + max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(), + storage: RemoteStorageKind::LocalFs(path), + }; + GenericRemoteStorage::from_config(&config).unwrap() + }; + + let ctx = any_context(); + let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap(); + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let rc = timeline + .remote_client + .clone() + .expect("just configured this"); + + let layer = find_some_layer(&timeline).await; + + let cancel = tokio_util::sync::CancellationToken::new(); + let batch = [layer]; + + let first = { + let cancel = cancel.clone(); + async { + timeline + .evict_layer_batch(&rc, &batch, cancel) + .await + .unwrap() + } + }; + let second = async { + timeline + .evict_layer_batch(&rc, &batch, cancel) + .await + .unwrap() + }; + + let (first, second) = tokio::join!(first, second); + + let (first, second) = (only_one(first), only_one(second)); + + match (first, second) { + (Ok(()), Err(EvictionError::FileNotFound)) + | (Err(EvictionError::FileNotFound), Ok(())) => { + // one of the evictions gets to do it, + // other one gets FileNotFound. all is good. + } + other => unreachable!("unexpected {:?}", other), + } + } + + #[tokio::test] + async fn layer_eviction_aba_fails() { + let harness = + TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap(); + + let remote_storage = { + // this is never used for anything, because of how the create_test_timeline works, but + // it is with us in spirit and a Some. + use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind}; + let path = harness.conf.workdir.join("localfs"); + std::fs::create_dir_all(&path).unwrap(); + let config = RemoteStorageConfig { + max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(), + max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(), + storage: RemoteStorageKind::LocalFs(path), + }; + GenericRemoteStorage::from_config(&config).unwrap() + }; + + let ctx = any_context(); + let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap(); + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let _e = tracing::info_span!("foobar", tenant_id = %tenant.tenant_id, timeline_id = %timeline.timeline_id).entered(); + + let rc = timeline.remote_client.clone().unwrap(); + + // TenantHarness allows uploads to happen given GenericRemoteStorage is configured + let layer = find_some_layer(&timeline).await; + + let cancel = tokio_util::sync::CancellationToken::new(); + let batch = [layer]; + + let first = { + let cancel = cancel.clone(); + async { + timeline + .evict_layer_batch(&rc, &batch, cancel) + .await + .unwrap() + } + }; + + // lets imagine this is stuck somehow, still referencing the original `Arc` + let second = { + let cancel = cancel.clone(); + async { + timeline + .evict_layer_batch(&rc, &batch, cancel) + .await + .unwrap() + } + }; + + // while it's stuck, we evict and end up redownloading it + only_one(first.await).expect("eviction succeeded"); + + let layer = find_some_layer(&timeline).await; + let layer = layer.downcast_remote_layer().unwrap(); + timeline.download_remote_layer(layer).await.unwrap(); + + let res = only_one(second.await); + + assert!( + matches!(res, Err(EvictionError::LayerNotFound(_))), + "{res:?}" + ); + + // no more specific asserting, outside of preconds this is the only valid replacement + // failure + } + + fn any_context() -> crate::context::RequestContext { + use crate::context::*; + use crate::task_mgr::*; + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) + } + + fn only_one(mut input: Vec>) -> T { + assert_eq!(1, input.len()); + input + .pop() + .expect("length just checked") + .expect("no cancellation") + } + + async fn find_some_layer(timeline: &Timeline) -> Arc { + let layers = timeline.layers.read().await; + let desc = layers + .layer_map() + .iter_historic_layers() + .next() + .expect("must find one layer to evict"); + + layers.get_from_desc(&desc) + } +} diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index a5f570d942..5485cc42b4 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -30,6 +30,7 @@ use crate::{ tenant::{ config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}, storage_layer::PersistentLayer, + timeline::EvictionError, LogicalSizeCalculationCause, Tenant, }, }; @@ -270,20 +271,22 @@ impl Timeline { None => { stats.skipped_for_shutdown += 1; } - Some(Ok(true)) => { - debug!("evicted layer {l:?}"); + Some(Ok(())) => { stats.evicted += 1; } - Some(Ok(false)) => { - debug!("layer is not evictable: {l:?}"); + Some(Err(EvictionError::CannotEvictRemoteLayer)) => { stats.not_evictable += 1; } - Some(Err(e)) => { - // This variant is the case where an unexpected error happened during eviction. - // Expected errors that result in non-eviction are `Some(Ok(false))`. - // So, dump Debug here to gather as much info as possible in this rare case. - warn!("failed to evict layer {l:?}: {e:?}"); - stats.errors += 1; + Some(Err(EvictionError::FileNotFound)) => { + // compaction/gc removed the file while we were waiting on layer_removal_cs + stats.not_evictable += 1; + } + Some(Err( + e @ EvictionError::LayerNotFound(_) | e @ EvictionError::StatFailed(_), + )) => { + let e = utils::error::report_compact_sources(&e); + warn!(layer = %l, "failed to evict layer: {e}"); + stats.not_evictable += 1; } } }