From 3df67bf4d7d23a074cd0e45104e86ebc36315242 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen
Date: Thu, 18 Apr 2024 18:27:58 +0300
Subject: [PATCH] fix(Layer): metric regression with too many canceled evictions (#7363)

#7030 introduced an annoying papercut: a failure to acquire a strong
reference to `LayerInner` from `DownloadedLayer::drop` was deemed a
canceled eviction. Most of the time it was not a canceled eviction at
all, but simply a timeline deletion or tenant detach with a layer that
was neither wanted deleted nor wanted evicted.

When a Layer is dropped as part of a normal shutdown, the `Layer` is
dropped first and the `DownloadedLayer` second. Because of this, a
canceled eviction cannot be detected from `DownloadedLayer::drop`. It
can, however, be detected from `LayerInner::drop`, which this PR adds.

A test case is added: before this fix it recorded 1 started eviction
and 2 canceled; now it accurately finds 1 started and 1 canceled.
---
 libs/utils/src/sync/heavier_once_cell.rs     | 51 +++++++++-
 pageserver/src/tenant/storage_layer/layer.rs | 16 ++-
 .../src/tenant/storage_layer/layer/tests.rs  | 97 +++++++++++++++++++
 3 files changed, 155 insertions(+), 9 deletions(-)

diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs
index 8eee1f72a6..1abd3d9861 100644
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -192,6 +192,14 @@ impl<T> OnceCell<T> {
         }
     }
 
+    /// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell was never
+    /// initialized.
+    pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
+        let inner = self.inner.get_mut().unwrap();
+
+        inner.take_and_deinit()
+    }
+
     /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
     pub fn initializer_count(&self) -> usize {
         self.initializers.load(Ordering::Relaxed)
@@ -246,15 +254,23 @@ impl<'a, T> Guard<'a, T> {
     /// The permit will be on a semaphore part of the new internal value, and any following
     /// [`OnceCell::get_or_init`] will wait on it to complete.
     pub fn take_and_deinit(mut self) -> (T, InitPermit) {
+        self.0
+            .take_and_deinit()
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+impl<T> Inner<T> {
+    pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
+        let value = self.value.take()?;
+
         let mut swapped = Inner::default();
         let sem = swapped.init_semaphore.clone();
         // acquire and forget right away, moving the control over to InitPermit
         sem.try_acquire().expect("we just created this").forget();
-        std::mem::swap(&mut *self.0, &mut swapped);
-        swapped
-            .value
-            .map(|v| (v, InitPermit(sem)))
-            .expect("guard is not created unless value has been initialized")
+
+        let permit = InitPermit(sem);
+        std::mem::swap(self, &mut swapped);
+        Some((value, permit))
     }
 }
 
@@ -263,6 +279,13 @@
 /// On drop, this type will return the permit.
 pub struct InitPermit(Arc<Semaphore>);
 
+impl std::fmt::Debug for InitPermit {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let ptr = Arc::as_ptr(&self.0) as *const ();
+        f.debug_tuple("InitPermit").field(&ptr).finish()
+    }
+}
+
 impl Drop for InitPermit {
     fn drop(&mut self) {
         assert_eq!(
@@ -559,4 +582,22 @@ mod tests {
 
         assert_eq!(*target.get().unwrap(), 11);
     }
+
+    #[tokio::test]
+    async fn take_and_deinit_on_mut() {
+        use std::convert::Infallible;
+
+        let mut target = OnceCell::<u32>::default();
+        assert!(target.take_and_deinit().is_none());
+
+        target
+            .get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) })
+            .await
+            .unwrap();
+
+        let again = target.take_and_deinit();
+        assert!(matches!(again, Some((42, _))), "{again:?}");
+
+        assert!(target.take_and_deinit().is_none());
+    }
 }
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 291da0f645..e55299f0fa 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -610,9 +610,17 @@ enum Status {
 
 impl Drop for LayerInner {
     fn drop(&mut self) {
+        // if there was a pending eviction, mark it cancelled here to balance metrics
+        if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit()
+        {
+            // eviction has already been started
+            LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
+
+            // eviction request is intentionally not honored as no one is present to wait for it
+            // and we could be delaying shutdown for nothing.
+        }
+
         if !*self.wanted_deleted.get_mut() {
-            // should we try to evict if the last wish was for eviction? seems more like a hazard
-            // than a clear win.
             return;
         }
 
@@ -1558,8 +1566,8 @@ impl Drop for DownloadedLayer {
         if let Some(owner) = self.owner.upgrade() {
             owner.on_downloaded_layer_drop(self.version);
         } else {
-            // no need to do anything, we are shutting down
-            LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
+            // Layer::drop will handle cancelling the eviction; because of drop order and
+            // `DownloadedLayer` never leaking, we cannot know here if eviction was requested.
         }
     }
 }
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
index 247ff123b5..f0697fdf28 100644
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -721,6 +721,103 @@ async fn evict_and_wait_does_not_wait_for_download() {
     layer.evict_and_wait(FOREVER).await.unwrap();
 }
 
+/// Asserts that there is no miscalculation when Layer is dropped while it is being kept resident,
+/// which is the last value.
+///
+/// Also checks that the same does not happen on a non-evicted layer (regression test).
+#[tokio::test(start_paused = true)]
+async fn eviction_cancellation_on_drop() {
+    use crate::repository::Value;
+    use bytes::Bytes;
+
+    // this is the runtime on which Layer spawns the blocking tasks
+    let handle = tokio::runtime::Handle::current();
+
+    let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap();
+    utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
+    let (tenant, ctx) = h.load().await;
+
+    let timeline = tenant
+        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .await
+        .unwrap();
+
+    {
+        // create_test_timeline wrote us one layer, write another
+        let mut writer = timeline.writer().await;
+        writer
+            .put(
+                Key::from_i128(5),
+                Lsn(0x20),
+                &Value::Image(Bytes::from_static(b"this does not matter either")),
+                &ctx,
+            )
+            .await
+            .unwrap();
+
+        writer.finish_write(Lsn(0x20));
+    }
+
+    timeline.freeze_and_flush().await.unwrap();
+
+    // wait for the upload to complete so our Arc::strong_count assertion holds
+    timeline
+        .remote_client
+        .as_ref()
+        .unwrap()
+        .wait_completion()
+        .await
+        .unwrap();
+
+    let (evicted_layer, not_evicted) = {
+        let mut layers = {
+            let mut guard = timeline.layers.write().await;
+            let layers = guard.likely_resident_layers().collect::<Vec<_>>();
+            // remove the layers from layermap
+            guard.finish_gc_timeline(&layers);
+
+            layers
+        };
+
+        assert_eq!(layers.len(), 2);
+
+        (layers.pop().unwrap(), layers.pop().unwrap())
+    };
+
+    let victims = [(evicted_layer, true), (not_evicted, false)];
+
+    for (victim, evict) in victims {
+        let resident = victim.keep_resident().await.unwrap();
+        drop(victim);
+
+        assert_eq!(Arc::strong_count(&resident.owner.0), 1);
+
+        if evict {
+            let evict_and_wait = resident.owner.evict_and_wait(FOREVER);
+
+            // drive the future to await on the status channel, and then drop it
+            tokio::time::timeout(ADVANCE, evict_and_wait)
+                .await
+                .expect_err("should have been a timeout since we are holding the layer resident");
+        }
+
+        // 1 == we only evict one of the layers
+        assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
+
+        drop(resident);
+
+        // run any spawned tasks
+        tokio::time::sleep(ADVANCE).await;
+
+        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
+
+        assert_eq!(
+            1,
+            LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::LayerGone].get()
+        );
+    }
+}
+
 #[test]
 fn layer_size() {
     assert_eq!(std::mem::size_of::<LayerInner>(), 2040);
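
To illustrate the drop-order argument in the commit message, below is a minimal, self-contained Rust sketch. It is not part of the patch; the names `Inner`, `Downloaded`, `owner` and `eviction_requested` are illustrative stand-ins for `LayerInner`, `DownloadedLayer` and their internal state, not the actual pageserver types. The point it demonstrates: once the strong reference is gone, a `Drop` impl holding only a `Weak` cannot distinguish a canceled eviction from an ordinary shutdown, whereas the owner's own `Drop` still has the state needed to balance the metric.

use std::sync::{Arc, Weak};

// Illustrative stand-in for LayerInner: it knows whether an eviction was requested.
struct Inner {
    eviction_requested: bool,
}

impl Drop for Inner {
    fn drop(&mut self) {
        // The owner still has its own state here, so a pending eviction can be
        // counted as canceled exactly once (analogous to what LayerInner::drop now does).
        if self.eviction_requested {
            println!("eviction canceled: layer gone");
        }
    }
}

// Illustrative stand-in for DownloadedLayer: it only holds a weak reference back.
struct Downloaded {
    owner: Weak<Inner>,
}

impl Drop for Downloaded {
    fn drop(&mut self) {
        // If the strong reference is already gone, this alone says nothing about
        // whether an eviction was ever requested; it may simply be shutdown.
        if self.owner.upgrade().is_none() {
            println!("owner already dropped; cannot tell eviction from shutdown");
        }
    }
}

fn main() {
    let inner = Arc::new(Inner {
        eviction_requested: true,
    });
    let downloaded = Downloaded {
        owner: Arc::downgrade(&inner),
    };

    // Normal shutdown order mirrors the commit message: the strong handle goes
    // first (Layer), the resident value second (DownloadedLayer).
    drop(inner); // Inner::drop runs and balances the metric
    drop(downloaded); // upgrade() fails, but that is not evidence of a canceled eviction
}

Running the sketch prints both messages, but only the first corresponds to something worth counting; the second observation carries no information about eviction, which is why the patch moves the accounting from `DownloadedLayer::drop` into `LayerInner::drop`.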