mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-19 22:20:37 +00:00
## Problem

The current implementation of struct Layer supports canceled read requests, but those leave the internal state such that a following `Layer::keep_resident` call needs to repair the state. In pathological cases seen during generation numbers resetting in staging, or with too many in-progress on-demand downloads, this repair activity needs to wait for the download to complete, which stalls disk usage-based eviction. Similar stalls have been observed in staging near disk-full situations, where downloads failed because the disk was full.

Fixes #6028 or the "layer is present on filesystem but not evictable" problems by:

1. not letting a canceled `LayerInner::get_or_maybe_download` cancel pending evictions
2. completing post-download initialization of the `LayerInner::inner` from the download task

Not canceling evictions (case 1 above) and always initializing (case 2) lead to plain `LayerInner::inner` always having the up-to-date information, which means the old `Layer::keep_resident` never has to wait for downloads to complete. Finally, `Layer::keep_resident` is replaced with `Layer::is_likely_resident`. These fix #7145.

## Summary of changes

- add a new test showing that a canceled get_or_maybe_download should not cancel the eviction
- switch to using a `watch` internally rather than a `broadcast` to avoid hanging eviction while a download is ongoing
- doc changes for new semantics and cleanup
- replace `Layer::keep_resident` with `Layer::is_likely_resident`, which uses just `self.0.inner.get()` as the source of truth
- remove `LayerInner::wanted_evicted` boolean as no longer needed

Builds upon: #7185. Cc: #5331.
120 lines
3.5 KiB
Rust
120 lines
3.5 KiB
Rust
//! Failpoints for unit tests, implying `#[cfg(test)]`.
//!
//! These are not accessible over HTTP.
|
|
|
|
use super::*;
|
|
|
|
impl Layer {
|
|
/// Enable a failpoint from a unit test.
|
|
pub(super) fn enable_failpoint(&self, failpoint: Failpoint) {
|
|
self.0.failpoints.lock().unwrap().push(failpoint);
|
|
}
|
|
}
|
|
|
|
impl LayerInner {
|
|
/// Query if this failpoint is enabled, as in, arrive at a failpoint.
|
|
///
|
|
/// Calls to this method need to be `#[cfg(test)]` guarded.
|
|
pub(super) async fn failpoint(&self, kind: FailpointKind) -> Result<(), FailpointHit> {
|
|
let fut = {
|
|
let mut fps = self.failpoints.lock().unwrap();
|
|
// find the *last* failpoint for cases in which we need to use multiple for the same
|
|
// thing (two blocked evictions)
|
|
let fp = fps.iter_mut().rfind(|x| x.kind() == kind);
|
|
|
|
let Some(fp) = fp else {
|
|
return Ok(());
|
|
};
|
|
|
|
fp.hit()
|
|
};
|
|
|
|
fut.await
|
|
}
|
|
}
|
|
|
|
/// Payload-free identifier of a failpoint site.
///
/// This is what call sites pass when they arrive at a failpoint, and what a registered
/// `Failpoint` is matched against. Being a fieldless enum it is `Copy`, so it can be passed
/// and compared by value freely.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum FailpointKind {
    /// Failpoint acts as an accurate cancelled by drop here; see the only site of use.
    AfterDeterminingLayerNeedsNoDownload,
    /// Failpoint for stalling eviction starting
    WaitBeforeStartingEvicting,
    /// Failpoint hit in the spawned task
    WaitBeforeDownloading,
}
|
|
|
|
pub(crate) enum Failpoint {
|
|
AfterDeterminingLayerNeedsNoDownload,
|
|
WaitBeforeStartingEvicting(
|
|
Option<utils::completion::Completion>,
|
|
utils::completion::Barrier,
|
|
),
|
|
WaitBeforeDownloading(
|
|
Option<utils::completion::Completion>,
|
|
utils::completion::Barrier,
|
|
),
|
|
}
|
|
|
|
impl Failpoint {
|
|
fn kind(&self) -> FailpointKind {
|
|
match self {
|
|
Failpoint::AfterDeterminingLayerNeedsNoDownload => {
|
|
FailpointKind::AfterDeterminingLayerNeedsNoDownload
|
|
}
|
|
Failpoint::WaitBeforeStartingEvicting(..) => FailpointKind::WaitBeforeStartingEvicting,
|
|
Failpoint::WaitBeforeDownloading(..) => FailpointKind::WaitBeforeDownloading,
|
|
}
|
|
}
|
|
|
|
fn hit(&mut self) -> impl std::future::Future<Output = Result<(), FailpointHit>> + 'static {
|
|
use futures::future::FutureExt;
|
|
|
|
// use boxed futures to avoid Either hurdles
|
|
match self {
|
|
Failpoint::AfterDeterminingLayerNeedsNoDownload => {
|
|
let kind = self.kind();
|
|
|
|
async move { Err(FailpointHit(kind)) }.boxed()
|
|
}
|
|
Failpoint::WaitBeforeStartingEvicting(arrival, b)
|
|
| Failpoint::WaitBeforeDownloading(arrival, b) => {
|
|
// first one signals arrival
|
|
drop(arrival.take());
|
|
|
|
let b = b.clone();
|
|
|
|
async move {
|
|
tracing::trace!("waiting on a failpoint barrier");
|
|
b.wait().await;
|
|
tracing::trace!("done waiting on a failpoint barrier");
|
|
Ok(())
|
|
}
|
|
.boxed()
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
impl std::fmt::Display for FailpointKind {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
std::fmt::Debug::fmt(self, f)
|
|
}
|
|
}
|
|
|
|
#[derive(Debug)]
|
|
pub(crate) struct FailpointHit(FailpointKind);
|
|
|
|
impl std::fmt::Display for FailpointHit {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
std::fmt::Debug::fmt(self, f)
|
|
}
|
|
}
|
|
|
|
// Marker impl: `FailpointHit` uses the default `std::error::Error` methods, building on its
// `Debug` and `Display` impls.
impl std::error::Error for FailpointHit {}
|
|
|
|
impl From<FailpointHit> for DownloadError {
|
|
fn from(value: FailpointHit) -> Self {
|
|
DownloadError::Failpoint(value.0)
|
|
}
|
|
}
|