Less verbose eviction failures (#4737)

As seen in staging logs with some massive compactions
(create_image_layer), in addition to racing with compaction or gc or
even between two invocations to `evict_layer_batch`.

Cc: #4745
Fixes: #3851 (organic tech debt reduction)

Solution is not to log the Not Found in such cases; it is perfectly
natural to happen. Route to this is quite long, but implemented two
cases of "race between two eviction processes" which are like our disk
usage based eviction and eviction_task, both have the separate "lets
figure out what to evict" and "lets evict" phases.
This commit is contained in:
Joonas Koivunen
2023-07-20 17:45:10 +03:00
committed by GitHub
parent d98cb39978
commit 8d27a9c54e
6 changed files with 406 additions and 80 deletions

111
libs/utils/src/error.rs Normal file
View File

@@ -0,0 +1,111 @@
/// Create a reporter for an error that outputs similar to [`anyhow::Error`] with Display with alternative setting.
///
/// It can be used with `anyhow::Error` as well.
///
/// Why would one use this instead of converting to `anyhow::Error` on the spot? Because
/// anyhow::Error would also capture a stacktrace on the spot, which you would later discard after
/// formatting.
///
/// ## Usage
///
/// ```rust
/// #[derive(Debug, thiserror::Error)]
/// enum MyCoolError {
/// #[error("should never happen")]
/// Bad(#[source] std::io::Error),
/// }
///
/// # fn failing_call() -> Result<(), MyCoolError> { Err(MyCoolError::Bad(std::io::ErrorKind::PermissionDenied.into())) }
///
/// # fn main() {
/// use utils::error::report_compact_sources;
///
/// if let Err(e) = failing_call() {
/// let e = report_compact_sources(&e);
/// assert_eq!(format!("{e}"), "should never happen: permission denied");
/// }
/// # }
/// ```
///
/// ## TODO
///
/// When we are able to describe return position impl trait in traits, this should of course be an
/// extension trait. Until then avoid boxing with this more ackward interface.
pub fn report_compact_sources<E: std::error::Error>(e: &E) -> impl std::fmt::Display + '_ {
struct AnyhowDisplayAlternateAlike<'a, E>(&'a E);
impl<E: std::error::Error> std::fmt::Display for AnyhowDisplayAlternateAlike<'_, E> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)?;
// why is E a generic parameter here? hope that rustc will see through a default
// Error::source implementation and leave the following out if there cannot be any
// sources:
Sources(self.0.source()).try_for_each(|src| write!(f, ": {}", src))
}
}
struct Sources<'a>(Option<&'a (dyn std::error::Error + 'static)>);
impl<'a> Iterator for Sources<'a> {
type Item = &'a (dyn std::error::Error + 'static);
fn next(&mut self) -> Option<Self::Item> {
let rem = self.0;
let next = self.0.and_then(|x| x.source());
self.0 = next;
rem
}
}
AnyhowDisplayAlternateAlike(e)
}
#[cfg(test)]
mod tests {
use super::report_compact_sources;
#[test]
fn report_compact_sources_examples() {
use std::fmt::Write;
#[derive(Debug, thiserror::Error)]
enum EvictionError {
#[error("cannot evict a remote layer")]
CannotEvictRemoteLayer,
#[error("stat failed")]
StatFailed(#[source] std::io::Error),
#[error("layer was no longer part of LayerMap")]
LayerNotFound(#[source] anyhow::Error),
}
let examples = [
(
line!(),
EvictionError::CannotEvictRemoteLayer,
"cannot evict a remote layer",
),
(
line!(),
EvictionError::StatFailed(std::io::ErrorKind::PermissionDenied.into()),
"stat failed: permission denied",
),
(
line!(),
EvictionError::LayerNotFound(anyhow::anyhow!("foobar")),
"layer was no longer part of LayerMap: foobar",
),
];
let mut s = String::new();
for (line, example, expected) in examples {
s.clear();
write!(s, "{}", report_compact_sources(&example)).expect("string grows");
assert_eq!(s, expected, "example on line {line}");
}
}
}

View File

@@ -63,6 +63,9 @@ pub mod rate_limit;
/// Simple once-barrier and a guard which keeps barrier awaiting.
pub mod completion;
/// Reporting utilities
pub mod error;
mod failpoint_macro_helpers {
/// use with fail::cfg("$name", "return(2000)")

View File

@@ -60,7 +60,7 @@ use utils::serde_percent::Percent;
use crate::{
config::PageServerConf,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{self, storage_layer::PersistentLayer, Timeline},
tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
};
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -390,13 +390,22 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
assert_eq!(results.len(), batch.len());
for (result, layer) in results.into_iter().zip(batch.iter()) {
match result {
Some(Ok(true)) => {
Some(Ok(())) => {
usage_assumed.add_available_bytes(layer.file_size());
}
Some(Ok(false)) => {
// this is:
// - Replacement::{NotFound, Unexpected}
// - it cannot be is_remote_layer, filtered already
Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
}
Some(Err(EvictionError::FileNotFound)) => {
evictions_failed.file_sizes += layer.file_size();
evictions_failed.count += 1;
}
Some(Err(
e @ EvictionError::LayerNotFound(_)
| e @ EvictionError::StatFailed(_),
)) => {
let e = utils::error::report_compact_sources(&e);
warn!(%layer, "failed to evict layer: {e}");
evictions_failed.file_sizes += layer.file_size();
evictions_failed.count += 1;
}
@@ -404,10 +413,6 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
assert!(cancel.is_cancelled());
return;
}
Some(Err(e)) => {
// we really shouldn't be getting this, precondition failure
error!("failed to evict layer: {:#}", e);
}
}
}
}

View File

@@ -121,7 +121,7 @@ pub mod mgr;
pub mod tasks;
pub mod upload_queue;
mod timeline;
pub(crate) mod timeline;
pub mod size;
@@ -1168,7 +1168,7 @@ impl Tenant {
)
}
/// Helper for unit tests to create an emtpy timeline.
/// Helper for unit tests to create an empty timeline.
///
/// The timeline is has state value `Active` but its background loops are not running.
// This makes the various functions which anyhow::ensure! for Active state work in tests.
@@ -3359,14 +3359,18 @@ pub mod harness {
pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
(
self.try_load(&ctx)
self.try_load(&ctx, None)
.await
.expect("failed to load test tenant"),
ctx,
)
}
pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
pub async fn try_load(
&self,
ctx: &RequestContext,
remote_storage: Option<remote_storage::GenericRemoteStorage>,
) -> anyhow::Result<Arc<Tenant>> {
let walredo_mgr = Arc::new(TestRedoManager);
let tenant = Arc::new(Tenant::new(
@@ -3375,7 +3379,7 @@ pub mod harness {
TenantConfOpt::from(self.tenant_conf),
walredo_mgr,
self.tenant_id,
None,
remote_storage,
));
tenant
.load(None, ctx)
@@ -3913,7 +3917,11 @@ mod tests {
metadata_bytes[8] ^= 1;
std::fs::write(metadata_path, metadata_bytes)?;
let err = harness.try_load(&ctx).await.err().expect("should fail");
let err = harness
.try_load(&ctx, None)
.await
.err()
.expect("should fail");
// get all the stack with all .context, not tonly the last one
let message = format!("{err:#}");
let expected = "Failed to parse metadata bytes from path";

View File

@@ -1011,11 +1011,11 @@ impl Timeline {
.evict_layer_batch(remote_client, &[local_layer], cancel)
.await?;
assert_eq!(results.len(), 1);
let result: Option<anyhow::Result<bool>> = results.into_iter().next().unwrap();
let result: Option<Result<(), EvictionError>> = results.into_iter().next().unwrap();
match result {
None => anyhow::bail!("task_mgr shutdown requested"),
Some(Ok(b)) => Ok(Some(b)),
Some(Err(e)) => Err(e),
Some(Ok(())) => Ok(Some(true)),
Some(Err(e)) => Err(anyhow::Error::new(e)),
}
}
@@ -1024,12 +1024,12 @@ impl Timeline {
/// GenericRemoteStorage reference is required as a (witness)[witness_article] for "remote storage is configured."
///
/// [witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html
pub async fn evict_layers(
pub(crate) async fn evict_layers(
&self,
_: &GenericRemoteStorage,
layers_to_evict: &[Arc<dyn PersistentLayer>],
cancel: CancellationToken,
) -> anyhow::Result<Vec<Option<anyhow::Result<bool>>>> {
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
let remote_client = self.remote_client.clone().expect(
"GenericRemoteStorage is configured, so timeline must have RemoteTimelineClient",
);
@@ -1064,7 +1064,7 @@ impl Timeline {
remote_client: &Arc<RemoteTimelineClient>,
layers_to_evict: &[Arc<dyn PersistentLayer>],
cancel: CancellationToken,
) -> anyhow::Result<Vec<Option<anyhow::Result<bool>>>> {
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
// ensure that the layers have finished uploading
// (don't hold the layer_removal_cs while we do it, we're not removing anything yet)
remote_client
@@ -1110,11 +1110,9 @@ impl Timeline {
_layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
local_layer: &Arc<dyn PersistentLayer>,
layer_mgr: &mut LayerManager,
) -> anyhow::Result<bool> {
) -> Result<(), EvictionError> {
if local_layer.is_remote_layer() {
// TODO(issue #3851): consider returning an err here instead of false,
// which is the same out the match later
return Ok(false);
return Err(EvictionError::CannotEvictRemoteLayer);
}
let layer_file_size = local_layer.file_size();
@@ -1123,13 +1121,22 @@ impl Timeline {
.local_path()
.expect("local layer should have a local path")
.metadata()
.context("get local layer file stat")?
// when the eviction fails because we have already deleted the layer in compaction for
// example, a NotFound error bubbles up from here.
.map_err(|e| {
if e.kind() == std::io::ErrorKind::NotFound {
EvictionError::FileNotFound
} else {
EvictionError::StatFailed(e)
}
})?
.modified()
.context("get mtime of layer file")?;
.map_err(EvictionError::StatFailed)?;
let local_layer_residence_duration =
match SystemTime::now().duration_since(local_layer_mtime) {
Err(e) => {
warn!("layer mtime is in the future: {}", e);
warn!(layer = %local_layer, "layer mtime is in the future: {}", e);
None
}
Ok(delta) => Some(delta),
@@ -1160,54 +1167,65 @@ impl Timeline {
assert_eq!(local_layer.layer_desc(), new_remote_layer.layer_desc());
let succeed = match layer_mgr.replace_and_verify(local_layer.clone(), new_remote_layer) {
Ok(()) => {
if let Err(e) = local_layer.delete_resident_layer_file() {
error!("failed to remove layer file on evict after replacement: {e:#?}");
}
// Always decrement the physical size gauge, even if we failed to delete the file.
// Rationale: we already replaced the layer with a remote layer in the layer map,
// and any subsequent download_remote_layer will
// 1. overwrite the file on disk and
// 2. add the downloaded size to the resident size gauge.
//
// If there is no re-download, and we restart the pageserver, then load_layer_map
// will treat the file as a local layer again, count it towards resident size,
// and it'll be like the layer removal never happened.
// The bump in resident size is perhaps unexpected but overall a robust behavior.
self.metrics
.resident_physical_size_gauge
.sub(layer_file_size);
layer_mgr
.replace_and_verify(local_layer.clone(), new_remote_layer)
.map_err(EvictionError::LayerNotFound)?;
self.metrics.evictions.inc();
if let Err(e) = local_layer.delete_resident_layer_file() {
// this should never happen, because of layer_removal_cs usage and above stat
// access for mtime
error!("failed to remove layer file on evict after replacement: {e:#?}");
}
// Always decrement the physical size gauge, even if we failed to delete the file.
// Rationale: we already replaced the layer with a remote layer in the layer map,
// and any subsequent download_remote_layer will
// 1. overwrite the file on disk and
// 2. add the downloaded size to the resident size gauge.
//
// If there is no re-download, and we restart the pageserver, then load_layer_map
// will treat the file as a local layer again, count it towards resident size,
// and it'll be like the layer removal never happened.
// The bump in resident size is perhaps unexpected but overall a robust behavior.
self.metrics
.resident_physical_size_gauge
.sub(layer_file_size);
if let Some(delta) = local_layer_residence_duration {
self.metrics
.evictions_with_low_residence_duration
.read()
.unwrap()
.observe(delta);
info!(layer=%local_layer, residence_millis=delta.as_millis(), "evicted layer after known residence period");
} else {
info!(layer=%local_layer, "evicted layer after unknown residence period");
}
self.metrics.evictions.inc();
true
}
Err(err) => {
if cfg!(debug_assertions) {
panic!("failed to replace: {err}, evicted: {local_layer:?}");
} else {
error!(evicted=?local_layer, "failed to replace: {err}");
}
false
}
};
if let Some(delta) = local_layer_residence_duration {
self.metrics
.evictions_with_low_residence_duration
.read()
.unwrap()
.observe(delta);
info!(layer=%local_layer, residence_millis=delta.as_millis(), "evicted layer after known residence period");
} else {
info!(layer=%local_layer, "evicted layer after unknown residence period");
}
Ok(succeed)
Ok(())
}
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum EvictionError {
#[error("cannot evict a remote layer")]
CannotEvictRemoteLayer,
/// Most likely the to-be evicted layer has been deleted by compaction or gc which use the same
/// locks, so they got to execute before the eviction.
#[error("file backing the layer has been removed already")]
FileNotFound,
#[error("stat failed")]
StatFailed(#[source] std::io::Error),
/// In practice, this can be a number of things, but lets assume it means only this.
///
/// This case includes situations such as the Layer was evicted and redownloaded in between,
/// because the file existed before an replacement attempt was made but now the Layers are
/// different objects in memory.
#[error("layer was no longer part of LayerMap")]
LayerNotFound(#[source] anyhow::Error),
}
/// Number of times we will compute partition within a checkpoint distance.
const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
@@ -4572,6 +4590,7 @@ impl LocalLayerInfoForDiskUsageEviction {
}
impl Timeline {
/// Returns non-remote layers for eviction.
pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
let guard = self.layers.read().await;
let layers = guard.layer_map();
@@ -4741,3 +4760,180 @@ pub fn compare_arced_layers<L: ?Sized>(left: &Arc<L>, right: &Arc<L>) -> bool {
left == right
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use utils::{id::TimelineId, lsn::Lsn};
use crate::tenant::{harness::TenantHarness, storage_layer::PersistentLayer};
use super::{EvictionError, Timeline};
#[tokio::test]
async fn two_layer_eviction_attempts_at_the_same_time() {
let harness =
TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
let remote_storage = {
// this is never used for anything, because of how the create_test_timeline works, but
// it is with us in spirit and a Some.
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
let path = harness.conf.workdir.join("localfs");
std::fs::create_dir_all(&path).unwrap();
let config = RemoteStorageConfig {
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
storage: RemoteStorageKind::LocalFs(path),
};
GenericRemoteStorage::from_config(&config).unwrap()
};
let ctx = any_context();
let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap();
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
let rc = timeline
.remote_client
.clone()
.expect("just configured this");
let layer = find_some_layer(&timeline).await;
let cancel = tokio_util::sync::CancellationToken::new();
let batch = [layer];
let first = {
let cancel = cancel.clone();
async {
timeline
.evict_layer_batch(&rc, &batch, cancel)
.await
.unwrap()
}
};
let second = async {
timeline
.evict_layer_batch(&rc, &batch, cancel)
.await
.unwrap()
};
let (first, second) = tokio::join!(first, second);
let (first, second) = (only_one(first), only_one(second));
match (first, second) {
(Ok(()), Err(EvictionError::FileNotFound))
| (Err(EvictionError::FileNotFound), Ok(())) => {
// one of the evictions gets to do it,
// other one gets FileNotFound. all is good.
}
other => unreachable!("unexpected {:?}", other),
}
}
#[tokio::test]
async fn layer_eviction_aba_fails() {
let harness =
TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
let remote_storage = {
// this is never used for anything, because of how the create_test_timeline works, but
// it is with us in spirit and a Some.
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
let path = harness.conf.workdir.join("localfs");
std::fs::create_dir_all(&path).unwrap();
let config = RemoteStorageConfig {
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
storage: RemoteStorageKind::LocalFs(path),
};
GenericRemoteStorage::from_config(&config).unwrap()
};
let ctx = any_context();
let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap();
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
let _e = tracing::info_span!("foobar", tenant_id = %tenant.tenant_id, timeline_id = %timeline.timeline_id).entered();
let rc = timeline.remote_client.clone().unwrap();
// TenantHarness allows uploads to happen given GenericRemoteStorage is configured
let layer = find_some_layer(&timeline).await;
let cancel = tokio_util::sync::CancellationToken::new();
let batch = [layer];
let first = {
let cancel = cancel.clone();
async {
timeline
.evict_layer_batch(&rc, &batch, cancel)
.await
.unwrap()
}
};
// lets imagine this is stuck somehow, still referencing the original `Arc<dyn PersistentLayer>`
let second = {
let cancel = cancel.clone();
async {
timeline
.evict_layer_batch(&rc, &batch, cancel)
.await
.unwrap()
}
};
// while it's stuck, we evict and end up redownloading it
only_one(first.await).expect("eviction succeeded");
let layer = find_some_layer(&timeline).await;
let layer = layer.downcast_remote_layer().unwrap();
timeline.download_remote_layer(layer).await.unwrap();
let res = only_one(second.await);
assert!(
matches!(res, Err(EvictionError::LayerNotFound(_))),
"{res:?}"
);
// no more specific asserting, outside of preconds this is the only valid replacement
// failure
}
fn any_context() -> crate::context::RequestContext {
use crate::context::*;
use crate::task_mgr::*;
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
}
fn only_one<T>(mut input: Vec<Option<T>>) -> T {
assert_eq!(1, input.len());
input
.pop()
.expect("length just checked")
.expect("no cancellation")
}
async fn find_some_layer(timeline: &Timeline) -> Arc<dyn PersistentLayer> {
let layers = timeline.layers.read().await;
let desc = layers
.layer_map()
.iter_historic_layers()
.next()
.expect("must find one layer to evict");
layers.get_from_desc(&desc)
}
}

View File

@@ -30,6 +30,7 @@ use crate::{
tenant::{
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
storage_layer::PersistentLayer,
timeline::EvictionError,
LogicalSizeCalculationCause, Tenant,
},
};
@@ -270,20 +271,22 @@ impl Timeline {
None => {
stats.skipped_for_shutdown += 1;
}
Some(Ok(true)) => {
debug!("evicted layer {l:?}");
Some(Ok(())) => {
stats.evicted += 1;
}
Some(Ok(false)) => {
debug!("layer is not evictable: {l:?}");
Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
stats.not_evictable += 1;
}
Some(Err(e)) => {
// This variant is the case where an unexpected error happened during eviction.
// Expected errors that result in non-eviction are `Some(Ok(false))`.
// So, dump Debug here to gather as much info as possible in this rare case.
warn!("failed to evict layer {l:?}: {e:?}");
stats.errors += 1;
Some(Err(EvictionError::FileNotFound)) => {
// compaction/gc removed the file while we were waiting on layer_removal_cs
stats.not_evictable += 1;
}
Some(Err(
e @ EvictionError::LayerNotFound(_) | e @ EvictionError::StatFailed(_),
)) => {
let e = utils::error::report_compact_sources(&e);
warn!(layer = %l, "failed to evict layer: {e}");
stats.not_evictable += 1;
}
}
}