From 552088ac1635f88a65914f6fc540fd343db66e57 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 31 Oct 2024 14:44:59 +0000 Subject: [PATCH] pageserver: fix spurious error logs in timeline lifecycle (#9589) ## Problem The final part of https://github.com/neondatabase/neon/issues/9543 will be a chaos test that creates/deletes/archives/offloads timelines while restarting pageservers and migrating tenants. Developing that test showed up a few places where we log errors during normal shutdown. ## Summary of changes - UninitializedTimeline's drop should log at info severity: this is a normal code path when some part of timeline creation encounters a cancellation `?` path. - When offloading and finding a `RemoteTimelineClient` in a non-initialized state, this is not an error and should not be logged as such. - The `offload_timeline` function returned an anyhow error, so callers couldn't gracefully pick out cancellation errors from real errors: update this to have a structured error type and use it throughout. --- pageserver/src/http/routes.rs | 9 +++- pageserver/src/tenant.rs | 8 ++- pageserver/src/tenant/tasks.rs | 1 + pageserver/src/tenant/timeline.rs | 13 +++++ pageserver/src/tenant/timeline/offload.rs | 56 ++++++++++++++------- pageserver/src/tenant/timeline/uninit.rs | 4 +- test_runner/regress/test_broken_timeline.py | 2 - test_runner/regress/test_import.py | 1 - test_runner/regress/test_tenant_delete.py | 2 - 9 files changed, 68 insertions(+), 28 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2d8f4309ca..ef8efd3f27 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -80,6 +80,7 @@ use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::offload::offload_timeline; +use crate::tenant::timeline::offload::OffloadError; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; @@ -2004,7 +2005,12 @@ async fn timeline_offload_handler( } offload_timeline(&tenant, &timeline) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| { + match e { + OffloadError::Cancelled => ApiError::ResourceUnavailable("Timeline shutting down".into()), + _ => ApiError::InternalServerError(anyhow!(e)) + } + })?; json_response(StatusCode::OK, ()) } @@ -2060,6 +2066,7 @@ async fn timeline_checkpoint_handler( .map_err(|e| match e { CompactionError::ShuttingDown => ApiError::ShuttingDown, + CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), CompactionError::Other(e) => ApiError::InternalServerError(e) } )?; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8237f4662c..68f8f7e13c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2529,6 +2529,11 @@ impl Tenant { .await .inspect_err(|e| match e { timeline::CompactionError::ShuttingDown => (), + timeline::CompactionError::Offload(_) => { + // Failures to offload timelines do not trip the circuit breaker, because + // they do not do lots of writes the way compaction itself does: it is cheap + // to retry, and it would be bad to stop all compaction because of an issue with offloading. + } timeline::CompactionError::Other(e) => { self.compaction_circuit_breaker .lock() @@ -2544,8 +2549,7 @@ impl Tenant { if pending_task_left == Some(false) && *can_offload { offload_timeline(self, timeline) .instrument(info_span!("offload_timeline", %timeline_id)) - .await - .map_err(timeline::CompactionError::Other)?; + .await?; } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 547739e773..16dac10dca 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -279,6 +279,7 @@ fn log_compaction_error( let decision = match e { ShuttingDown => None, + Offload(_) => Some(LooksLike::Error), _ if task_cancelled => Some(LooksLike::Info), Other(e) => { let root_cause = e.root_cause(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d765a7c987..12919866a3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -20,6 +20,7 @@ use chrono::{DateTime, Utc}; use enumset::EnumSet; use fail::fail_point; use handle::ShardTimelineId; +use offload::OffloadError; use once_cell::sync::Lazy; use pageserver_api::{ key::{ @@ -4475,11 +4476,23 @@ impl Drop for Timeline { pub(crate) enum CompactionError { #[error("The timeline or pageserver is shutting down")] ShuttingDown, + /// Compaction tried to offload a timeline and failed + #[error("Failed to offload timeline: {0}")] + Offload(OffloadError), /// Compaction cannot be done right now; page reconstruction and so on. #[error(transparent)] Other(anyhow::Error), } +impl From for CompactionError { + fn from(e: OffloadError) -> Self { + match e { + OffloadError::Cancelled => Self::ShuttingDown, + _ => Self::Offload(e), + } + } +} + impl CompactionError { pub fn is_cancelled(&self) -> bool { matches!(self, CompactionError::ShuttingDown) diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 5b196cf8a7..c77c240000 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -3,18 +3,40 @@ use std::sync::Arc; use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}; use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::{OffloadedTimeline, Tenant, TimelineOrOffloaded}; +use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded}; + +#[derive(thiserror::Error, Debug)] +pub(crate) enum OffloadError { + #[error("Cancelled")] + Cancelled, + #[error("Timeline is not archived")] + NotArchived, + #[error(transparent)] + RemoteStorage(anyhow::Error), + #[error("Unexpected offload error: {0}")] + Other(anyhow::Error), +} + +impl From for OffloadError { + fn from(e: TenantManifestError) -> Self { + match e { + TenantManifestError::Cancelled => Self::Cancelled, + TenantManifestError::RemoteStorage(e) => Self::RemoteStorage(e), + } + } +} pub(crate) async fn offload_timeline( tenant: &Tenant, timeline: &Arc, -) -> anyhow::Result<()> { +) -> Result<(), OffloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); tracing::info!("offloading archived timeline"); let allow_offloaded_children = true; let (timeline, guard) = - DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children)?; + DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children) + .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; let TimelineOrOffloaded::Timeline(timeline) = timeline else { tracing::error!("timeline already offloaded, but given timeline object"); @@ -26,14 +48,15 @@ pub(crate) async fn offload_timeline( Some(true) => (), Some(false) => { tracing::warn!(?is_archived, "tried offloading a non-archived timeline"); - anyhow::bail!("timeline isn't archived"); + return Err(OffloadError::NotArchived); } None => { - tracing::warn!( + // This is legal: calls to this function can race with the timeline shutting down + tracing::info!( ?is_archived, - "tried offloading a timeline where manifest is not yet available" + "tried offloading a timeline whose remote storage is not initialized" ); - anyhow::bail!("timeline manifest hasn't been loaded yet"); + return Err(OffloadError::Cancelled); } } @@ -44,9 +67,11 @@ pub(crate) async fn offload_timeline( // to make deletions possible while offloading is in progress let conf = &tenant.conf; - delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await?; + delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline) + .await + .map_err(OffloadError::Other)?; - remove_timeline_from_tenant(tenant, &timeline, &guard).await?; + remove_timeline_from_tenant(tenant, &timeline, &guard); { let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap(); @@ -65,21 +90,18 @@ pub(crate) async fn offload_timeline( // at the next restart attach it again. // For that to happen, we'd need to make the manifest reflect our *intended* state, // not our actual state of offloaded timelines. - tenant - .store_tenant_manifest() - .await - .map_err(|e| anyhow::anyhow!(e))?; + tenant.store_tenant_manifest().await?; Ok(()) } /// It is important that this gets called when DeletionGuard is being held. /// For more context see comments in [`DeleteTimelineFlow::prepare`] -async fn remove_timeline_from_tenant( +fn remove_timeline_from_tenant( tenant: &Tenant, timeline: &Timeline, _: &DeletionGuard, // using it as a witness -) -> anyhow::Result<()> { +) { // Remove the timeline from the map. let mut timelines = tenant.timelines.lock().unwrap(); let children_exist = timelines @@ -95,8 +117,4 @@ async fn remove_timeline_from_tenant( timelines .remove(&timeline.timeline_id) .expect("timeline that we were deleting was concurrently removed from 'timelines' map"); - - drop(timelines); - - Ok(()) } diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index c398289a5c..a93bdde3f8 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -141,7 +141,9 @@ impl Drop for UninitializedTimeline<'_> { fn drop(&mut self) { if let Some((_, create_guard)) = self.raw_timeline.take() { let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered(); - error!("Timeline got dropped without initializing, cleaning its files"); + // This is unusual, but can happen harmlessly if the pageserver is stopped while + // creating a timeline. + info!("Timeline got dropped without initializing, cleaning its files"); cleanup_timeline_directory(create_guard); } } diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 99e0e23b4a..124e62999a 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -103,7 +103,6 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder) env.pageserver.allowed_errors.extend( [ ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*", - ".*Timeline got dropped without initializing, cleaning its files.*", ] ) @@ -145,7 +144,6 @@ def test_timeline_init_break_before_checkpoint_recreate( env.pageserver.allowed_errors.extend( [ ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*", - ".*Timeline got dropped without initializing, cleaning its files.*", ".*Failed to load index_part from remote storage, failed creation?.*", ] ) diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index e367db33ff..743fa72aba 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -91,7 +91,6 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build [ ".*Failed to import basebackup.*", ".*unexpected non-zero bytes after the tar archive.*", - ".*Timeline got dropped without initializing, cleaning its files.*", ".*InternalServerError.*timeline not found.*", ".*InternalServerError.*Tenant .* not found.*", ".*InternalServerError.*Timeline .* not found.*", diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index f486327445..47df3ead70 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -146,8 +146,6 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE env.pageserver.allowed_errors.extend( [ - # happens with the cancellation bailing flushing loop earlier, leaving disk_consistent_lsn at zero - ".*Timeline got dropped without initializing, cleaning its files", # the response hit_pausable_failpoint_and_later_fail f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn", ]