From 7a1331eee56a1590ef4fb73f07e70c013c7d9c84 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Tue, 29 Oct 2024 13:54:48 +0000
Subject: [PATCH] pageserver: make concurrent offloaded timeline operations safe wrt manifest uploads (#9557)

## Problem

Uploads of the tenant manifest could race between different tasks, resulting in unexpected content in remote storage.

Closes: https://github.com/neondatabase/neon/issues/9556

## Summary of changes

- Create a central function for manifest uploads that serializes them with a tokio::sync::Mutex
- Store the latest upload in that Mutex, so that when there is a lot of concurrency (e.g. archiving 20 timelines at once) we can coalesce their manifest writes somewhat (a standalone sketch of this pattern is appended after the diff).
---
 pageserver/src/tenant.rs                      | 100 +++++++++++++-----
 .../src/tenant/remote_timeline_client.rs      |   2 +-
 .../tenant/remote_timeline_client/manifest.rs |   4 +-
 pageserver/src/tenant/timeline/delete.rs      |  43 +++-----
 pageserver/src/tenant/timeline/offload.rs     |  17 +--
 5 files changed, 94 insertions(+), 72 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 7f8af67c2c..64e4eb46ce 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -302,6 +302,13 @@ pub struct Tenant {
     /// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating`
     timelines_offloaded: Mutex<HashMap<TimelineId, Arc<OffloadedTimeline>>>,
 
+    /// Serialize writes of the tenant manifest to remote storage. If there are concurrent operations
+    /// affecting the manifest, such as timeline deletion and timeline offload, they must wait for
+    /// each other (this could be optimized to coalesce writes if necessary).
+    ///
+    /// The contents of the Mutex are the last manifest we successfully uploaded
+    tenant_manifest_upload: tokio::sync::Mutex<Option<TenantManifest>>,
+
     // This mutex prevents creation of new timelines during GC.
     // Adding yet another mutex (in addition to `timelines`) is needed because holding
     // `timelines` mutex during all GC iteration
@@ -741,6 +748,24 @@ pub enum TimelineArchivalError {
     Other(anyhow::Error),
 }
 
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum TenantManifestError {
+    #[error("Remote storage error: {0}")]
+    RemoteStorage(anyhow::Error),
+
+    #[error("Cancelled")]
+    Cancelled,
+}
+
+impl From<TenantManifestError> for TimelineArchivalError {
+    fn from(e: TenantManifestError) -> Self {
+        match e {
+            TenantManifestError::RemoteStorage(e) => Self::Other(e),
+            TenantManifestError::Cancelled => Self::Cancelled,
+        }
+    }
+}
+
 impl Debug for TimelineArchivalError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
@@ -1526,18 +1551,7 @@ impl Tenant {
             offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter());
         }
         if !offloaded_timeline_ids.is_empty() {
-            let manifest = self.tenant_manifest();
-            // TODO: generation support
-            let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
-            upload_tenant_manifest(
-                &self.remote_storage,
-                &self.tenant_shard_id,
-                generation,
-                &manifest,
-                &self.cancel,
-            )
-            .await
-            .map_err(TimelineArchivalError::Other)?;
+            self.store_tenant_manifest().await?;
         }
 
         // The local filesystem contents are a cache of what's in the remote IndexPart;
@@ -1918,18 +1932,7 @@
         };
 
         // Upload new list of offloaded timelines to S3
-        let manifest = self.tenant_manifest();
-        // TODO: generation support
-        let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
-        upload_tenant_manifest(
-            &self.remote_storage,
-            &self.tenant_shard_id,
-            generation,
-            &manifest,
-            &cancel,
-        )
-        .await
-        .map_err(TimelineArchivalError::Other)?;
+        self.store_tenant_manifest().await?;
 
         // Activate the timeline (if it makes sense)
         if !(timeline.is_broken() || timeline.is_stopping()) {
@@ -3126,7 +3129,7 @@
             }
         }
 
-        let tenant_manifest = self.tenant_manifest();
+        let tenant_manifest = self.build_tenant_manifest();
         // TODO: generation support
         let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
         for child_shard in child_shards {
@@ -3321,7 +3324,8 @@ impl Tenant {
             .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
     }
 
-    pub(crate) fn tenant_manifest(&self) -> TenantManifest {
+    /// Generate an up-to-date TenantManifest based on the state of this Tenant.
+    fn build_tenant_manifest(&self) -> TenantManifest {
         let timelines_offloaded = self.timelines_offloaded.lock().unwrap();
 
         let mut timeline_manifests = timelines_offloaded
@@ -3529,6 +3533,7 @@ impl Tenant {
             timelines: Mutex::new(HashMap::new()),
             timelines_creating: Mutex::new(HashSet::new()),
             timelines_offloaded: Mutex::new(HashMap::new()),
+            tenant_manifest_upload: Default::default(),
             gc_cs: tokio::sync::Mutex::new(()),
             walredo_mgr,
             remote_storage,
@@ -4708,6 +4713,49 @@ impl Tenant {
             .max()
             .unwrap_or(0)
     }
+
+    /// Serialize and write the latest TenantManifest to remote storage.
+    pub(crate) async fn store_tenant_manifest(&self) -> Result<(), TenantManifestError> {
+        // Only one manifest write may be done at a time, and the contents of the manifest
+        // must be loaded while holding this lock. This makes it safe to call this function
+        // from anywhere without worrying about colliding updates.
+        let mut guard = tokio::select! {
+            g = self.tenant_manifest_upload.lock() => {
+                g
+            },
+            _ = self.cancel.cancelled() => {
+                return Err(TenantManifestError::Cancelled);
+            }
+        };
+
+        let manifest = self.build_tenant_manifest();
+        if Some(&manifest) == (*guard).as_ref() {
+            // Optimisation: skip uploads that don't change anything.
+            return Ok(());
+        }
+
+        upload_tenant_manifest(
+            &self.remote_storage,
+            &self.tenant_shard_id,
+            self.generation,
+            &manifest,
+            &self.cancel,
+        )
+        .await
+        .map_err(|e| {
+            if self.cancel.is_cancelled() {
+                TenantManifestError::Cancelled
+            } else {
+                TenantManifestError::RemoteStorage(e)
+            }
+        })?;
+
+        // Store the successfully uploaded manifest, so that future callers can avoid
+        // re-uploading the same thing.
+        *guard = Some(manifest);
+
+        Ok(())
+    }
 }
 
 /// Create the cluster temporarily in 'initdbpath' directory inside the repository
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 1c72c7fff8..19e762b9fa 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -249,7 +249,7 @@ pub(crate) use download::{
     list_remote_tenant_shards, list_remote_timelines,
 };
 pub(crate) use index::LayerFileMetadata;
-pub(crate) use upload::{upload_initdb_dir, upload_tenant_manifest};
+pub(crate) use upload::upload_initdb_dir;
 
 // Occasional network issues and such can cause remote operations to fail, and
 // that's expected. If a download fails, we log it at info-level, and retry.
diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs
index 7d92d45146..c4382cb648 100644
--- a/pageserver/src/tenant/remote_timeline_client/manifest.rs
+++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use utils::{id::TimelineId, lsn::Lsn};
 
 /// Tenant-shard scoped manifest
-#[derive(Clone, Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub struct TenantManifest {
     /// Debugging aid describing the version of this manifest.
     /// Can also be used for distinguishing breaking changes later on.
@@ -23,7 +23,7 @@ pub struct TenantManifest {
 /// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`],
 /// but the two datastructures serve different needs, this is for a persistent disk format
 /// that must be backwards compatible, while the other is only for informative purposes.
-#[derive(Clone, Serialize, Deserialize, Copy)]
+#[derive(Clone, Serialize, Deserialize, Copy, PartialEq, Eq)]
 pub struct OffloadedTimelineManifest {
     pub timeline_id: TimelineId,
     /// Whether the timeline has a parent it has been branched off from or not
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 53b65da515..2c6161da15 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -14,10 +14,9 @@ use crate::{
     task_mgr::{self, TaskKind},
     tenant::{
         metadata::TimelineMetadata,
-        remote_timeline_client::{
-            self, MaybeDeletedIndexPart, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
-        },
-        CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded,
+        remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
+        CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant,
+        TimelineOrOffloaded,
     },
 };
 
@@ -176,32 +175,6 @@ async fn remove_maybe_offloaded_timeline_from_tenant(
     Ok(())
 }
 
-/// It is important that this gets called when DeletionGuard is being held.
-/// For more context see comments in [`DeleteTimelineFlow::prepare`]
-async fn upload_new_tenant_manifest(
-    tenant: &Tenant,
-    _: &DeletionGuard, // using it as a witness
-) -> anyhow::Result<()> {
-    // This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash
-    // between the deletion of the index-part.json and reaching of this code.
-    // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
-    // However, we handle this case in tenant loading code so the next time we attach, the issue is
-    // resolved.
-    let manifest = tenant.tenant_manifest();
-    // TODO: generation support
-    let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
-    remote_timeline_client::upload_tenant_manifest(
-        &tenant.remote_storage,
-        &tenant.tenant_shard_id,
-        generation,
-        &manifest,
-        &tenant.cancel,
-    )
-    .await?;
-
-    Ok(())
-}
-
 /// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures,
 /// and deletes its data from both disk and s3.
 /// The sequence of steps:
@@ -480,7 +453,15 @@ impl DeleteTimelineFlow {
 
         remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?;
 
-        upload_new_tenant_manifest(tenant, &guard).await?;
+        // This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash
+        // between the deletion of the index-part.json and reaching of this code.
+        // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
+        // However, we handle this case in tenant loading code so the next time we attach, the issue is
+        // resolved.
+ tenant + .store_tenant_manifest() + .await + .map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!(e)))?; *guard = Self::Finished; diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 8e6eceb084..305c139b54 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}; use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::{remote_timeline_client, OffloadedTimeline, Tenant, TimelineOrOffloaded}; +use crate::tenant::{OffloadedTimeline, Tenant, TimelineOrOffloaded}; pub(crate) async fn offload_timeline( tenant: &Tenant, @@ -63,17 +63,10 @@ pub(crate) async fn offload_timeline( // at the next restart attach it again. // For that to happen, we'd need to make the manifest reflect our *intended* state, // not our actual state of offloaded timelines. - let manifest = tenant.tenant_manifest(); - // TODO: generation support - let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; - remote_timeline_client::upload_tenant_manifest( - &tenant.remote_storage, - &tenant.tenant_shard_id, - generation, - &manifest, - &tenant.cancel, - ) - .await?; + tenant + .store_tenant_manifest() + .await + .map_err(|e| anyhow::anyhow!(e))?; Ok(()) }