diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8e9e3890ba..2e5f69e3c9 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -189,6 +189,7 @@ pub struct TenantSharedResources { /// A [`Tenant`] is really an _attached_ tenant. The configuration /// for an attached tenant is a subset of the [`LocationConf`], represented /// in this struct. +#[derive(Clone)] pub(super) struct AttachedTenantConf { tenant_conf: TenantConfOpt, location: AttachedLocationConfig, @@ -1807,6 +1808,7 @@ impl Tenant { self.tenant_shard_id, timeline_id, self.generation, + &self.tenant_conf.load().location, ) } @@ -2527,6 +2529,10 @@ impl Tenant { { let conf = self.tenant_conf.load(); + // If we may not delete layers, then simply skip GC. Even though a tenant + // in AttachedMulti state could do GC and just enqueue the blocked deletions, + // the only advantage to doing it is to perhaps shrink the LayerMap metadata + // a bit sooner than we would achieve by waiting for AttachedSingle status. if !conf.location.may_delete_layers_hint() { info!("Skipping GC in location state {:?}", conf.location); return Ok(GcResult::default()); } @@ -2568,7 +2574,14 @@ impl Tenant { { let conf = self.tenant_conf.load(); - if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() { + + // Note that compaction usually requires deletions, but we don't respect + // may_delete_layers_hint here: that is because tenants in AttachedMulti + // should proceed with compaction even if they can't do deletion, to avoid + // accumulating dangerously deep stacks of L0 layers. Deletions will be + // enqueued inside RemoteTimelineClient, and executed later if/when we transition + // to AttachedSingle state. 
+ if !conf.location.may_upload_layers_hint() { info!("Skipping compaction in location state {:?}", conf.location); return Ok(false); } @@ -3446,6 +3459,7 @@ impl Tenant { // this race is not possible if both request types come from the storage // controller (as they should!) because an exclusive op lock is required // on the storage controller side. + self.tenant_conf.rcu(|inner| { Arc::new(AttachedTenantConf { tenant_conf: new_tenant_conf.clone(), @@ -3455,20 +3469,22 @@ impl Tenant { }) }); + let updated = self.tenant_conf.load().clone(); + self.tenant_conf_updated(&new_tenant_conf); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. let timelines = self.list_timelines(); for timeline in timelines { - timeline.tenant_conf_updated(&new_tenant_conf); + timeline.tenant_conf_updated(&updated); } } pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) { let new_tenant_conf = new_conf.tenant_conf.clone(); - self.tenant_conf.store(Arc::new(new_conf)); + self.tenant_conf.store(Arc::new(new_conf.clone())); self.tenant_conf_updated(&new_tenant_conf); // Don't hold self.timelines.lock() during the notifies. @@ -3476,7 +3492,7 @@ impl Tenant { // mutexes in struct Timeline in the future. 
let timelines = self.list_timelines(); for timeline in timelines { - timeline.tenant_conf_updated(&new_tenant_conf); + timeline.tenant_conf_updated(&new_conf); } } @@ -4544,6 +4560,7 @@ impl Tenant { self.tenant_shard_id, timeline_id, self.generation, + &self.tenant_conf.load().location, ) } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index b910a40547..377bc23542 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -241,6 +241,7 @@ use utils::id::{TenantId, TimelineId}; use self::index::IndexPart; +use super::config::AttachedLocationConfig; use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, LayerName, ResidentLayer}; use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; @@ -302,6 +303,36 @@ pub enum WaitCompletionError { #[derive(Debug, thiserror::Error)] #[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")] pub struct UploadQueueNotReadyError; +/// Behavioral modes that enable seamless live migration. +/// +/// See docs/rfcs/028-pageserver-migration.md to understand how these fit in. +struct RemoteTimelineClientConfig { + /// If this is false, then updates to remote_consistent_lsn are dropped rather + /// than being submitted to DeletionQueue for validation. This behavior is + /// used when a tenant attachment is known to have a stale generation number, + /// such that validation attempts will always fail. This is not necessary + /// for correctness, but avoids spamming error statistics with failed validations + /// when doing migrations of tenants. + process_remote_consistent_lsn_updates: bool, + + /// If this is true, then object deletions are held in a buffer in RemoteTimelineClient + /// rather than being submitted to the DeletionQueue. 
This behavior is used when a tenant + /// is known to be multi-attached, in order to avoid disrupting other attached tenants + /// whose generations' metadata refers to the deleted objects. + block_deletions: bool, +} + +/// RemoteTimelineClientConfig's state is entirely driven by LocationConf, but we do +/// not carry the entire LocationConf structure: it's much more than we need. The From +/// impl extracts the subset of the LocationConf that is interesting to RemoteTimelineClient. +impl From<&AttachedLocationConfig> for RemoteTimelineClientConfig { + fn from(lc: &AttachedLocationConfig) -> Self { + Self { + block_deletions: !lc.may_delete_layers_hint(), + process_remote_consistent_lsn_updates: lc.may_upload_layers_hint(), + } + } +} /// A client for accessing a timeline's data in remote storage. /// @@ -322,7 +353,7 @@ pub struct UploadQueueNotReadyError; /// in the index part file, whenever timeline metadata is uploaded. /// /// Downloads are not queued, they are performed immediately. -pub struct RemoteTimelineClient { +pub(crate) struct RemoteTimelineClient { conf: &'static PageServerConf, runtime: tokio::runtime::Handle, @@ -339,6 +370,9 @@ pub struct RemoteTimelineClient { deletion_queue_client: DeletionQueueClient, + /// Subset of tenant configuration used to control upload behaviors during migrations + config: std::sync::RwLock, + cancel: CancellationToken, } @@ -349,13 +383,14 @@ impl RemoteTimelineClient { /// Note: the caller must initialize the upload queue before any uploads can be scheduled, /// by calling init_upload_queue. 
/// - pub fn new( + pub(crate) fn new( remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, conf: &'static PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, generation: Generation, + location_conf: &AttachedLocationConfig, ) -> RemoteTimelineClient { RemoteTimelineClient { conf, @@ -375,6 +410,7 @@ impl RemoteTimelineClient { &tenant_shard_id, &timeline_id, )), + config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(location_conf)), cancel: CancellationToken::new(), } } @@ -430,6 +466,43 @@ impl RemoteTimelineClient { Ok(()) } + /// Notify this client of a change to its parent tenant's config, as this may cause us to + /// take action (unblocking deletions when transitioning from AttachedMulti to AttachedSingle) + pub(super) fn update_config(&self, location_conf: &AttachedLocationConfig) { + let new_conf = RemoteTimelineClientConfig::from(location_conf); + let unblocked = !new_conf.block_deletions; + + // Update config before draining deletions, so that we don't race with more being + // inserted. This can result in deletions happening out of order, but that does not + // violate any invariants: deletions only need to be ordered relative to upload of the index + // that dereferences the deleted objects, and we are not changing that order. 
+ *self.config.write().unwrap() = new_conf; + + if unblocked { + // If we may now delete layers, drain any that were blocked in our old + // configuration state + let mut queue_locked = self.upload_queue.lock().unwrap(); + + if let Ok(queue) = queue_locked.initialized_mut() { + let blocked_deletions = std::mem::take(&mut queue.blocked_deletions); + for d in blocked_deletions { + if let Err(e) = self.deletion_queue_client.push_layers_sync( + self.tenant_shard_id, + self.timeline_id, + self.generation, + d.layers, + ) { + // This could happen if the pageserver is shut down while a tenant + // is transitioning from a deletion-blocked state: we will leak some + // S3 objects in this case. + warn!("Failed to drain blocked deletions: {}", e); + break; + } + } + } + } + } + /// Returns `None` if nothing is yet uplodaded, `Some(disk_consistent_lsn)` otherwise. pub fn remote_consistent_lsn_projected(&self) -> Option { match &mut *self.upload_queue.lock().unwrap() { @@ -1913,16 +1986,24 @@ impl RemoteTimelineClient { res } UploadOp::Delete(delete) => { - pausable_failpoint!("before-delete-layer-pausable"); - self.deletion_queue_client - .push_layers( - self.tenant_shard_id, - self.timeline_id, - self.generation, - delete.layers.clone(), - ) - .await - .map_err(|e| anyhow::anyhow!(e)) + if self.config.read().unwrap().block_deletions { + let mut queue_locked = self.upload_queue.lock().unwrap(); + if let Ok(queue) = queue_locked.initialized_mut() { + queue.blocked_deletions.push(delete.clone()); + } + Ok(()) + } else { + pausable_failpoint!("before-delete-layer-pausable"); + self.deletion_queue_client + .push_layers( + self.tenant_shard_id, + self.timeline_id, + self.generation, + delete.layers.clone(), + ) + .await + .map_err(|e| anyhow::anyhow!(e)) + } } unexpected @ UploadOp::Barrier(_) | unexpected @ UploadOp::Shutdown => { // unreachable. 
Barrier operations are handled synchronously in @@ -2029,8 +2110,16 @@ impl RemoteTimelineClient { // Legacy mode: skip validating generation upload_queue.visible_remote_consistent_lsn.store(lsn); None - } else { + } else if self + .config + .read() + .unwrap() + .process_remote_consistent_lsn_updates + { Some((lsn, upload_queue.visible_remote_consistent_lsn.clone())) + } else { + // Our config disables remote_consistent_lsn updates: drop it. + None } } UploadOp::Delete(_) => { @@ -2167,6 +2256,7 @@ impl RemoteTimelineClient { queued_operations: VecDeque::default(), #[cfg(feature = "testing")] dangling_files: HashMap::default(), + blocked_deletions: Vec::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), }; @@ -2402,6 +2492,7 @@ mod tests { use crate::{ context::RequestContext, tenant::{ + config::AttachmentMode, harness::{TenantHarness, TIMELINE_ID}, storage_layer::layer::local_layer_path, Tenant, Timeline, @@ -2487,6 +2578,10 @@ mod tests { /// Construct a RemoteTimelineClient in an arbitrary generation fn build_client(&self, generation: Generation) -> Arc { + let location_conf = AttachedLocationConfig { + generation, + attach_mode: AttachmentMode::Single, + }; Arc::new(RemoteTimelineClient { conf: self.harness.conf, runtime: tokio::runtime::Handle::current(), @@ -2500,6 +2595,7 @@ mod tests { &self.harness.tenant_shard_id, &TIMELINE_ID, )), + config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(&location_conf)), cancel: CancellationToken::new(), }) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a4289a222f..95864af4d0 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -273,7 +273,7 @@ pub struct Timeline { /// Remote storage client. /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details. - pub remote_client: Arc, + pub(crate) remote_client: Arc, // What page versions do we hold in the repository? 
If we get a // request > last_record_lsn, we need to wait until we receive all @@ -2172,14 +2172,14 @@ impl Timeline { ) } - pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { + pub(super) fn tenant_conf_updated(&self, new_conf: &AttachedTenantConf) { // NB: Most tenant conf options are read by background loops, so, // changes will automatically be picked up. // The threshold is embedded in the metric. So, we need to update it. { let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold( - new_conf, + &new_conf.tenant_conf, &self.conf.default_tenant_conf, ); @@ -2187,6 +2187,9 @@ impl Timeline { let shard_id_str = format!("{}", self.tenant_shard_id.shard_slug()); let timeline_id_str = self.timeline_id.to_string(); + + self.remote_client.update_config(&new_conf.location); + self.metrics .evictions_with_low_residence_duration .write() diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 13a8dfa51a..67fc710c44 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -283,7 +283,7 @@ impl DeleteTimelineFlow { /// Shortcut to create Timeline in stopping state and spawn deletion task. #[instrument(skip_all, fields(%timeline_id))] - pub async fn resume_deletion( + pub(crate) async fn resume_deletion( tenant: Arc, timeline_id: TimelineId, local_metadata: &TimelineMetadata, diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 592f41cb21..f14bf2f8c3 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -88,6 +88,9 @@ pub(crate) struct UploadQueueInitialized { #[cfg(feature = "testing")] pub(crate) dangling_files: HashMap, + /// Deletions that are blocked by the tenant configuration + pub(crate) blocked_deletions: Vec, + /// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`. 
pub(crate) shutting_down: bool, @@ -180,6 +183,7 @@ impl UploadQueue { queued_operations: VecDeque::new(), #[cfg(feature = "testing")] dangling_files: HashMap::new(), + blocked_deletions: Vec::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), }; @@ -220,6 +224,7 @@ impl UploadQueue { queued_operations: VecDeque::new(), #[cfg(feature = "testing")] dangling_files: HashMap::new(), + blocked_deletions: Vec::new(), shutting_down: false, shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)), }; @@ -270,7 +275,7 @@ pub(crate) struct UploadTask { /// A deletion of some layers within the lifetime of a timeline. This is not used /// for timeline deletion, which skips this queue and goes directly to DeletionQueue. -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct Delete { pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>, } diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index d4aef96735..12134048e6 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -365,6 +365,19 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): workload.validate(pageserver_a.id) workload.validate(pageserver_b.id) + # Force compaction on destination pageserver + pageserver_b.http_client().timeline_compact(tenant_id, timeline_id, force_l0_compaction=True) + + # Destination pageserver is in AttachedMulti, it should have generated deletions but + # not enqueued them yet. 
+ # Check deletion metrics via prometheus - should be 0 since we're in AttachedMulti + assert ( + pageserver_b.http_client().get_metric_value( + "pageserver_deletion_queue_submitted_total", + ) + == 0 + ) + # Revert the origin to secondary log.info("Setting origin to Secondary") pageserver_a.tenant_location_configure( @@ -389,6 +402,17 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): }, ) + # Transition to AttachedSingle should have drained deletions generated by doing a compaction + # while in AttachedMulti. + def blocked_deletions_drained(): + submitted = pageserver_b.http_client().get_metric_value( + "pageserver_deletion_queue_submitted_total" + ) + assert submitted is not None + assert submitted > 0 + + wait_until(10, 0.1, blocked_deletions_drained) + workload.churn_rows(64, pageserver_b.id) workload.validate(pageserver_b.id) del workload