pageserver: enable compaction to proceed while live-migrating (#5397)

## Problem Long ago, in #5299, the tenant states for migration were added, but respected only in a coarse-grained way: when hinted not to do deletions, tenants just avoid doing all GC or compaction. Skipping compaction is not necessary for AttachedMulti, as we will soon become the primary attached location, and it is not a waste of resources to proceed with compaction. Instead, per the RFC (https://github.com/neondatabase/neon/pull/5029/files), deletions should be queued up in this state, and executed later when we switch to AttachedSingle. Avoiding compaction in AttachedMulti can have an operational impact if a tenant is under significant write load, as a long-running migration can result in a large accumulation of delta layers with commensurate impact on read latency. Closes: https://github.com/neondatabase/neon/issues/5396 ## Summary of changes - Add a 'config' part to RemoteTimelineClient so that it can be aware of the mode of the tenant it belongs to, and wire this through for construction + updates - Add a special buffer for delayed deletions, and when in AttachedMulti route deletions here instead of into the main remote client queue. This is drained when transitioning to AttachedSingle. If the tenant is detached or our process dies before then, then these objects are leaked. - As a quality of life improvement, also use the remote timeline client's knowledge of the tenant state to avoid submitting remote consistent LSN updates for validation when in AttachedStale (as we know these will fail) ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? If so, did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist
2026-01-05 04:22:56 +00:00 · 2024-11-20 17:31:55 +00:00
parent 67f5f83edc
commit 5ff2f1ee7d
6 changed files with 167 additions and 22 deletions
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -189,6 +189,7 @@ pub struct TenantSharedResources {
 /// A [`Tenant`] is really an _attached_ tenant.  The configuration
 /// for an attached tenant is a subset of the [`LocationConf`], represented
 /// in this struct.
+#[derive(Clone)]
 pub(super) struct AttachedTenantConf {
    tenant_conf: TenantConfOpt,
    location: AttachedLocationConfig,
@@ -1807,6 +1808,7 @@ impl Tenant {
            self.tenant_shard_id,
            timeline_id,
            self.generation,
+            &self.tenant_conf.load().location,
        )
    }

@@ -2527,6 +2529,10 @@ impl Tenant {
        {
            let conf = self.tenant_conf.load();

+            // If we may not delete layers, then simply skip GC.  Even though a tenant
+            // in AttachedMulti state could do GC and just enqueue the blocked deletions,
+            // the only advantage to doing it is to perhaps shrink the LayerMap metadata
+            // a bit sooner than we would achieve by waiting for AttachedSingle status.
            if !conf.location.may_delete_layers_hint() {
                info!("Skipping GC in location state {:?}", conf.location);
                return Ok(GcResult::default());
@@ -2568,7 +2574,14 @@ impl Tenant {

        {
            let conf = self.tenant_conf.load();
-            if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
+
+            // Note that compaction usually requires deletions, but we don't respect
+            // may_delete_layers_hint here: that is because tenants in AttachedMulti
+            // should proceed with compaction even if they can't do deletion, to avoid
+            // accumulating dangerously deep stacks of L0 layers.  Deletions will be
+            // enqueued inside RemoteTimelineClient, and executed later if/when we transition
+            // to AttachedSingle state.
+            if !conf.location.may_upload_layers_hint() {
                info!("Skipping compaction in location state {:?}", conf.location);
                return Ok(false);
            }
@@ -3446,6 +3459,7 @@ impl Tenant {
        // this race is not possible if both request types come from the storage
        // controller (as they should!) because an exclusive op lock is required
        // on the storage controller side.
+
        self.tenant_conf.rcu(|inner| {
            Arc::new(AttachedTenantConf {
                tenant_conf: new_tenant_conf.clone(),
@@ -3455,20 +3469,22 @@ impl Tenant {
            })
        });

+        let updated = self.tenant_conf.load().clone();
+
        self.tenant_conf_updated(&new_tenant_conf);
        // Don't hold self.timelines.lock() during the notifies.
        // There's no risk of deadlock right now, but there could be if we consolidate
        // mutexes in struct Timeline in the future.
        let timelines = self.list_timelines();
        for timeline in timelines {
-            timeline.tenant_conf_updated(&new_tenant_conf);
+            timeline.tenant_conf_updated(&updated);
        }
    }

    pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
        let new_tenant_conf = new_conf.tenant_conf.clone();

-        self.tenant_conf.store(Arc::new(new_conf));
+        self.tenant_conf.store(Arc::new(new_conf.clone()));

        self.tenant_conf_updated(&new_tenant_conf);
        // Don't hold self.timelines.lock() during the notifies.
@@ -3476,7 +3492,7 @@ impl Tenant {
        // mutexes in struct Timeline in the future.
        let timelines = self.list_timelines();
        for timeline in timelines {
-            timeline.tenant_conf_updated(&new_tenant_conf);
+            timeline.tenant_conf_updated(&new_conf);
        }
    }

@@ -4544,6 +4560,7 @@ impl Tenant {
            self.tenant_shard_id,
            timeline_id,
            self.generation,
+            &self.tenant_conf.load().location,
        )
    }

--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -241,6 +241,7 @@ use utils::id::{TenantId, TimelineId};

 use self::index::IndexPart;

+use super::config::AttachedLocationConfig;
 use super::metadata::MetadataUpdate;
 use super::storage_layer::{Layer, LayerName, ResidentLayer};
 use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
@@ -302,6 +303,36 @@ pub enum WaitCompletionError {
 #[derive(Debug, thiserror::Error)]
 #[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")]
 pub struct UploadQueueNotReadyError;
+/// Behavioral modes that enable seamless live migration.
+///
+/// See docs/rfcs/028-pageserver-migration.md to understand how these fit in.
+struct RemoteTimelineClientConfig {
+    /// If this is false, then updates to remote_consistent_lsn are dropped rather
+    /// than being submitted to DeletionQueue for validation.  This behavior is
+    /// used when a tenant attachment is known to have a stale generation number,
+    /// such that validation attempts will always fail.  This is not necessary
+    /// for correctness, but avoids spamming error statistics with failed validations
+    /// when doing migrations of tenants.
+    process_remote_consistent_lsn_updates: bool,
+
+    /// If this is true, then object deletions are held in a buffer in RemoteTimelineClient
+    /// rather than being submitted to the DeletionQueue.  This behavior is used when a tenant
+    /// is known to be multi-attached, in order to avoid disrupting other attached tenants
+    /// whose generations' metadata refers to the deleted objects.
+    block_deletions: bool,
+}
+
+/// RemoteTimelineClientConfig's state is entirely driven by LocationConf, but we do
+/// not carry the entire LocationConf structure: it's much more than we need.  The From
+/// impl extracts the subset of the LocationConf that is interesting to RemoteTimelineClient.
+impl From<&AttachedLocationConfig> for RemoteTimelineClientConfig {
+    fn from(lc: &AttachedLocationConfig) -> Self {
+        Self {
+            block_deletions: !lc.may_delete_layers_hint(),
+            process_remote_consistent_lsn_updates: lc.may_upload_layers_hint(),
+        }
+    }
+}

 /// A client for accessing a timeline's data in remote storage.
 ///
@@ -322,7 +353,7 @@ pub struct UploadQueueNotReadyError;
 /// in the index part file, whenever timeline metadata is uploaded.
 ///
 /// Downloads are not queued, they are performed immediately.
-pub struct RemoteTimelineClient {
+pub(crate) struct RemoteTimelineClient {
    conf: &'static PageServerConf,

    runtime: tokio::runtime::Handle,
@@ -339,6 +370,9 @@ pub struct RemoteTimelineClient {

    deletion_queue_client: DeletionQueueClient,

+    /// Subset of tenant configuration used to control upload behaviors during migrations
+    config: std::sync::RwLock<RemoteTimelineClientConfig>,
+
    cancel: CancellationToken,
 }

@@ -349,13 +383,14 @@ impl RemoteTimelineClient {
    /// Note: the caller must initialize the upload queue before any uploads can be scheduled,
    /// by calling init_upload_queue.
    ///
-    pub fn new(
+    pub(crate) fn new(
        remote_storage: GenericRemoteStorage,
        deletion_queue_client: DeletionQueueClient,
        conf: &'static PageServerConf,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        generation: Generation,
+        location_conf: &AttachedLocationConfig,
    ) -> RemoteTimelineClient {
        RemoteTimelineClient {
            conf,
@@ -375,6 +410,7 @@ impl RemoteTimelineClient {
                &tenant_shard_id,
                &timeline_id,
            )),
+            config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(location_conf)),
            cancel: CancellationToken::new(),
        }
    }
@@ -430,6 +466,43 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// Notify this client of a change to its parent tenant's config, as this may cause us to
+    /// take action (unblocking deletions when transitioning from AttachedMulti to AttachedSingle)
+    pub(super) fn update_config(&self, location_conf: &AttachedLocationConfig) {
+        let new_conf = RemoteTimelineClientConfig::from(location_conf);
+        let unblocked = !new_conf.block_deletions;
+
+        // Update config before draining deletions, so that we don't race with more being
+        // inserted.  This can result in deletions happening out of order, but that does not
+        // violate any invariants: deletions only need to be ordered relative to upload of the index
+        // that dereferences the deleted objects, and we are not changing that order.
+        *self.config.write().unwrap() = new_conf;
+
+        if unblocked {
+            // If we may now delete layers, drain any that were blocked in our old
+            // configuration state
+            let mut queue_locked = self.upload_queue.lock().unwrap();
+
+            if let Ok(queue) = queue_locked.initialized_mut() {
+                let blocked_deletions = std::mem::take(&mut queue.blocked_deletions);
+                for d in blocked_deletions {
+                    if let Err(e) = self.deletion_queue_client.push_layers_sync(
+                        self.tenant_shard_id,
+                        self.timeline_id,
+                        self.generation,
+                        d.layers,
+                    ) {
+                        // This could happen if the pageserver is shut down while a tenant
+                        // is transitioning from a deletion-blocked state: we will leak some
+                        // S3 objects in this case.
+                        warn!("Failed to drain blocked deletions: {}", e);
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
    /// Returns `None` if nothing is yet uplodaded, `Some(disk_consistent_lsn)` otherwise.
    pub fn remote_consistent_lsn_projected(&self) -> Option<Lsn> {
        match &mut *self.upload_queue.lock().unwrap() {
@@ -1913,16 +1986,24 @@ impl RemoteTimelineClient {
                    res
                }
                UploadOp::Delete(delete) => {
-                    pausable_failpoint!("before-delete-layer-pausable");
-                    self.deletion_queue_client
-                        .push_layers(
-                            self.tenant_shard_id,
-                            self.timeline_id,
-                            self.generation,
-                            delete.layers.clone(),
-                        )
-                        .await
-                        .map_err(|e| anyhow::anyhow!(e))
+                    if self.config.read().unwrap().block_deletions {
+                        let mut queue_locked = self.upload_queue.lock().unwrap();
+                        if let Ok(queue) = queue_locked.initialized_mut() {
+                            queue.blocked_deletions.push(delete.clone());
+                        }
+                        Ok(())
+                    } else {
+                        pausable_failpoint!("before-delete-layer-pausable");
+                        self.deletion_queue_client
+                            .push_layers(
+                                self.tenant_shard_id,
+                                self.timeline_id,
+                                self.generation,
+                                delete.layers.clone(),
+                            )
+                            .await
+                            .map_err(|e| anyhow::anyhow!(e))
+                    }
                }
                unexpected @ UploadOp::Barrier(_) | unexpected @ UploadOp::Shutdown => {
                    // unreachable. Barrier operations are handled synchronously in
@@ -2029,8 +2110,16 @@ impl RemoteTimelineClient {
                        // Legacy mode: skip validating generation
                        upload_queue.visible_remote_consistent_lsn.store(lsn);
                        None
-                    } else {
+                    } else if self
+                        .config
+                        .read()
+                        .unwrap()
+                        .process_remote_consistent_lsn_updates
+                    {
                        Some((lsn, upload_queue.visible_remote_consistent_lsn.clone()))
+                    } else {
+                        // Our config disables remote_consistent_lsn updates: drop it.
+                        None
                    }
                }
                UploadOp::Delete(_) => {
@@ -2167,6 +2256,7 @@ impl RemoteTimelineClient {
                        queued_operations: VecDeque::default(),
                        #[cfg(feature = "testing")]
                        dangling_files: HashMap::default(),
+                        blocked_deletions: Vec::new(),
                        shutting_down: false,
                        shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
                    };
@@ -2402,6 +2492,7 @@ mod tests {
    use crate::{
        context::RequestContext,
        tenant::{
+            config::AttachmentMode,
            harness::{TenantHarness, TIMELINE_ID},
            storage_layer::layer::local_layer_path,
            Tenant, Timeline,
@@ -2487,6 +2578,10 @@ mod tests {

        /// Construct a RemoteTimelineClient in an arbitrary generation
        fn build_client(&self, generation: Generation) -> Arc<RemoteTimelineClient> {
+            let location_conf = AttachedLocationConfig {
+                generation,
+                attach_mode: AttachmentMode::Single,
+            };
            Arc::new(RemoteTimelineClient {
                conf: self.harness.conf,
                runtime: tokio::runtime::Handle::current(),
@@ -2500,6 +2595,7 @@ mod tests {
                    &self.harness.tenant_shard_id,
                    &TIMELINE_ID,
                )),
+                config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(&location_conf)),
                cancel: CancellationToken::new(),
            })
        }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -273,7 +273,7 @@ pub struct Timeline {

    /// Remote storage client.
    /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
-    pub remote_client: Arc<RemoteTimelineClient>,
+    pub(crate) remote_client: Arc<RemoteTimelineClient>,

    // What page versions do we hold in the repository? If we get a
    // request > last_record_lsn, we need to wait until we receive all
@@ -2172,14 +2172,14 @@ impl Timeline {
            )
    }

-    pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
+    pub(super) fn tenant_conf_updated(&self, new_conf: &AttachedTenantConf) {
        // NB: Most tenant conf options are read by background loops, so,
        // changes will automatically be picked up.

        // The threshold is embedded in the metric. So, we need to update it.
        {
            let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
-                new_conf,
+                &new_conf.tenant_conf,
                &self.conf.default_tenant_conf,
            );

@@ -2187,6 +2187,9 @@ impl Timeline {
            let shard_id_str = format!("{}", self.tenant_shard_id.shard_slug());

            let timeline_id_str = self.timeline_id.to_string();
+
+            self.remote_client.update_config(&new_conf.location);
+
            self.metrics
                .evictions_with_low_residence_duration
                .write()
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -283,7 +283,7 @@ impl DeleteTimelineFlow {

    /// Shortcut to create Timeline in stopping state and spawn deletion task.
    #[instrument(skip_all, fields(%timeline_id))]
-    pub async fn resume_deletion(
+    pub(crate) async fn resume_deletion(
        tenant: Arc<Tenant>,
        timeline_id: TimelineId,
        local_metadata: &TimelineMetadata,
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -88,6 +88,9 @@ pub(crate) struct UploadQueueInitialized {
    #[cfg(feature = "testing")]
    pub(crate) dangling_files: HashMap<LayerName, Generation>,

+    /// Deletions that are blocked by the tenant configuration
+    pub(crate) blocked_deletions: Vec<Delete>,
+
    /// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`.
    pub(crate) shutting_down: bool,

@@ -180,6 +183,7 @@ impl UploadQueue {
            queued_operations: VecDeque::new(),
            #[cfg(feature = "testing")]
            dangling_files: HashMap::new(),
+            blocked_deletions: Vec::new(),
            shutting_down: false,
            shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
        };
@@ -220,6 +224,7 @@ impl UploadQueue {
            queued_operations: VecDeque::new(),
            #[cfg(feature = "testing")]
            dangling_files: HashMap::new(),
+            blocked_deletions: Vec::new(),
            shutting_down: false,
            shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
        };
@@ -270,7 +275,7 @@ pub(crate) struct UploadTask {

 /// A deletion of some layers within the lifetime of a timeline.  This is not used
 /// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub(crate) struct Delete {
    pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>,
 }
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -365,6 +365,19 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
    workload.validate(pageserver_a.id)
    workload.validate(pageserver_b.id)

+    # Force compaction on destination pageserver
+    pageserver_b.http_client().timeline_compact(tenant_id, timeline_id, force_l0_compaction=True)
+
+    # Destination pageserver is in AttachedMulti, it should have generated deletions but
+    # not enqueued them yet.
+    # Check deletion metrics via prometheus - should be 0 since we're in AttachedMulti
+    assert (
+        pageserver_b.http_client().get_metric_value(
+            "pageserver_deletion_queue_submitted_total",
+        )
+        == 0
+    )
+
    # Revert the origin to secondary
    log.info("Setting origin to Secondary")
    pageserver_a.tenant_location_configure(
@@ -389,6 +402,17 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
        },
    )

+    # Transition to AttachedSingle should have drained deletions generated by doing a compaction
+    # while in AttachedMulti.
+    def blocked_deletions_drained():
+        submitted = pageserver_b.http_client().get_metric_value(
+            "pageserver_deletion_queue_submitted_total"
+        )
+        assert submitted is not None
+        assert submitted > 0
+
+    wait_until(10, 0.1, blocked_deletions_drained)
+
    workload.churn_rows(64, pageserver_b.id)
    workload.validate(pageserver_b.id)
    del workload