From 3f676df3d519107ee09b54772652be38734608c1 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 19 Jun 2025 12:53:18 +0300 Subject: [PATCH] pageserver: fix initial layer visibility calculation (#12206) ## Problem GC info is an input to updating layer visibility. Currently, gc info is updated on timeline activation and visibility is computed on tenant attach, so we ignore branch points and compute visibility by taking all layers into account. Side note: gc info is also updated when timelines are created and dropped. That doesn't help because we create the timelines in topological order from the root. Hence the root timeline goes first, without context of where the branch points are. The impact of this in prod is that shards need to rehydrate layers after live migration since the non-visible ones were excluded from the heatmap. ## Summary of Changes Move the visibility calculation into tenant attachment instead of activation. --- pageserver/src/tenant.rs | 30 +++++++++++++++++++++++------- pageserver/src/tenant/timeline.rs | 6 +----- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index cfecf5561c..dc6926f299 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1859,6 +1859,29 @@ impl TenantShard { } } + // At this point we've initialized all timelines and are tracking them. + // Now compute the layer visibility for all (not offloaded) timelines. + let compute_visiblity_for = { + let timelines_accessor = self.timelines.lock().unwrap(); + let mut timelines_offloaded_accessor = self.timelines_offloaded.lock().unwrap(); + + timelines_offloaded_accessor.extend(offloaded_timelines_list.into_iter()); + + // Before activation, populate each Timeline's GcInfo with information about its children + self.initialize_gc_info(&timelines_accessor, &timelines_offloaded_accessor, None); + + timelines_accessor.values().cloned().collect::>() + }; + + for tl in compute_visiblity_for { + tl.update_layer_visibility().await.with_context(|| { + format!( + "failed initial timeline visibility computation {} for tenant {}", + tl.timeline_id, self.tenant_shard_id + ) + })?; + } + // Walk through deleted timelines, resume deletion for (timeline_id, index_part, remote_timeline_client) in timelines_to_resume_deletions { remote_timeline_client @@ -1878,10 +1901,6 @@ impl TenantShard { .context("resume_deletion") .map_err(LoadLocalTimelineError::ResumeDeletion)?; } - { - let mut offloaded_timelines_accessor = self.timelines_offloaded.lock().unwrap(); - offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter()); - } // Stash the preloaded tenant manifest, and upload a new manifest if changed. // @@ -3449,9 +3468,6 @@ impl TenantShard { .values() .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping())); - // Before activation, populate each Timeline's GcInfo with information about its children - self.initialize_gc_info(&timelines_accessor, &timelines_offloaded_accessor, None); - // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. tasks::start_background_loops(self, background_jobs_can_start); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a1969ecae6..b8bfc4f936 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3422,10 +3422,6 @@ impl Timeline { // TenantShard::create_timeline will wait for these uploads to happen before returning, or // on retry. - // Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan) - drop(guard); // drop write lock, update_layer_visibility will take a read lock. - self.update_layer_visibility().await?; - info!( "loaded layer map with {} layers at {}, total physical size: {}", num_layers, disk_consistent_lsn, total_physical_size @@ -5939,7 +5935,7 @@ impl Drop for Timeline { if let Ok(mut gc_info) = ancestor.gc_info.write() { if !gc_info.remove_child_not_offloaded(self.timeline_id) { tracing::error!(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id, - "Couldn't remove retain_lsn entry from offloaded timeline's parent: already removed"); + "Couldn't remove retain_lsn entry from timeline's parent on drop: already removed"); } } }