From 983d56502bb84d18288cc6498a258896e858a38c Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 15 Apr 2025 16:26:29 +0200 Subject: [PATCH] pageserver: reduce shard ancestor rewrite threshold to 30% (#11582) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem When doing power-of-two shard splits (i.e. 4 → 8 → 16), we end up rewriting all layers since half of the pages will be local due to striping. This causes a lot of resource usage when splitting large tenants. ## Summary of changes Drop the threshold of local/total pages to 30%, to reduce the amount of layer rewrites after splits. --- pageserver/src/tenant/timeline/compaction.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 5d5149e2d4..76c153d60f 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -70,6 +70,13 @@ use crate::virtual_file::{MaybeFatalIo, VirtualFile}; /// Maximum number of deltas before generating an image layer in bottom-most compaction. const COMPACTION_DELTA_THRESHOLD: usize = 5; +/// Ratio of shard-local pages below which we trigger shard ancestor layer rewrites. 0.3 means that +/// <= 30% of layer pages must belong to the descendant shard to rewrite the layer. +/// +/// We choose a value < 0.5 to avoid rewriting all visible layers every time we do a power-of-two +/// shard split, which gets expensive for large tenants. +const ANCESTOR_COMPACTION_REWRITE_THRESHOLD: f64 = 0.3; + #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] pub struct GcCompactionJobId(pub usize); @@ -1330,14 +1337,15 @@ impl Timeline { continue; } - // Don't bother re-writing a layer unless it will at least halve its size + // Only rewrite a layer if we can reclaim significant space. 
if layer_local_page_count != u32::MAX - && layer_local_page_count > layer_raw_page_count / 2 + && layer_local_page_count as f64 / layer_raw_page_count as f64 + > ANCESTOR_COMPACTION_REWRITE_THRESHOLD { debug!(%layer, - "layer is already mostly local ({}/{}), not rewriting", - layer_local_page_count, - layer_raw_page_count + "layer has a large share of local pages \ + ({layer_local_page_count}/{layer_raw_page_count} > \ + {ANCESTOR_COMPACTION_REWRITE_THRESHOLD}), not rewriting", ); }