remove gc_feedback mechanism (#6863)

It's been dead code at runtime for 9 months, so let's remove it.
We can always re-introduce it at a later point.

Came across this while working on #6861, which will touch
`time_for_new_image_layer`. This is an opportunity to make that function
simpler.
Author: Christian Schwarz
Date: 2024-02-26 10:05:24 +01:00
Committed by: GitHub
Parent: 8283779ee8
Commit: dedf66ba5b
8 changed files with 6 additions and 87 deletions

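For context, here is a minimal sketch of the feedback loop this PR removes, with simplified stand-in types and hypothetical method names (`gc_publish`, `compaction_take`); the real code uses the pageserver's `Lsn` and `KeySpace` types and the `wanted_image_layers` field shown in the diffs below. GC records the key ranges whose delta layers it could not collect, and the compaction task consumes and clears them:

    use std::ops::Range;
    use std::sync::Mutex;

    // Stand-ins for the pageserver's real Lsn and KeySpace types.
    type Lsn = u64;
    type KeySpace = Vec<Range<u128>>;

    struct Timeline {
        // GC's cutoff LSN plus the key ranges it wants covered by image layers.
        wanted_image_layers: Mutex<Option<(Lsn, KeySpace)>>,
    }

    impl Timeline {
        // GC side: record the ranges whose delta layers it could not collect.
        fn gc_publish(&self, cutoff: Lsn, wanted: KeySpace) {
            *self.wanted_image_layers.lock().unwrap() = Some((cutoff, wanted));
        }

        // Compaction side: consume the request; the next GC cycle re-publishes
        // anything that is still wanted.
        fn compaction_take(&self) -> Option<(Lsn, KeySpace)> {
            self.wanted_image_layers.lock().unwrap().take()
        }
    }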

@@ -391,11 +391,6 @@ impl PageServerNode {
evictions_low_residence_duration_metric_threshold: settings
.remove("evictions_low_residence_duration_metric_threshold")
.map(|x| x.to_string()),
gc_feedback: settings
.remove("gc_feedback")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
lazy_slru_download: settings
.remove("lazy_slru_download")
@@ -501,11 +496,6 @@ impl PageServerNode {
evictions_low_residence_duration_metric_threshold: settings
.remove("evictions_low_residence_duration_metric_threshold")
.map(|x| x.to_string()),
gc_feedback: settings
.remove("gc_feedback")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
lazy_slru_download: settings
.remove("lazy_slru_download")

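As an aside, the removed settings-parsing code relies on the `Option::transpose` idiom to turn "optional string, maybe unparseable" into a single fallible result. A self-contained sketch, with a hypothetical `parse_optional_bool` helper and `anyhow` for error context as in the original:

    use anyhow::Context;

    // Absent stays Ok(None); present but unparseable becomes an error with context.
    fn parse_optional_bool(raw: Option<&str>) -> anyhow::Result<Option<bool>> {
        raw.map(|x| x.parse::<bool>()) // Option<Result<bool, ParseBoolError>>
            .transpose() // Result<Option<bool>, ParseBoolError>
            .context("Failed to parse value as bool")
    }

With this shape, a missing setting and a malformed setting stay distinguishable at the call site.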

@@ -283,7 +283,6 @@ pub struct TenantConfig {
pub eviction_policy: Option<EvictionPolicy>,
pub min_resident_size_override: Option<u64>,
pub evictions_low_residence_duration_metric_threshold: Option<String>,
pub gc_feedback: Option<bool>,
pub heatmap_period: Option<String>,
pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>,


@@ -140,7 +140,6 @@ pub mod defaults {
#min_resident_size_override = .. # in bytes
#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
#gc_feedback = false
#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}


@@ -3641,7 +3641,6 @@ pub(crate) mod harness {
evictions_low_residence_duration_metric_threshold: Some(
tenant_conf.evictions_low_residence_duration_metric_threshold,
),
gc_feedback: Some(tenant_conf.gc_feedback),
heatmap_period: Some(tenant_conf.heatmap_period),
lazy_slru_download: Some(tenant_conf.lazy_slru_download),
timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),


@@ -339,7 +339,6 @@ pub struct TenantConf {
// See the corresponding metric's help string.
#[serde(with = "humantime_serde")]
pub evictions_low_residence_duration_metric_threshold: Duration,
pub gc_feedback: bool,
/// If non-zero, the period between uploads of a heatmap from attached tenants. This
/// may be disabled if a Tenant will not have secondary locations: only secondary
@@ -427,10 +426,6 @@ pub struct TenantConfOpt {
#[serde(default)]
pub evictions_low_residence_duration_metric_threshold: Option<Duration>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub gc_feedback: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(with = "humantime_serde")]
#[serde(default)]
@@ -485,7 +480,6 @@ impl TenantConfOpt {
evictions_low_residence_duration_metric_threshold: self
.evictions_low_residence_duration_metric_threshold
.unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period),
lazy_slru_download: self
.lazy_slru_download
@@ -530,7 +524,6 @@ impl Default for TenantConf {
DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
)
.expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
gc_feedback: false,
heatmap_period: Duration::ZERO,
lazy_slru_download: false,
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
@@ -603,7 +596,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
evictions_low_residence_duration_metric_threshold: value
.evictions_low_residence_duration_metric_threshold
.map(humantime),
gc_feedback: value.gc_feedback,
heatmap_period: value.heatmap_period.map(humantime),
lazy_slru_download: value.lazy_slru_download,
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),

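The hunks above all follow the same layering pattern: each per-tenant setting is an `Option` in `TenantConfOpt` that falls back to the global `TenantConf` default via `unwrap_or`, which is why deleting `gc_feedback` touches every layer. A minimal sketch of that pattern, abbreviated to a single field and with hypothetical struct bodies:

    #[derive(Clone, Copy)]
    struct TenantConf {
        lazy_slru_download: bool,
    }

    #[derive(Default)]
    struct TenantConfOpt {
        lazy_slru_download: Option<bool>,
    }

    impl TenantConfOpt {
        // A per-tenant override wins; otherwise fall back to the global default.
        fn merge(&self, global: TenantConf) -> TenantConf {
            TenantConf {
                lazy_slru_download: self.lazy_slru_download.unwrap_or(global.lazy_slru_download),
            }
        }
    }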

@@ -76,7 +76,7 @@ use crate::{
use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
use crate::config::PageServerConf;
-use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
+use crate::keyspace::{KeyPartitioning, KeySpace};
use crate::metrics::{
TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
};
@@ -210,17 +210,6 @@ pub struct Timeline {
/// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
pub(crate) layers: Arc<tokio::sync::RwLock<LayerManager>>,
/// Set of key ranges which should be covered by image layers to
/// allow GC to remove old layers. This set is created by GC, and its cutoff LSN is stored with it.
/// It is used by the compaction task when it checks whether a new image layer should be created.
/// A newly created image layer doesn't help to remove a delta layer until the
/// image layer falls off the PITR horizon, so on the next GC cycle
/// gc_timeline may still want the new image layer to be created. To avoid redundant
/// image layer creation we check whether an image layer already exists at a more recent LSN, beyond the PITR horizon.
/// This is why we need to remember the GC cutoff LSN.
///
wanted_image_layers: Mutex<Option<(Lsn, KeySpace)>>,
last_freeze_at: AtomicLsn,
// Atomic would be more appropriate here.
last_freeze_ts: RwLock<Instant>,
@@ -1516,13 +1505,6 @@ impl Timeline {
.unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
}
fn get_gc_feedback(&self) -> bool {
let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf
.gc_feedback
.unwrap_or(self.conf.default_tenant_conf.gc_feedback)
}
pub(super) fn tenant_conf_updated(&self) {
// NB: Most tenant conf options are read by background loops, so,
// changes will automatically be picked up.
@@ -1596,7 +1578,6 @@ impl Timeline {
shard_identity,
pg_version,
layers: Default::default(),
wanted_image_layers: Mutex::new(None),
walredo_mgr,
walreceiver: Mutex::new(None),
@@ -3408,31 +3389,6 @@ impl Timeline {
let layers = guard.layer_map();
let mut max_deltas = 0;
{
let wanted_image_layers = self.wanted_image_layers.lock().unwrap();
if let Some((cutoff_lsn, wanted)) = &*wanted_image_layers {
let img_range =
partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end;
if wanted.overlaps(&img_range) {
//
// gc_timeline only pays attention to image layers that are older than the GC cutoff,
// but create_image_layers creates image layers at last-record-lsn.
// So it's possible that gc_timeline wants a new image layer to be created for a key range,
// but the range is already covered by image layers at more recent LSNs. Before we
// create a new image layer, check if the range is already covered at more recent LSNs.
if !layers
.image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))
{
debug!(
"Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})",
img_range.start, img_range.end, cutoff_lsn, lsn
);
return true;
}
}
}
}
for part_range in &partition.ranges {
let image_coverage = layers.image_coverage(part_range, lsn);
for (img_range, last_img) in image_coverage {
@@ -3603,12 +3559,6 @@ impl Timeline {
tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
}
}
// All layers that the GC wanted us to create have now been created.
//
// It's possible that another GC cycle happened while we were compacting, and added
// something new to wanted_image_layers, and we now clear that before processing it.
// That's OK, because the next GC iteration will put it back in.
*self.wanted_image_layers.lock().unwrap() = None;
// Sync the new layer to disk before adding it to the layer map, to make sure
// we don't garbage collect something based on the new layer, before it has
@@ -4518,7 +4468,6 @@ impl Timeline {
debug!("retain_lsns: {:?}", retain_lsns);
let mut layers_to_remove = Vec::new();
let mut wanted_image_layers = KeySpaceRandomAccum::default();
// Scan all layers in the timeline (remote or on-disk).
//
@@ -4600,15 +4549,6 @@ impl Timeline {
.image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
{
debug!("keeping {} because it is the latest layer", l.filename());
// Collect delta key ranges that need image layers to allow garbage
// collecting the layers.
// It is not obvious whether we need to propagate information only about
// delta layers: image layers can form "stairs" preventing old images from being deleted,
// but image layers are in any case less sparse than delta layers. Also, we need some
// protection against replacing recent image layers with new ones after each GC iteration.
if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&l) {
wanted_image_layers.add_range(l.get_key_range());
}
result.layers_not_updated += 1;
continue 'outer;
}
@@ -4621,10 +4561,6 @@ impl Timeline {
);
layers_to_remove.push(l);
}
self.wanted_image_layers
.lock()
.unwrap()
.replace((new_gc_cutoff, wanted_image_layers.to_keyspace()));
if !layers_to_remove.is_empty() {
// Persist the new GC cutoff value before we actually remove anything.

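To summarize the branch removed from `time_for_new_image_layer`: it forced image-layer creation only when GC had requested an overlapping key range and no image layer already covered that range at a recent-enough LSN. A hedged sketch of that decision, with stand-in types; `image_layer_exists` here mimics the `LayerMap` query used in the diff:

    use std::ops::Range;

    type Lsn = u64;
    type Key = u128;

    // Force an image layer for `img_range` only if GC asked for an overlapping
    // range and no image layer already covers it at a recent-enough LSN
    // (between min(lsn, cutoff) and lsn, inclusive).
    fn force_image_layer_for_gc(
        wanted: &Option<(Lsn, Range<Key>)>,
        img_range: &Range<Key>,
        lsn: Lsn,
        image_layer_exists: impl Fn(&Range<Key>, &Range<Lsn>) -> bool,
    ) -> bool {
        match wanted {
            Some((cutoff_lsn, w)) if w.start < img_range.end && img_range.start < w.end => {
                !image_layer_exists(img_range, &(lsn.min(*cutoff_lsn)..lsn + 1))
            }
            _ => false,
        }
    }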

@@ -13,6 +13,11 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma
Information about the image layers needed to collect old layers should
be propagated by GC to the compaction task, which should take it into
account when deciding which new image layers need to be created.
NB: this test demonstrates the problem. The source tree contained the
`gc_feedback` mechanism for about 9 months, but there were problems
with it and it wasn't enabled at runtime.
This PR removed the code: https://github.com/neondatabase/neon/pull/6863
"""
env = neon_env_builder.init_start()
client = env.pageserver.http_client()


@@ -166,7 +166,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
"threshold": "23h",
},
"evictions_low_residence_duration_metric_threshold": "2days",
"gc_feedback": True,
"gc_horizon": 23 * (1024 * 1024),
"gc_period": "2h 13m",
"heatmap_period": "10m",