From 090123a4292d56c811a39a7a59a918b7114fd85f Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 28 Mar 2024 17:44:55 +0000
Subject: [PATCH] pageserver: check for new image layers based on ingested WAL
 (#7230)

## Problem
Part of the legacy (but current) compaction algorithm is to find a stack
of overlapping delta layers which will be turned
into an image layer. This operation is exponential in terms of the
number of matching layers and we do it roughly every 20 seconds.

## Summary of changes
Only check if a new image layer is required if we've ingested a certain
amount of WAL since the last check.
The amount of wal is expressed in terms of multiples of checkpoint
distance, with the intuition being that
that there's little point doing the check if we only have two new L1
layers (not enough to create a new image).
---
 control_plane/src/pageserver.rs               | 10 ++++++
 libs/pageserver_api/src/models.rs             |  1 +
 pageserver/src/tenant.rs                      |  3 ++
 pageserver/src/tenant/config.rs               | 15 +++++++++
 pageserver/src/tenant/timeline.rs             | 31 +++++++++++++++++++
 .../regress/test_attach_tenant_config.py      |  1 +
 test_runner/regress/test_layer_eviction.py    |  1 +
 .../regress/test_layers_from_future.py        |  1 +
 test_runner/regress/test_ondemand_download.py |  5 ++-
 .../regress/test_pageserver_generations.py    |  1 +
 test_runner/regress/test_remote_storage.py    |  1 +
 11 files changed, 69 insertions(+), 1 deletion(-)
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index c5eabc46db..abf815f07a 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -389,6 +389,10 @@ impl PageServerNode {
                 .remove("image_creation_threshold")
                 .map(|x| x.parse::<usize>())
                 .transpose()?,
+            image_layer_creation_check_threshold: settings
+                .remove("image_layer_creation_check_threshold")
+                .map(|x| x.parse::<u8>())
+                .transpose()?,
             pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
             walreceiver_connect_timeout: settings
                 .remove("walreceiver_connect_timeout")
@@ -501,6 +505,12 @@ impl PageServerNode {
                     .map(|x| x.parse::<usize>())
                     .transpose()
                     .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
+                image_layer_creation_check_threshold: settings
+                    .remove("image_layer_creation_check_threshold")
+                    .map(|x| x.parse::<u8>())
+                    .transpose()
+                    .context("Failed to parse 'image_creation_check_threshold' as integer")?,
+
                 pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
                 walreceiver_connect_timeout: settings
                     .remove("walreceiver_connect_timeout")
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index aad4cc97fc..ad4ca6710d 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -301,6 +301,7 @@ pub struct TenantConfig {
     pub heatmap_period: Option<String>,
     pub lazy_slru_download: Option<bool>,
     pub timeline_get_throttle: Option<ThrottleConfig>,
+    pub image_layer_creation_check_threshold: Option<u8>,
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 792d9e548d..0806ef0cf4 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3653,6 +3653,9 @@ pub(crate) mod harness {
                 heatmap_period: Some(tenant_conf.heatmap_period),
                 lazy_slru_download: Some(tenant_conf.lazy_slru_download),
                 timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
+                image_layer_creation_check_threshold: Some(
+                    tenant_conf.image_layer_creation_check_threshold,
+                ),
             }
         }
     }
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 53a8c97e23..a2bb479f63 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -57,6 +57,9 @@ pub mod defaults {
     // throughputs up to 1GiB/s per timeline.
     pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
     pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
+    // By default ingest enough WAL for two new L0 layers before checking if new image
+    // image layers should be created.
+    pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
 
     pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 }
@@ -362,6 +365,10 @@ pub struct TenantConf {
     pub lazy_slru_download: bool,
 
     pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
+
+    // How much WAL must be ingested before checking again whether a new image layer is required.
+    // Expresed in multiples of checkpoint distance.
+    pub image_layer_creation_check_threshold: u8,
 }
 
 /// Same as TenantConf, but this struct preserves the information about
@@ -454,6 +461,9 @@ pub struct TenantConfOpt {
 
     #[serde(skip_serializing_if = "Option::is_none")]
     pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_layer_creation_check_threshold: Option<u8>,
 }
 
 impl TenantConfOpt {
@@ -508,6 +518,9 @@ impl TenantConfOpt {
                 .timeline_get_throttle
                 .clone()
                 .unwrap_or(global_conf.timeline_get_throttle),
+            image_layer_creation_check_threshold: self
+                .image_layer_creation_check_threshold
+                .unwrap_or(global_conf.image_layer_creation_check_threshold),
         }
     }
 }
@@ -548,6 +561,7 @@ impl Default for TenantConf {
             heatmap_period: Duration::ZERO,
             lazy_slru_download: false,
             timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
+            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
         }
     }
 }
@@ -621,6 +635,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
             heatmap_period: value.heatmap_period.map(humantime),
             lazy_slru_download: value.lazy_slru_download,
             timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
+            image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
         }
     }
 }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index bc3fc1df1f..f3565c1fb3 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -309,6 +309,8 @@ pub struct Timeline {
     /// Configuration: how often should the partitioning be recalculated.
     repartition_threshold: u64,
 
+    last_image_layer_creation_check_at: AtomicLsn,
+
     /// Current logical size of the "datadir", at the last LSN.
     current_logical_size: LogicalSize,
 
@@ -1632,6 +1634,15 @@ impl Timeline {
             .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
     }
 
+    fn get_image_layer_creation_check_threshold(&self) -> u8 {
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
+        tenant_conf.image_layer_creation_check_threshold.unwrap_or(
+            self.conf
+                .default_tenant_conf
+                .image_layer_creation_check_threshold,
+        )
+    }
+
     pub(super) fn tenant_conf_updated(&self) {
         // NB: Most tenant conf options are read by background loops, so,
         // changes will automatically be picked up.
@@ -1769,6 +1780,7 @@ impl Timeline {
                 },
                 partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
                 repartition_threshold: 0,
+                last_image_layer_creation_check_at: AtomicLsn::new(0),
 
                 last_received_wal: Mutex::new(None),
                 rel_size_cache: RwLock::new(HashMap::new()),
@@ -1797,6 +1809,7 @@ impl Timeline {
             };
             result.repartition_threshold =
                 result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
+
             result
                 .metrics
                 .last_record_gauge
@@ -3501,6 +3514,24 @@ impl Timeline {
 
     // Is it time to create a new image layer for the given partition?
     async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
+        let last = self.last_image_layer_creation_check_at.load();
+        if lsn != Lsn(0) {
+            let distance = lsn
+                .checked_sub(last)
+                .expect("Attempt to compact with LSN going backwards");
+
+            let min_distance = self.get_image_layer_creation_check_threshold() as u64
+                * self.get_checkpoint_distance();
+
+            // Skip the expensive delta layer counting below if we've not ingested
+            // sufficient WAL since the last check.
+            if distance.0 < min_distance {
+                return false;
+            }
+        }
+
+        self.last_image_layer_creation_check_at.store(lsn);
+
         let threshold = self.get_image_creation_threshold();
 
         let guard = self.layers.read().await;
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 3058926b25..909d25980b 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -189,6 +189,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
         },
         "trace_read_requests": True,
         "walreceiver_connect_timeout": "13m",
+        "image_layer_creation_check_threshold": 1,
     }
 
     ps_http = env.pageserver.http_client()
diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py
index 7bbc0cc160..fefb30bbdd 100644
--- a/test_runner/regress/test_layer_eviction.py
+++ b/test_runner/regress/test_layer_eviction.py
@@ -165,6 +165,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
         "compaction_threshold": "3",
         # "image_creation_threshold": set at runtime
         "compaction_target_size": f"{128 * (1024**2)}",  # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers
+        "image_layer_creation_check_threshold": "0",  # always check if a new image layer can be created
     }
 
     def tenant_update_config(changes):
diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py
index ca4295c5cb..f311a8bf2c 100644
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -53,6 +53,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
         "checkpoint_timeout": "24h",  # something we won't reach
         "checkpoint_distance": f"{50 * (1024**2)}",  # something we won't reach, we checkpoint manually
         "image_creation_threshold": "100",  # we want to control when image is created
+        "image_layer_creation_check_threshold": "0",
         "compaction_threshold": f"{l0_l1_threshold}",
         "compaction_target_size": f"{128 * (1024**3)}",  # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers
     }
diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index 914f068afb..ba0d53704b 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -568,6 +568,8 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne
         "image_creation_threshold": 100,
         # repartitioning parameter, unused
         "compaction_target_size": 128 * 1024**2,
+        # Always check if a new image layer can be created
+        "image_layer_creation_check_threshold": 0,
         # pitr_interval and gc_horizon are not interesting because we dont run gc
     }
 
@@ -632,7 +634,8 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne
     # threshold to expose image creation to downloading all of the needed
     # layers -- threshold of 2 would sound more reasonable, but keeping it as 1
     # to be less flaky
-    env.neon_cli.config_tenant(tenant_id, {"image_creation_threshold": "1"})
+    conf["image_creation_threshold"] = "1"
+    env.neon_cli.config_tenant(tenant_id, {k: str(v) for k, v in conf.items()})
 
     pageserver_http.timeline_compact(tenant_id, timeline_id)
     layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 56b4548b64..41fa03cdf8 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -53,6 +53,7 @@ TENANT_CONF = {
     "compaction_period": "0s",
     # create image layers eagerly, so that GC can remove some layers
     "image_creation_threshold": "1",
+    "image_layer_creation_check_threshold": "0",
 }
 
 
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 986d6c4dbf..47200a856e 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -245,6 +245,7 @@ def test_remote_storage_upload_queue_retries(
             "compaction_period": "0s",
             # create image layers eagerly, so that GC can remove some layers
             "image_creation_threshold": "1",
+            "image_layer_creation_check_threshold": "0",
         }
     )