From 9b98823d615c991422b6edd3ec3197192f763cf2 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Tue, 25 Jun 2024 19:00:14 -0400
Subject: [PATCH] bottom-most-compaction: use in test_gc_feedback + fix bugs
 (#8103)

Adds manual compaction trigger; add gc compaction to test_gc_feedback

Part of https://github.com/neondatabase/neon/issues/8002

```
test_gc_feedback[debug-pg15].logical_size: 50 Mb
test_gc_feedback[debug-pg15].physical_size: 2269 Mb
test_gc_feedback[debug-pg15].physical/logical ratio: 44.5302
test_gc_feedback[debug-pg15].max_total_num_of_deltas: 7
test_gc_feedback[debug-pg15].max_num_of_deltas_above_image: 2
test_gc_feedback[debug-pg15].logical_size_after_bottom_most_compaction: 50 Mb
test_gc_feedback[debug-pg15].physical_size_after_bottom_most_compaction: 287 Mb
test_gc_feedback[debug-pg15].physical/logical ratio after bottom_most_compaction: 5.6312
test_gc_feedback[debug-pg15].max_total_num_of_deltas_after_bottom_most_compaction: 4
test_gc_feedback[debug-pg15].max_num_of_deltas_above_image_after_bottom_most_compaction: 1
```

## Summary of changes

* Add the manual compaction trigger
* Use in test_gc_feedback
* Add a guard to avoid running it with retain_lsns
* Fix: Do `schedule_compaction_update` after compaction
* Fix: Supply deltas in the correct order to reconstruct value

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/http/routes.rs                 |  8 ++++
 .../src/tenant/storage_layer/delta_layer.rs   |  1 -
 .../src/tenant/storage_layer/image_layer.rs   |  1 -
 pageserver/src/tenant/storage_layer/layer.rs  |  2 -
 pageserver/src/tenant/timeline.rs             |  2 +-
 pageserver/src/tenant/timeline/compaction.rs  | 41 ++++++++++++----
 .../src/tenant/timeline/layer_manager.rs      |  1 -
 test_runner/fixtures/pageserver/http.py       |  3 ++
 test_runner/performance/test_gc_feedback.py   | 48 ++++++++++++++++++-
 9 files changed, 92 insertions(+), 15 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index d6ba9ee35e..41d096d7bb 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1652,6 +1652,14 @@ async fn timeline_compact_handler(
     if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
         flags |= CompactFlags::ForceImageLayerCreation;
     }
+    if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
+        if !cfg!(feature = "testing") {
+            return Err(ApiError::InternalServerError(anyhow!(
+                "enhanced_gc_bottom_most_compaction is only available in testing mode"
+            )));
+        }
+        flags |= CompactFlags::EnhancedGcBottomMostCompaction;
+    }
     let wait_until_uploaded =
         parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
 
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index ab3ef4980f..bf5d9249eb 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -928,7 +928,6 @@ impl DeltaLayerInner {
     }
 
     /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
-    #[cfg(test)]
     pub(super) async fn load_key_values(
         &self,
         ctx: &RequestContext,
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 99bce1890d..50aacbd9ad 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -486,7 +486,6 @@ impl ImageLayerInner {
     }
 
     /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
-    #[cfg(test)]
     pub(super) async fn load_key_values(
         &self,
         ctx: &RequestContext,
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index d856909f2e..7eb42d8186 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -389,7 +389,6 @@ impl Layer {
     }
 
     /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future.
-    #[cfg(test)]
     pub(crate) async fn load_key_values(
         &self,
         ctx: &RequestContext,
@@ -1774,7 +1773,6 @@ impl DownloadedLayer {
         }
     }
 
-    #[cfg(test)]
     async fn load_key_values(
         &self,
         owner: &Arc<LayerInner>,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 5398ad399c..1175b75017 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -686,6 +686,7 @@ pub enum GetLogicalSizePriority {
 pub(crate) enum CompactFlags {
     ForceRepartition,
     ForceImageLayerCreation,
+    EnhancedGcBottomMostCompaction,
 }
 
 impl std::fmt::Debug for Timeline {
@@ -1096,7 +1097,6 @@ impl Timeline {
     /// scan iterator interface. We could optimize this interface later to avoid some checks in the vectored
     /// get path to maintain and split the probing and to-be-probe keyspace. We also need to ensure that
     /// the scan operation will not cause OOM in the future.
-    #[allow(dead_code)]
     pub(crate) async fn scan(
         &self,
         keyspace: KeySpace,
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index de1263fadf..efaa6144af 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -47,10 +47,14 @@ impl Timeline {
     /// TODO: cancellation
     pub(crate) async fn compact_legacy(
         self: &Arc<Self>,
-        _cancel: &CancellationToken,
+        cancel: &CancellationToken,
         flags: EnumSet<CompactFlags>,
         ctx: &RequestContext,
     ) -> Result<(), CompactionError> {
+        if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) {
+            return self.compact_with_gc(cancel, ctx).await;
+        }
+
         // High level strategy for compaction / image creation:
         //
         // 1. First, calculate the desired "partitioning" of the
@@ -959,15 +963,20 @@ impl Timeline {
     /// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta
     /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
     /// and create delta layers with all deltas >= gc horizon.
-    #[cfg(test)]
     pub(crate) async fn compact_with_gc(
         self: &Arc<Self>,
         _cancel: &CancellationToken,
         ctx: &RequestContext,
     ) -> Result<(), CompactionError> {
+        use crate::tenant::storage_layer::ValueReconstructState;
         use std::collections::BTreeSet;
 
-        use crate::tenant::storage_layer::ValueReconstructState;
+        info!("running enhanced gc bottom-most compaction");
+
+        scopeguard::defer! {
+            info!("done enhanced gc bottom-most compaction");
+        };
+
         // Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
         // The layer selection has the following properties:
         // 1. If a layer is in the selection, all layers below it are in the selection.
@@ -976,6 +985,11 @@ impl Timeline {
             let guard = self.layers.read().await;
             let layers = guard.layer_map();
             let gc_info = self.gc_info.read().unwrap();
+            if !gc_info.retain_lsns.is_empty() || !gc_info.leases.is_empty() {
+                return Err(CompactionError::Other(anyhow!(
+                    "enhanced legacy compaction currently does not support retain_lsns (branches)"
+                )));
+            }
             let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr);
             let mut selected_layers = Vec::new();
             // TODO: consider retain_lsns
@@ -987,6 +1001,11 @@ impl Timeline {
             }
             (selected_layers, gc_cutoff)
         };
+        info!(
+            "picked {} layers for compaction with gc_cutoff={}",
+            layer_selection.len(),
+            gc_cutoff
+        );
         // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
         // Also, collect the layer information to decide when to split the new delta layers.
         let mut all_key_values = Vec::new();
@@ -1064,10 +1083,8 @@ impl Timeline {
                 } else if *lsn <= horizon {
                     match val {
                         crate::repository::Value::Image(image) => {
-                            if lsn <= &horizon {
-                                base_image = Some((*lsn, image.clone()));
-                                break;
-                            }
+                            base_image = Some((*lsn, image.clone()));
+                            break;
                         }
                         crate::repository::Value::WalRecord(wal) => {
                             delta_above_base_image.push((*lsn, wal.clone()));
@@ -1075,7 +1092,7 @@ impl Timeline {
                     }
                 }
             }
-            delta_above_base_image.reverse();
+            // do not reverse delta_above_base_image, reconstruct state expects reversely-ordered records
             keys_above_horizon.reverse();
             let state = ValueReconstructState {
                 img: base_image,
@@ -1200,6 +1217,11 @@ impl Timeline {
         );
 
         let image_layer = image_layer_writer.finish(self, ctx).await?;
+        info!(
+            "produced {} delta layers and {} image layers",
+            delta_layers.len(),
+            1
+        );
         let mut compact_to = Vec::new();
         compact_to.extend(delta_layers);
         compact_to.push(image_layer);
@@ -1208,6 +1230,9 @@ impl Timeline {
             let mut guard = self.layers.write().await;
             guard.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
         };
+
+        self.remote_client
+            .schedule_compaction_update(&layer_selection, &compact_to)?;
         Ok(())
     }
 }
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index 550a9a567a..948237e06a 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -227,7 +227,6 @@ impl LayerManager {
     }
 
     /// Called when a GC-compaction is completed.
-    #[cfg(test)]
     pub(crate) fn finish_gc_compaction(
         &mut self,
         compact_from: &[Layer],
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 2a7cbea200..7949612714 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -573,6 +573,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         force_repartition=False,
         force_image_layer_creation=False,
         wait_until_uploaded=False,
+        enhanced_gc_bottom_most_compaction=False,
     ):
         self.is_testing_enabled_or_skip()
         query = {}
@@ -582,6 +583,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
             query["force_image_layer_creation"] = "true"
         if wait_until_uploaded:
             query["wait_until_uploaded"] = "true"
+        if enhanced_gc_bottom_most_compaction:
+            query["enhanced_gc_bottom_most_compaction"] = "true"
 
         log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
         res = self.put(
diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py
index 9a03994b29..4c326111c2 100644
--- a/test_runner/performance/test_gc_feedback.py
+++ b/test_runner/performance/test_gc_feedback.py
@@ -33,7 +33,7 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma
             "checkpoint_distance": f"{1024 ** 2}",
             "compaction_target_size": f"{1024 ** 2}",
             # set PITR interval to be small, so we can do GC
-            "pitr_interval": "10 s",
+            "pitr_interval": "60 s",
             # "compaction_threshold": "3",
             # "image_creation_threshold": "2",
         }
@@ -99,6 +99,52 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma
         MetricReport.LOWER_IS_BETTER,
     )
 
+    client.timeline_compact(tenant_id, timeline_id, enhanced_gc_bottom_most_compaction=True)
+    tline_detail = client.timeline_detail(tenant_id, timeline_id)
+    logical_size = tline_detail["current_logical_size"]
+    physical_size = tline_detail["current_physical_size"]
+
+    max_num_of_deltas_above_image = 0
+    max_total_num_of_deltas = 0
+    for key_range in client.perf_info(tenant_id, timeline_id):
+        max_total_num_of_deltas = max(max_total_num_of_deltas, key_range["total_num_of_deltas"])
+        max_num_of_deltas_above_image = max(
+            max_num_of_deltas_above_image, key_range["num_of_deltas_above_image"]
+        )
+    zenbenchmark.record(
+        "logical_size_after_bottom_most_compaction",
+        logical_size // MB,
+        "Mb",
+        MetricReport.LOWER_IS_BETTER,
+    )
+    zenbenchmark.record(
+        "physical_size_after_bottom_most_compaction",
+        physical_size // MB,
+        "Mb",
+        MetricReport.LOWER_IS_BETTER,
+    )
+    zenbenchmark.record(
+        "physical/logical ratio after bottom_most_compaction",
+        physical_size / logical_size,
+        "",
+        MetricReport.LOWER_IS_BETTER,
+    )
+    zenbenchmark.record(
+        "max_total_num_of_deltas_after_bottom_most_compaction",
+        max_total_num_of_deltas,
+        "",
+        MetricReport.LOWER_IS_BETTER,
+    )
+    zenbenchmark.record(
+        "max_num_of_deltas_above_image_after_bottom_most_compaction",
+        max_num_of_deltas_above_image,
+        "",
+        MetricReport.LOWER_IS_BETTER,
+    )
+
+    with endpoint.cursor() as cur:
+        cur.execute("SELECT * FROM t")  # ensure data is not corrupted
+
     layer_map_path = env.repo_dir / "layer-map.json"
     log.info(f"Writing layer map to {layer_map_path}")
     with layer_map_path.open("w") as f: