From 62d77e263f2b3f4b6847b6a9a14c319da6cfbfa4 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 27 Feb 2024 10:55:10 +0100 Subject: [PATCH] test_remote_timeline_client_calls_started_metric: fix flakiness (#6911) fixes https://github.com/neondatabase/neon/issues/6889 # Problem The failure in the last 3 flaky runs on `main` is ``` test_runner/regress/test_remote_storage.py:460: in test_remote_timeline_client_calls_started_metric churn("a", "b") test_runner/regress/test_remote_storage.py:457: in churn assert gc_result["layers_removed"] > 0 E assert 0 > 0 ``` That's this code https://github.com/neondatabase/neon/blob/cd449d66ea29ad2d7269458e90623c3ae40e1816/test_runner/regress/test_remote_storage.py#L448-L460 So, the test expects GC to remove some layers but the GC doesn't. # Fix My impression is that the VACUUM isn't re-using pages aggressively enough, but I can't really prove that. Tried to analyze the layer map dump but it's too complex. So, this PR: - Creates more churn by doing the overwrite twice. - Forces image layer creation. It also drive-by removes the redundant call to timeline_compact, because, timeline_checkpoint already does that internally. --- pageserver/src/http/routes.rs | 8 ++++++++ pageserver/src/tenant/timeline.rs | 8 +++++++- test_runner/fixtures/pageserver/http.py | 6 ++++++ test_runner/regress/test_remote_storage.py | 16 ++++++++++------ 4 files changed, 31 insertions(+), 7 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1339229a70..04211fbb7f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1616,6 +1616,10 @@ async fn timeline_compact_handler( if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; } + if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? 
{ + flags |= CompactFlags::ForceImageLayerCreation; + } + async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; @@ -1642,6 +1646,10 @@ async fn timeline_checkpoint_handler( if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; } + if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? { + flags |= CompactFlags::ForceImageLayerCreation; + } + async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f09617849c..b14eafa194 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -503,6 +503,7 @@ pub enum GetLogicalSizePriority { #[derive(enumset::EnumSetType)] pub(crate) enum CompactFlags { ForceRepartition, + ForceImageLayerCreation, } impl std::fmt::Debug for Timeline { @@ -1157,7 +1158,12 @@ impl Timeline { // 3. Create new image layers for partitions that have been modified // "enough". 
let layers = self - .create_image_layers(&partitioning, lsn, false, &image_ctx) + .create_image_layers( + &partitioning, + lsn, + flags.contains(CompactFlags::ForceImageLayerCreation), + &image_ctx, + ) .await .map_err(anyhow::Error::from)?; if let Some(remote_client) = &self.remote_client { diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 427ef00c78..ad3efb5837 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -549,11 +549,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, force_repartition=False, + force_image_layer_creation=False, ): self.is_testing_enabled_or_skip() query = {} if force_repartition: query["force_repartition"] = "true" + if force_image_layer_creation: + query["force_image_layer_creation"] = "true" log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") res = self.put( @@ -608,11 +611,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, force_repartition=False, + force_image_layer_creation=False, ): self.is_testing_enabled_or_skip() query = {} if force_repartition: query["force_repartition"] = "true" + if force_image_layer_creation: + query["force_image_layer_creation"] = "true" log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") res = self.put( diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 176a5e57dc..73ebe0a76f 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -382,6 +382,7 @@ def test_remote_timeline_client_calls_started_metric( initial_tenant_conf={ # small checkpointing and compaction targets to ensure we generate many upload operations "checkpoint_distance": f"{128 * 1024}", + # ensure each timeline_checkpoint() 
call creates L1s "compaction_threshold": "1", "compaction_target_size": f"{128 * 1024}", # no PITR horizon, we specify the horizon when we request on-demand GC @@ -389,8 +390,6 @@ def test_remote_timeline_client_calls_started_metric( # disable background compaction and GC. We invoke it manually when we want it to happen. "gc_period": "0s", "compaction_period": "0s", - # create image layers eagerly, so that GC can remove some layers - "image_creation_threshold": "1", } ) @@ -449,12 +448,17 @@ def test_remote_timeline_client_calls_started_metric( ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}" def churn(data_pass1, data_pass2): + # overwrite the same data in place, vacuum in between, + # and create image layers; then run a gc(). + # this should + # - create new layers + # - delete some layers overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1) - client.timeline_checkpoint(tenant_id, timeline_id) - client.timeline_compact(tenant_id, timeline_id) overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2) - client.timeline_checkpoint(tenant_id, timeline_id) - client.timeline_compact(tenant_id, timeline_id) + client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True) + overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1) + overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2) + client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True) gc_result = client.timeline_gc(tenant_id, timeline_id, 0) print_gc_result(gc_result) assert gc_result["layers_removed"] > 0