test_remote_timeline_client_calls_started_metric: fix flakiness (#6911)

fixes https://github.com/neondatabase/neon/issues/6889

# Problem

The failure in the last 3 flaky runs on `main` is:

```
test_runner/regress/test_remote_storage.py:460: in test_remote_timeline_client_calls_started_metric
    churn("a", "b")
test_runner/regress/test_remote_storage.py:457: in churn
    assert gc_result["layers_removed"] > 0
E   assert 0 > 0
```

That's this code: `cd449d66ea/test_runner/regress/test_remote_storage.py` (L448-L460)

So, the test expects GC to remove some layers, but GC doesn't remove any.
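
For reference, the `churn` helper that drives this assertion looked roughly like this before this PR (a sketch reconstructed from the diff below; `client`, `tenant_id`, `timeline_id`, and `overwrite_data_and_wait_for_it_to_arrive_at_pageserver` are set up earlier in the test):

```python
def churn(data_pass1, data_pass2):
    # one overwrite per pass, checkpoint + compact after each pass,
    # then GC with a zero horizon and expect layers to be removed
    overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
    client.timeline_checkpoint(tenant_id, timeline_id)
    client.timeline_compact(tenant_id, timeline_id)
    overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
    client.timeline_checkpoint(tenant_id, timeline_id)
    client.timeline_compact(tenant_id, timeline_id)
    gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
    print_gc_result(gc_result)
    assert gc_result["layers_removed"] > 0
```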

# Fix

My impression is that VACUUM isn't re-using pages aggressively enough, but I can't really prove that. I tried to analyze the layer map dump, but it's too complex.

So, this PR:

- Creates more churn by doing the overwrite twice.
- Forces image layer creation.

As a drive-by, it also removes the redundant call to `timeline_compact`, because `timeline_checkpoint` already compacts internally.
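
In the test, the churn sequence therefore becomes roughly the following (a sketch of the new flow; the exact diff is at the bottom of this page):

```python
def churn(data_pass1, data_pass2):
    overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
    client.timeline_checkpoint(tenant_id, timeline_id)
    overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
    client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True)
    # second round of churn so GC has superseded layers to collect
    overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
    overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
    client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True)
    gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
    print_gc_result(gc_result)
    assert gc_result["layers_removed"] > 0
```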
Commit 62d77e263f (parent b2bbc20311), authored by Christian Schwarz on 2024-02-27 10:55:10 +01:00, committed via GitHub.

4 changed files with 31 additions and 7 deletions

`pageserver/src/http/routes.rs`

```diff
@@ -1616,6 +1616,10 @@ async fn timeline_compact_handler(
     if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
         flags |= CompactFlags::ForceRepartition;
     }
+    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
+        flags |= CompactFlags::ForceImageLayerCreation;
+    }
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
         let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
@@ -1642,6 +1646,10 @@ async fn timeline_checkpoint_handler(
     if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
         flags |= CompactFlags::ForceRepartition;
     }
+    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
+        flags |= CompactFlags::ForceImageLayerCreation;
+    }
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
         let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
```
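
Both handlers now accept an optional `force_image_layer_creation` query parameter alongside `force_repartition`. A minimal sketch of driving it over the management API (the base URL and port are assumptions; the path follows the `/v1/tenant/.../timeline/.../checkpoint` layout used by the Python client below):

```python
import requests

PAGESERVER_HTTP = "http://localhost:9898"  # assumed pageserver management API address
tenant_id = "<tenant id>"      # hypothetical placeholder
timeline_id = "<timeline id>"  # hypothetical placeholder

# Trigger a checkpoint and force image layer creation even if the
# image creation threshold would not normally be met.
res = requests.put(
    f"{PAGESERVER_HTTP}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
    params={"force_image_layer_creation": "true"},
)
res.raise_for_status()
```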

`pageserver/src/tenant/timeline.rs`

```diff
@@ -503,6 +503,7 @@ pub enum GetLogicalSizePriority {
 #[derive(enumset::EnumSetType)]
 pub(crate) enum CompactFlags {
     ForceRepartition,
+    ForceImageLayerCreation,
 }

 impl std::fmt::Debug for Timeline {
@@ -1157,7 +1158,12 @@ impl Timeline {
         // 3. Create new image layers for partitions that have been modified
         // "enough".
         let layers = self
-            .create_image_layers(&partitioning, lsn, false, &image_ctx)
+            .create_image_layers(
+                &partitioning,
+                lsn,
+                flags.contains(CompactFlags::ForceImageLayerCreation),
+                &image_ctx,
+            )
             .await
             .map_err(anyhow::Error::from)?;

         if let Some(remote_client) = &self.remote_client {
```
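
`CompactFlags` is an `enumset` bitset, so forcing image layer creation threads through compaction as just another flag that is checked at the image-layer-creation step. A rough Python analogue of that pattern, purely for illustration (not Neon code), using `enum.Flag`:

```python
from enum import Flag, auto

class CompactFlags(Flag):
    NONE = 0
    FORCE_REPARTITION = auto()
    FORCE_IMAGE_LAYER_CREATION = auto()

def create_image_layers(force: bool) -> None:
    # with force=True, image layers are created even for partitions that
    # would not normally meet the creation threshold
    print(f"creating image layers, force={force}")

def compact(flags: CompactFlags) -> None:
    # ... repartition, compact L0 layers, etc. ...
    force_images = bool(flags & CompactFlags.FORCE_IMAGE_LAYER_CREATION)
    create_image_layers(force=force_images)

compact(CompactFlags.FORCE_IMAGE_LAYER_CREATION)
```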

`test_runner/fixtures/pageserver/http.py`

```diff
@@ -549,11 +549,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         tenant_id: Union[TenantId, TenantShardId],
         timeline_id: TimelineId,
         force_repartition=False,
+        force_image_layer_creation=False,
     ):
         self.is_testing_enabled_or_skip()
         query = {}
         if force_repartition:
             query["force_repartition"] = "true"
+        if force_image_layer_creation:
+            query["force_image_layer_creation"] = "true"
         log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
         res = self.put(
@@ -608,11 +611,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         tenant_id: Union[TenantId, TenantShardId],
         timeline_id: TimelineId,
         force_repartition=False,
+        force_image_layer_creation=False,
     ):
         self.is_testing_enabled_or_skip()
         query = {}
         if force_repartition:
             query["force_repartition"] = "true"
+        if force_image_layer_creation:
+            query["force_image_layer_creation"] = "true"
         log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
         res = self.put(
```
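
On the test-fixture side, the new keyword argument defaults to `False` and only adds the query parameter when set, so existing callers are unaffected. Hypothetical usage in a test:

```python
# same behavior as before
client.timeline_checkpoint(tenant_id, timeline_id)

# flush and force image layer creation so a later GC has covered delta layers to remove
client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True)

gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
```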

`test_runner/regress/test_remote_storage.py`

```diff
@@ -382,6 +382,7 @@ def test_remote_timeline_client_calls_started_metric(
         initial_tenant_conf={
             # small checkpointing and compaction targets to ensure we generate many upload operations
             "checkpoint_distance": f"{128 * 1024}",
+            # ensure each timeline_checkpoint() call creates L1s
             "compaction_threshold": "1",
             "compaction_target_size": f"{128 * 1024}",
             # no PITR horizon, we specify the horizon when we request on-demand GC
@@ -389,8 +390,6 @@ def test_remote_timeline_client_calls_started_metric(
             # disable background compaction and GC. We invoke it manually when we want it to happen.
             "gc_period": "0s",
             "compaction_period": "0s",
-            # create image layers eagerly, so that GC can remove some layers
-            "image_creation_threshold": "1",
         }
     )
```
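
For context, `initial_tenant_conf` is the per-tenant config the test passes when bootstrapping the environment: with `compaction_threshold` at 1, every `timeline_checkpoint()` can produce L1 layers, and eager image creation is now requested per call instead of via `image_creation_threshold`. A minimal sketch of how such a config is applied (assuming the `NeonEnvBuilder.init_start` fixture API used elsewhere in this test suite):

```python
from fixtures.neon_fixtures import NeonEnvBuilder  # assumed fixture import path

def test_example(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start(
        initial_tenant_conf={
            "checkpoint_distance": f"{128 * 1024}",
            "compaction_threshold": "1",    # each checkpoint can create L1s
            "compaction_target_size": f"{128 * 1024}",
            "pitr_interval": "0s",          # no PITR horizon
            "gc_period": "0s",              # GC only when invoked manually
            "compaction_period": "0s",      # compaction only when invoked manually
        }
    )
```

The `churn` helper in the same test changes as follows: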
```diff
@@ -449,12 +448,17 @@ def test_remote_timeline_client_calls_started_metric(
         ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}"

     def churn(data_pass1, data_pass2):
+        # overwrite the same data in place, vacuum inbetween, and
+        # create image layers; then run a gc().
+        # this should
+        # - create new layers
+        # - delete some layers
         overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
         client.timeline_checkpoint(tenant_id, timeline_id)
-        client.timeline_compact(tenant_id, timeline_id)
         overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
-        client.timeline_checkpoint(tenant_id, timeline_id)
-        client.timeline_compact(tenant_id, timeline_id)
+        client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True)
+        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
+        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
+        client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True)
         gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
         print_gc_result(gc_result)
         assert gc_result["layers_removed"] > 0
```