tests: make test_scrubber_physical_gc_ancestors more stable (#8453)

## Problem This test sometimes found that ancestors were getting cleaned up before it had done any compaction. Compaction was happening implicitly via Workload. Example: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8298/10032173390/index.html#testresult/fb04786402f80822/retries ## Summary of changes - Set upload=False when writing data after shard split, to avoid doing a checkpoint - Add a checkpoint_period & explicit wait for uploads so that we ensure data lands in S3 without doing a checkpoint
2026-06-01 20:40:37 +00:00 · 2024-07-23 12:57:57 +01:00
parent 35854928d9
commit 80c8ceacbc
3 changed files with 29 additions and 11 deletions
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1676,6 +1676,10 @@ async fn timeline_checkpoint_handler(
    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
        flags |= CompactFlags::ForceImageLayerCreation;
    }
+
+    // By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload.
+    let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true);
+
    let wait_until_uploaded =
        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);

@@ -1692,15 +1696,17 @@ async fn timeline_checkpoint_handler(

                }
            })?;
-        timeline
-            .compact(&cancel, flags, &ctx)
-            .await
-            .map_err(|e|
-                match e {
-                    CompactionError::ShuttingDown => ApiError::ShuttingDown,
-                    CompactionError::Other(e) => ApiError::InternalServerError(e)
-                }
-            )?;
+        if compact {
+            timeline
+                .compact(&cancel, flags, &ctx)
+                .await
+                .map_err(|e|
+                    match e {
+                        CompactionError::ShuttingDown => ApiError::ShuttingDown,
+                        CompactionError::Other(e) => ApiError::InternalServerError(e)
+                    }
+                )?;
+        }

        if wait_until_uploaded {
            timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -662,6 +662,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        force_repartition=False,
        force_image_layer_creation=False,
        wait_until_uploaded=False,
+        compact: Optional[bool] = None,
    ):
        self.is_testing_enabled_or_skip()
        query = {}
@@ -672,6 +673,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
        if wait_until_uploaded:
            query["wait_until_uploaded"] = "true"

+        if compact is not None:
+            query["compact"] = "true" if compact else "false"
+
        log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
        res = self.put(
            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -209,9 +209,17 @@ def test_scrubber_physical_gc_ancestors(
    new_shard_count = 4
    assert shard_count is None or new_shard_count > shard_count
    shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
+    env.storage_controller.reconcile_until_idle()  # Move shards to their final locations immediately

-    # Make sure child shards have some layers
-    workload.write_rows(100)
+    # Make sure child shards have some layers.  Do not force upload, because the test helper calls checkpoint, which
+    # compacts, and we only want to do tha explicitly later in the test.
+    workload.write_rows(100, upload=False)
+    for shard in shards:
+        ps = env.get_tenant_pageserver(shard)
+        log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
+        ps.http_client().timeline_checkpoint(
+            shard, timeline_id, compact=False, wait_until_uploaded=True
+        )

    # Flush deletion queue so that we don't leave any orphan layers in the parent that will confuse subsequent checks: once
    # a shard is split, any layers in its prefix that aren't referenced by a child will be considered GC'able, even