test(pageserver): fix test_pageserver_gc_compaction_idempotent (#10833)

## Problem ref https://github.com/neondatabase/neon/issues/10517 ## Summary of changes For some reasons the job split algorithm decides to have different image coverage range for two compactions before/after restart. So we remove the subcompaction key range and let it generate an image covering the full range, which should make the test more stable. Also slightly tuned the logging span. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>
2026-01-05 20:42:54 +00:00 · 2025-02-18 11:06:20 -05:00
parent f36ec5c84b
commit f9a063e2e9
3 changed files with 10 additions and 25 deletions
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3101,6 +3101,9 @@ impl Tenant {
                if let Some(queue) = queue {
                    outcome = queue
                        .iteration(cancel, ctx, &self.gc_block, &timeline)
+                        .instrument(
+                            info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id),
+                        )
                        .await?;
                }
            }
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -301,18 +301,12 @@ impl GcCompactionQueue {
                        let mut guard = self.inner.lock().unwrap();
                        guard.gc_guards.insert(id, gc_guard);
                    }
-                    let _ = timeline
-                        .compact_with_options(cancel, options, ctx)
-                        .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
-                        .await?;
+                    let _ = timeline.compact_with_options(cancel, options, ctx).await?;
                    self.notify_and_unblock(id);
                }
            }
            GcCompactionQueueItem::SubCompactionJob(options) => {
-                let _ = timeline
-                    .compact_with_options(cancel, options, ctx)
-                    .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
-                    .await?;
+                let _ = timeline.compact_with_options(cancel, options, ctx).await?;
            }
            GcCompactionQueueItem::Notify(id) => {
                self.notify_and_unblock(id);
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -236,9 +236,7 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b
    wait_until(compaction_finished, timeout=60)

    # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked)
-    env.pageserver.assert_log_contains(
-        "scheduled_compact_timeline.*picked .* layers for compaction"
-    )
+    env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction")

    log.info("Validating at workload end ...")
    workload.validate(env.pageserver.id)
@@ -300,6 +298,8 @@ def test_pageserver_gc_compaction_idempotent(
    workload.churn_rows(row_count, env.pageserver.id)
    env.create_branch("child_branch")  # so that we have a retain_lsn
    workload.churn_rows(row_count, env.pageserver.id)
+    env.create_branch("child_branch_2")  # so that we have another retain_lsn
+    workload.churn_rows(row_count, env.pageserver.id)
    # compact 3 times if mode is before_restart
    n_compactions = 3 if compaction_mode == "before_restart" else 1
    ps_http.timeline_compact(
@@ -315,10 +315,6 @@ def test_pageserver_gc_compaction_idempotent(
            body={
                "scheduled": True,
                "sub_compaction": True,
-                "compact_key_range": {
-                    "start": "000000000000000000000000000000000000",
-                    "end": "030000000000000000000000000000000000",
-                },
                "sub_compaction_max_job_size_mb": 16,
            },
        )
@@ -336,19 +332,13 @@ def test_pageserver_gc_compaction_idempotent(
                body={
                    "scheduled": True,
                    "sub_compaction": True,
-                    "compact_key_range": {
-                        "start": "000000000000000000000000000000000000",
-                        "end": "030000000000000000000000000000000000",
-                    },
                    "sub_compaction_max_job_size_mb": 16,
                },
            )
            wait_until(compaction_finished, timeout=60)

    # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked)
-    env.pageserver.assert_log_contains(
-        "scheduled_compact_timeline.*picked .* layers for compaction"
-    )
+    env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction")

    # ensure we hit the duplicated layer key warning at least once: we did two compactions consecutively,
    # and the second one should have hit the duplicated layer key warning.
@@ -466,9 +456,7 @@ def test_pageserver_gc_compaction_interrupt(neon_env_builder: NeonEnvBuilder):
    wait_until(compaction_finished, timeout=60)

    # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked)
-    env.pageserver.assert_log_contains(
-        "scheduled_compact_timeline.*picked .* layers for compaction"
-    )
+    env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction")

    log.info("Validating at workload end ...")
    workload.validate(env.pageserver.id)