test(pageserver): fix test_pageserver_gc_compaction_idempotent (#10833)

## Problem

ref https://github.com/neondatabase/neon/issues/10517

## Summary of changes

For some reasons the job split algorithm decides to have different image
coverage range for two compactions before/after restart. So we remove
the subcompaction key range and let it generate an image covering the
full range, which should make the test more stable.

Also slightly tuned the logging span.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
Alex Chi Z.
2025-02-18 11:06:20 -05:00
committed by GitHub
parent f36ec5c84b
commit f9a063e2e9
3 changed files with 10 additions and 25 deletions

View File

@@ -3101,6 +3101,9 @@ impl Tenant {
if let Some(queue) = queue {
outcome = queue
.iteration(cancel, ctx, &self.gc_block, &timeline)
.instrument(
info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id),
)
.await?;
}
}

View File

@@ -301,18 +301,12 @@ impl GcCompactionQueue {
let mut guard = self.inner.lock().unwrap();
guard.gc_guards.insert(id, gc_guard);
}
let _ = timeline
.compact_with_options(cancel, options, ctx)
.instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
.await?;
let _ = timeline.compact_with_options(cancel, options, ctx).await?;
self.notify_and_unblock(id);
}
}
GcCompactionQueueItem::SubCompactionJob(options) => {
let _ = timeline
.compact_with_options(cancel, options, ctx)
.instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
.await?;
let _ = timeline.compact_with_options(cancel, options, ctx).await?;
}
GcCompactionQueueItem::Notify(id) => {
self.notify_and_unblock(id);

View File

@@ -236,9 +236,7 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_b
wait_until(compaction_finished, timeout=60)
# ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked)
env.pageserver.assert_log_contains(
"scheduled_compact_timeline.*picked .* layers for compaction"
)
env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction")
log.info("Validating at workload end ...")
workload.validate(env.pageserver.id)
@@ -300,6 +298,8 @@ def test_pageserver_gc_compaction_idempotent(
workload.churn_rows(row_count, env.pageserver.id)
env.create_branch("child_branch") # so that we have a retain_lsn
workload.churn_rows(row_count, env.pageserver.id)
env.create_branch("child_branch_2") # so that we have another retain_lsn
workload.churn_rows(row_count, env.pageserver.id)
# compact 3 times if mode is before_restart
n_compactions = 3 if compaction_mode == "before_restart" else 1
ps_http.timeline_compact(
@@ -315,10 +315,6 @@ def test_pageserver_gc_compaction_idempotent(
body={
"scheduled": True,
"sub_compaction": True,
"compact_key_range": {
"start": "000000000000000000000000000000000000",
"end": "030000000000000000000000000000000000",
},
"sub_compaction_max_job_size_mb": 16,
},
)
@@ -336,19 +332,13 @@ def test_pageserver_gc_compaction_idempotent(
body={
"scheduled": True,
"sub_compaction": True,
"compact_key_range": {
"start": "000000000000000000000000000000000000",
"end": "030000000000000000000000000000000000",
},
"sub_compaction_max_job_size_mb": 16,
},
)
wait_until(compaction_finished, timeout=60)
# ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked)
env.pageserver.assert_log_contains(
"scheduled_compact_timeline.*picked .* layers for compaction"
)
env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction")
# ensure we hit the duplicated layer key warning at least once: we did two compactions consecutively,
# and the second one should have hit the duplicated layer key warning.
@@ -466,9 +456,7 @@ def test_pageserver_gc_compaction_interrupt(neon_env_builder: NeonEnvBuilder):
wait_until(compaction_finished, timeout=60)
# ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked)
env.pageserver.assert_log_contains(
"scheduled_compact_timeline.*picked .* layers for compaction"
)
env.pageserver.assert_log_contains("gc_compact_timeline.*picked .* layers for compaction")
log.info("Validating at workload end ...")
workload.validate(env.pageserver.id)