tests: make test_scrubber_physical_gc_ancestors more stable (#8453)

## Problem

This test sometimes found that ancestors were getting cleaned up before
it had done any compaction.

Compaction was happening implicitly via Workload.

Example:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8298/10032173390/index.html#testresult/fb04786402f80822/retries

## Summary of changes

- Set upload=False when writing data after shard split, to avoid doing a
checkpoint
- Add a checkpoint_period & explicit wait for uploads so that we ensure
data lands in S3 without doing a checkpoint
This commit is contained in:
John Spray
2024-07-23 12:57:57 +01:00
committed by GitHub
parent 35854928d9
commit 80c8ceacbc
3 changed files with 29 additions and 11 deletions

View File

@@ -1676,6 +1676,10 @@ async fn timeline_checkpoint_handler(
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
flags |= CompactFlags::ForceImageLayerCreation;
}
// By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload.
let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true);
let wait_until_uploaded =
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
@@ -1692,15 +1696,17 @@ async fn timeline_checkpoint_handler(
}
})?;
timeline
.compact(&cancel, flags, &ctx)
.await
.map_err(|e|
match e {
CompactionError::ShuttingDown => ApiError::ShuttingDown,
CompactionError::Other(e) => ApiError::InternalServerError(e)
}
)?;
if compact {
timeline
.compact(&cancel, flags, &ctx)
.await
.map_err(|e|
match e {
CompactionError::ShuttingDown => ApiError::ShuttingDown,
CompactionError::Other(e) => ApiError::InternalServerError(e)
}
)?;
}
if wait_until_uploaded {
timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;

View File

@@ -662,6 +662,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
force_repartition=False,
force_image_layer_creation=False,
wait_until_uploaded=False,
compact: Optional[bool] = None,
):
self.is_testing_enabled_or_skip()
query = {}
@@ -672,6 +673,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
if wait_until_uploaded:
query["wait_until_uploaded"] = "true"
if compact is not None:
query["compact"] = "true" if compact else "false"
log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",

View File

@@ -209,9 +209,17 @@ def test_scrubber_physical_gc_ancestors(
new_shard_count = 4
assert shard_count is None or new_shard_count > shard_count
shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count)
env.storage_controller.reconcile_until_idle() # Move shards to their final locations immediately
# Make sure child shards have some layers
workload.write_rows(100)
# Make sure child shards have some layers. Do not force upload, because the test helper calls checkpoint, which
# compacts, and we only want to do tha explicitly later in the test.
workload.write_rows(100, upload=False)
for shard in shards:
ps = env.get_tenant_pageserver(shard)
log.info(f"Waiting for shard {shard} on pageserver {ps.id}")
ps.http_client().timeline_checkpoint(
shard, timeline_id, compact=False, wait_until_uploaded=True
)
# Flush deletion queue so that we don't leave any orphan layers in the parent that will confuse subsequent checks: once
# a shard is split, any layers in its prefix that aren't referenced by a child will be considered GC'able, even