fix(test): only test num of L1 layers in compaction smoke test (#9186)

close https://github.com/neondatabase/neon/issues/9160

For whatever reason, pg17's WAL pattern seems different from others,
which triggers some flaky behavior within the compaction smoke test.

## Summary of changes

* Run L0 compaction before proceeding with the read benchmark.
* So that we can ensure the num of L0 layers is 0 and test the
compaction behavior only with L1 layers.

We have a threshold for triggering L0 compaction. In some cases, the
test case did not produce enough L0 layers to do a L0 compaction,
therefore leaving the layer map with 3+ L0 layers above the L1 layers.
This increases the average read depth for the timeline.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
Alex Chi Z.
2024-10-02 12:42:35 -04:00
committed by GitHub
parent 38a8dcab9f
commit 700885471f
7 changed files with 63 additions and 11 deletions

View File

@@ -586,6 +586,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
timeline_id: TimelineId,
force_repartition=False,
force_image_layer_creation=False,
force_l0_compaction=False,
wait_until_uploaded=False,
enhanced_gc_bottom_most_compaction=False,
):
@@ -595,6 +596,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
query["force_repartition"] = "true"
if force_image_layer_creation:
query["force_image_layer_creation"] = "true"
if force_l0_compaction:
query["force_l0_compaction"] = "true"
if wait_until_uploaded:
query["wait_until_uploaded"] = "true"
if enhanced_gc_bottom_most_compaction:
@@ -701,6 +704,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
timeline_id: TimelineId,
force_repartition=False,
force_image_layer_creation=False,
force_l0_compaction=False,
wait_until_uploaded=False,
compact: Optional[bool] = None,
**kwargs,
@@ -711,6 +715,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
query["force_repartition"] = "true"
if force_image_layer_creation:
query["force_image_layer_creation"] = "true"
if force_l0_compaction:
query["force_l0_compaction"] = "true"
if wait_until_uploaded:
query["wait_until_uploaded"] = "true"

View File

@@ -175,7 +175,9 @@ class Workload:
if upload:
# Wait for written data to be uploaded to S3 (force a checkpoint to trigger upload)
ps_http.timeline_checkpoint(
tenant_shard_id, self.timeline_id, wait_until_uploaded=True
tenant_shard_id,
self.timeline_id,
wait_until_uploaded=True,
)
log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
else:

View File

@@ -63,7 +63,10 @@ page_cache_size=10
log.info(f"Running churn round {i}/{churn_rounds} ...")
workload.churn_rows(row_count, env.pageserver.id)
ps_http.timeline_compact(tenant_id, timeline_id)
# Force L0 compaction to ensure the number of layers is within bounds; we don't want to count L0 layers
# in this benchmark. In other words, this smoke test ensures number of L1 layers are bound.
ps_http.timeline_compact(tenant_id, timeline_id, force_l0_compaction=True)
assert ps_http.perf_info(tenant_id, timeline_id)[0]["num_of_l0"] <= 1
log.info("Validating at workload end ...")
workload.validate(env.pageserver.id)