feat(pageserver): continue from last incomplete image layer creation (#10660)

## Problem

close https://github.com/neondatabase/neon/issues/10651

## Summary of changes

* Image layer creation starts from the next partition of the last
processed partition if the previous attempt was not complete.
* Add tests.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
Alex Chi Z.
2025-02-05 12:35:39 -05:00
committed by GitHub
parent fba22a7123
commit 133b89a83d
3 changed files with 129 additions and 21 deletions

View File

@@ -29,6 +29,21 @@ AGGRESSIVE_COMPACTION_TENANT_CONF = {
# "lsn_lease_length": "0s", -- TODO: would cause branch creation errors, should fix later
}
PREEMPT_COMPACTION_TENANT_CONF = {
"gc_period": "5s",
"compaction_period": "5s",
# Small checkpoint distance to create many layers
"checkpoint_distance": 1024**2,
# Compact small layers
"compaction_target_size": 1024**2,
"image_creation_threshold": 1,
"image_creation_preempt_threshold": 1,
# compact more frequently
"compaction_threshold": 3,
"compaction_upper_limit": 6,
"lsn_lease_length": "0s",
}
@skip_in_debug_build("only run with release build")
@pytest.mark.parametrize(
@@ -36,7 +51,8 @@ AGGRESSIVE_COMPACTION_TENANT_CONF = {
[PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED],
)
def test_pageserver_compaction_smoke(
neon_env_builder: NeonEnvBuilder, wal_receiver_protocol: PageserverWalReceiverProtocol
neon_env_builder: NeonEnvBuilder,
wal_receiver_protocol: PageserverWalReceiverProtocol,
):
"""
This is a smoke test that compaction kicks in. The workload repeatedly churns
@@ -54,7 +70,8 @@ def test_pageserver_compaction_smoke(
page_cache_size=10
"""
env = neon_env_builder.init_start(initial_tenant_conf=AGGRESSIVE_COMPACTION_TENANT_CONF)
conf = AGGRESSIVE_COMPACTION_TENANT_CONF.copy()
env = neon_env_builder.init_start(initial_tenant_conf=conf)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
@@ -113,6 +130,41 @@ page_cache_size=10
assert vectored_average < 8
@skip_in_debug_build("only run with release build")
def test_pageserver_compaction_preempt(
neon_env_builder: NeonEnvBuilder,
):
# Ideally we should be able to do unit tests for this, but we need real Postgres
# WALs in order to do unit testing...
conf = PREEMPT_COMPACTION_TENANT_CONF.copy()
env = neon_env_builder.init_start(initial_tenant_conf=conf)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
row_count = 200000
churn_rounds = 10
ps_http = env.pageserver.http_client()
workload = Workload(env, tenant_id, timeline_id)
workload.init(env.pageserver.id)
log.info("Writing initial data ...")
workload.write_rows(row_count, env.pageserver.id)
for i in range(1, churn_rounds + 1):
log.info(f"Running churn round {i}/{churn_rounds} ...")
workload.churn_rows(row_count, env.pageserver.id, upload=False)
workload.validate(env.pageserver.id)
ps_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True)
log.info("Validating at workload end ...")
workload.validate(env.pageserver.id)
# ensure image layer creation gets preempted and then resumed
env.pageserver.assert_log_contains("resuming image layer creation")
@skip_in_debug_build("only run with release build")
@pytest.mark.parametrize(
"with_branches",