tests: refine test_secondary_background_downloads (#7829)

## Problem

This test relied on some sleeps, and was failing ~5% of the time.

## Summary of changes

Use log-watching rather than straight waits, and make timeouts more
generous for the CI environment.
This commit is contained in:
John Spray
2024-05-22 19:17:47 +01:00
committed by GitHub
parent ddd8ebd253
commit 014f822a78

View File

@@ -578,7 +578,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
default_download_period_secs = 60
# The upload period, which will also be the download once the secondary has seen its first heatmap
upload_period_secs = 20
upload_period_secs = 30
for _i in range(0, tenant_count):
tenant_id = TenantId.generate()
@@ -596,11 +596,26 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
tenant_timelines[tenant_id] = [timeline_a, timeline_b]
def await_log(pageserver, deadline, expression):
"""
Wrapper around assert_log_contains that waits with a deadline rather than timeout
"""
now = time.time()
if now > deadline:
raise RuntimeError(f"Timed out waiting for {expression}")
else:
timeout = int(deadline - now) + 1
try:
wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression)) # type: ignore
except:
log.error(f"Timed out waiting for '{expression}'")
raise
t_start = time.time()
# Wait long enough that the background downloads should happen; we expect all the inital layers
# of all the initial timelines to show up on the secondary location of each tenant.
time.sleep(default_download_period_secs * 1.5)
initial_download_deadline = time.time() + default_download_period_secs * 3
for tenant_id, timelines in tenant_timelines.items():
attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
@@ -608,8 +623,24 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
# We only have two: the other one must be secondary
ps_secondary = next(p for p in env.pageservers if p != ps_attached)
now = time.time()
if now > initial_download_deadline:
raise RuntimeError("Timed out waiting for initial secondary download")
else:
for timeline_id in timelines:
log.info(
f"Waiting for downloads of timeline {timeline_id} on secondary pageserver {ps_secondary.id}"
)
await_log(
ps_secondary,
initial_download_deadline,
f".*{timeline_id}.*Wrote timeline_detail.*",
)
for timeline_id in timelines:
log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}")
log.info(
f"Checking for secondary timeline downloads {timeline_id} on node {ps_secondary.id}"
)
# One or more layers should be present for all timelines
assert ps_secondary.list_layers(tenant_id, timeline_id)
@@ -617,7 +648,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1])
# Wait long enough for the secondary locations to see the deletion: 2x period plus a grace factor
time.sleep(upload_period_secs * 2.5)
deletion_deadline = time.time() + upload_period_secs * 3
for tenant_id, timelines in tenant_timelines.items():
attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
@@ -625,6 +656,16 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
# We only have two: the other one must be secondary
ps_secondary = next(p for p in env.pageservers if p != ps_attached)
expect_del_timeline = timelines[1]
log.info(
f"Waiting for deletion of timeline {expect_del_timeline} on secondary pageserver {ps_secondary.id}"
)
await_log(
ps_secondary,
deletion_deadline,
f".*Timeline no longer in heatmap.*{expect_del_timeline}.*",
)
# This one was not deleted
assert ps_secondary.list_layers(tenant_id, timelines[0])
@@ -632,7 +673,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
log.info(
f"Checking for secondary timeline deletion {tenant_id}/{timeline_id} on node {ps_secondary.id}"
)
assert not ps_secondary.list_layers(tenant_id, timelines[1])
assert not ps_secondary.list_layers(tenant_id, expect_del_timeline)
t_end = time.time()