tests: further stabilize test_deletion_queue_recovery (#7335)

This is the other main failure mode called out in #6092: the test
can shut down the pageserver while it has "future layers" in the index,
which results in unexpected stats after restart.

We can avoid this nondeterminism by shutting down the endpoint, flushing
everything from SK to PS, checkpointing, and then waiting for that final
LSN to be uploaded. This is more heavyweight than most of our tests
require, but useful in the case of tests that expect a particular
behavior after restart wrt layer deletions.
This commit is contained in:
John Spray
2024-04-07 22:21:18 +01:00
committed by GitHub
parent 74b2314a5d
commit 0788760451
2 changed files with 21 additions and 1 deletions

View File

@@ -22,6 +22,7 @@ from fixtures.neon_fixtures import (
NeonPageserver,
PgBin,
S3Scrubber,
flush_ep_to_pageserver,
last_flush_lsn_upload,
)
from fixtures.pageserver.http import PageserverApiException
@@ -30,6 +31,7 @@ from fixtures.pageserver.utils import (
list_prefix,
wait_for_last_record_lsn,
wait_for_upload,
wait_for_upload_queue_empty,
)
from fixtures.remote_storage import (
RemoteStorageKind,
@@ -120,6 +122,17 @@ def generate_uploads_and_deletions(
print_gc_result(gc_result)
assert gc_result["layers_removed"] > 0
# Stop endpoint and flush all data to pageserver, then checkpoint it: this
# ensures that the pageserver is in a fully idle state: there will be no more
# background ingest, no more uploads pending, and therefore no non-determinism
# in subsequent actions like pageserver restarts.
final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id)
ps_http.timeline_checkpoint(tenant_id, timeline_id)
# Finish uploads
wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn)
# Finish all remote writes (including deletions)
wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
def read_all(
env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None

View File

@@ -1187,7 +1187,14 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
storcon_cli(["node-configure", "--node-id", "1", "--scheduling", "pause"])
assert "Pause" in storcon_cli(["nodes"])[3]
# Make a node offline
# We will simulate a node death and then mark it offline
env.pageservers[0].stop(immediate=True)
# Sleep to make it unlikely that the controller's heartbeater will race handling
# a /utilization response internally, such that it marks the node back online. In
# real life there would always be a longer delay than this between a node failing
# and a human intervening.
time.sleep(2)
storcon_cli(["node-configure", "--node-id", "1", "--availability", "offline"])
assert "Offline" in storcon_cli(["nodes"])[3]