mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 22:10:39 +00:00
tests: stabilize + extend deletion queue recovery test (#5457)
## Problem This test was unstable when run in parallel with lots of others: if the pageserver stayed up long enough for some of the deletions to get validated, they won't be discarded on restart the way the test expects when keep_attachment=True. This was a test bug, not a pageserver bug. ## Summary of changes - Add failpoints to control plane api client - Use failpoint to pause validation in the test to cover the case where it had been flaky - Add a metric for the number of deleted keys validated - Add a permutation to the test to additionally exercise the case where we _do_ validate lists before restart: this is a coverage enhancement that seemed sensible when realizing that the test was relying on nothing being validated before restart. - the test will now always enter the restart with nothing or everything validated.
This commit is contained in:
@@ -133,6 +133,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
node_id: self.node_id,
|
||||
};
|
||||
|
||||
fail::fail_point!("control-plane-client-re-attach");
|
||||
|
||||
let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
|
||||
tracing::info!(
|
||||
"Received re-attach response with {} tenants",
|
||||
@@ -168,6 +170,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
.collect(),
|
||||
};
|
||||
|
||||
fail::fail_point!("control-plane-client-validate");
|
||||
|
||||
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
|
||||
|
||||
Ok(response
|
||||
|
||||
@@ -220,6 +220,8 @@ where
|
||||
warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
|
||||
metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
|
||||
mutated = true;
|
||||
} else {
|
||||
metrics::DELETION_QUEUE.keys_validated.inc_by(tenant.len() as u64);
|
||||
}
|
||||
this_list_valid
|
||||
});
|
||||
|
||||
@@ -967,6 +967,7 @@ pub(crate) struct DeletionQueueMetrics {
|
||||
pub(crate) keys_submitted: IntCounter,
|
||||
pub(crate) keys_dropped: IntCounter,
|
||||
pub(crate) keys_executed: IntCounter,
|
||||
pub(crate) keys_validated: IntCounter,
|
||||
pub(crate) dropped_lsn_updates: IntCounter,
|
||||
pub(crate) unexpected_errors: IntCounter,
|
||||
pub(crate) remote_errors: IntCounterVec,
|
||||
@@ -988,7 +989,13 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
|
||||
|
||||
keys_executed: register_int_counter!(
|
||||
"pageserver_deletion_queue_executed_total",
|
||||
"Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed."
|
||||
"Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed to completion"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
|
||||
keys_validated: register_int_counter!(
|
||||
"pageserver_deletion_queue_validated_total",
|
||||
"Number of keys validated for deletion. Sum with pageserver_deletion_queue_dropped_total for the total number of keys that have passed through the validation stage."
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
|
||||
|
||||
@@ -116,6 +116,10 @@ def get_deletion_queue_submitted(ps_http) -> int:
|
||||
return get_metric_or_0(ps_http, "pageserver_deletion_queue_submitted_total")
|
||||
|
||||
|
||||
def get_deletion_queue_validated(ps_http) -> int:
|
||||
return get_metric_or_0(ps_http, "pageserver_deletion_queue_validated_total")
|
||||
|
||||
|
||||
def get_deletion_queue_dropped(ps_http) -> int:
|
||||
return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_total")
|
||||
|
||||
@@ -273,12 +277,15 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("keep_attachment", [True, False])
|
||||
@pytest.mark.parametrize("validate_before", [True, False])
|
||||
def test_deletion_queue_recovery(
|
||||
neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool
|
||||
neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool, validate_before: bool
|
||||
):
|
||||
"""
|
||||
:param keep_attachment: If true, we re-attach after restart. Else, we act as if some other
|
||||
node took the attachment while we were restarting.
|
||||
:param validate_before: If true, we wait for deletions to be validated before restart. This
|
||||
makes them elegible to be executed after restart, if the same node keeps the attachment.
|
||||
"""
|
||||
neon_env_builder.enable_generations = True
|
||||
neon_env_builder.enable_pageserver_remote_storage(
|
||||
@@ -288,12 +295,20 @@ def test_deletion_queue_recovery(
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
# Prevent deletion lists from being executed, to build up some backlog of deletions
|
||||
ps_http.configure_failpoints(
|
||||
[
|
||||
("deletion-queue-before-execute", "return"),
|
||||
]
|
||||
)
|
||||
failpoints = [
|
||||
# Prevent deletion lists from being executed, to build up some backlog of deletions
|
||||
("deletion-queue-before-execute", "return"),
|
||||
]
|
||||
|
||||
if not validate_before:
|
||||
failpoints.append(
|
||||
# Prevent deletion lists from being validated, we will test that they are
|
||||
# dropped properly during recovery. 'pause' is okay here because we kill
|
||||
# the pageserver with immediate=true
|
||||
("control-plane-client-validate", "pause")
|
||||
)
|
||||
|
||||
ps_http.configure_failpoints(failpoints)
|
||||
|
||||
generate_uploads_and_deletions(env)
|
||||
|
||||
@@ -305,6 +320,16 @@ def test_deletion_queue_recovery(
|
||||
assert get_deletion_queue_unexpected_errors(ps_http) == 0
|
||||
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
|
||||
|
||||
if validate_before:
|
||||
|
||||
def assert_validation_complete():
|
||||
assert get_deletion_queue_submitted(ps_http) == get_deletion_queue_validated(ps_http)
|
||||
|
||||
wait_until(20, 1, assert_validation_complete)
|
||||
# A short wait to let the DeletionHeader get written out, as this happens after
|
||||
# the validated count gets incremented.
|
||||
time.sleep(1)
|
||||
|
||||
log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
|
||||
env.pageserver.stop(immediate=True)
|
||||
|
||||
@@ -327,14 +352,17 @@ def test_deletion_queue_recovery(
|
||||
ps_http.deletion_queue_flush(execute=True)
|
||||
wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))
|
||||
|
||||
if keep_attachment:
|
||||
# If we kept the attachment, then our pre-restart deletions should have executed
|
||||
# successfully
|
||||
if keep_attachment or validate_before:
|
||||
# - If we kept the attachment, then our pre-restart deletions should execute
|
||||
# because on re-attach they were from the immediately preceding generation
|
||||
# - If we validated before restart, then the deletions should execute because the
|
||||
# deletion queue header records a validated deletion list sequence number.
|
||||
assert get_deletion_queue_executed(ps_http) == before_restart_depth
|
||||
else:
|
||||
env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
|
||||
|
||||
# If we lost the attachment, we should have dropped our pre-restart deletions.
|
||||
assert get_deletion_queue_dropped(ps_http) == before_restart_depth
|
||||
env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
|
||||
|
||||
assert get_deletion_queue_unexpected_errors(ps_http) == 0
|
||||
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
|
||||
|
||||
Reference in New Issue
Block a user