tests: stabilize + extend deletion queue recovery test (#5457)

## Problem

This test was unstable when run in parallel with lots of others: if the
pageserver stayed up long enough for some of the deletions to get
validated, they won't be discarded on restart the way the test expects
when keep_attachment=True. This was a test bug, not a pageserver bug.

## Summary of changes

- Add failpoints to control plane api client
- Use failpoint to pause validation in the test to cover the case where
it had been flaky
- Add a metric for the number of deleted keys validated
- Add a permutation to the test to additionally exercise the case where
we _do_ validate lists before restart: this is a coverage enhancement
that seemed sensible when realizing that the test was relying on nothing
being validated before restart.
- the test will now always enter the restart with nothing or everything
validated.
This commit is contained in:
John Spray
2023-10-05 11:22:05 +01:00
committed by GitHub
parent baa5fa1e77
commit 7cbb39063a
4 changed files with 53 additions and 12 deletions

View File

@@ -133,6 +133,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
node_id: self.node_id,
};
fail::fail_point!("control-plane-client-re-attach");
let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
tracing::info!(
"Received re-attach response with {} tenants",
@@ -168,6 +170,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
.collect(),
};
fail::fail_point!("control-plane-client-validate");
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
Ok(response

View File

@@ -220,6 +220,8 @@ where
warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
mutated = true;
} else {
metrics::DELETION_QUEUE.keys_validated.inc_by(tenant.len() as u64);
}
this_list_valid
});

View File

@@ -967,6 +967,7 @@ pub(crate) struct DeletionQueueMetrics {
pub(crate) keys_submitted: IntCounter,
pub(crate) keys_dropped: IntCounter,
pub(crate) keys_executed: IntCounter,
pub(crate) keys_validated: IntCounter,
pub(crate) dropped_lsn_updates: IntCounter,
pub(crate) unexpected_errors: IntCounter,
pub(crate) remote_errors: IntCounterVec,
@@ -988,7 +989,13 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
keys_executed: register_int_counter!(
"pageserver_deletion_queue_executed_total",
"Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed."
"Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed to completion"
)
.expect("failed to define a metric"),
keys_validated: register_int_counter!(
"pageserver_deletion_queue_validated_total",
"Number of keys validated for deletion. Sum with pageserver_deletion_queue_dropped_total for the total number of keys that have passed through the validation stage."
)
.expect("failed to define a metric"),

View File

@@ -116,6 +116,10 @@ def get_deletion_queue_submitted(ps_http) -> int:
return get_metric_or_0(ps_http, "pageserver_deletion_queue_submitted_total")
def get_deletion_queue_validated(ps_http) -> int:
return get_metric_or_0(ps_http, "pageserver_deletion_queue_validated_total")
def get_deletion_queue_dropped(ps_http) -> int:
return get_metric_or_0(ps_http, "pageserver_deletion_queue_dropped_total")
@@ -273,12 +277,15 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize("keep_attachment", [True, False])
@pytest.mark.parametrize("validate_before", [True, False])
def test_deletion_queue_recovery(
neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool
neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool, validate_before: bool
):
"""
:param keep_attachment: If true, we re-attach after restart. Else, we act as if some other
node took the attachment while we were restarting.
:param validate_before: If true, we wait for deletions to be validated before restart. This
makes them elegible to be executed after restart, if the same node keeps the attachment.
"""
neon_env_builder.enable_generations = True
neon_env_builder.enable_pageserver_remote_storage(
@@ -288,12 +295,20 @@ def test_deletion_queue_recovery(
ps_http = env.pageserver.http_client()
# Prevent deletion lists from being executed, to build up some backlog of deletions
ps_http.configure_failpoints(
[
("deletion-queue-before-execute", "return"),
]
)
failpoints = [
# Prevent deletion lists from being executed, to build up some backlog of deletions
("deletion-queue-before-execute", "return"),
]
if not validate_before:
failpoints.append(
# Prevent deletion lists from being validated, we will test that they are
# dropped properly during recovery. 'pause' is okay here because we kill
# the pageserver with immediate=true
("control-plane-client-validate", "pause")
)
ps_http.configure_failpoints(failpoints)
generate_uploads_and_deletions(env)
@@ -305,6 +320,16 @@ def test_deletion_queue_recovery(
assert get_deletion_queue_unexpected_errors(ps_http) == 0
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0
if validate_before:
def assert_validation_complete():
assert get_deletion_queue_submitted(ps_http) == get_deletion_queue_validated(ps_http)
wait_until(20, 1, assert_validation_complete)
# A short wait to let the DeletionHeader get written out, as this happens after
# the validated count gets incremented.
time.sleep(1)
log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
env.pageserver.stop(immediate=True)
@@ -327,14 +352,17 @@ def test_deletion_queue_recovery(
ps_http.deletion_queue_flush(execute=True)
wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))
if keep_attachment:
# If we kept the attachment, then our pre-restart deletions should have executed
# successfully
if keep_attachment or validate_before:
# - If we kept the attachment, then our pre-restart deletions should execute
# because on re-attach they were from the immediately preceding generation
# - If we validated before restart, then the deletions should execute because the
# deletion queue header records a validated deletion list sequence number.
assert get_deletion_queue_executed(ps_http) == before_restart_depth
else:
env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
# If we lost the attachment, we should have dropped our pre-restart deletions.
assert get_deletion_queue_dropped(ps_http) == before_restart_depth
env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
assert get_deletion_queue_unexpected_errors(ps_http) == 0
assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0