tests: improve stability of test_deletion_queue_recovery (#7325)

## Problem

As https://github.com/neondatabase/neon/issues/6092 points out, this
test was (ab)using a `fail_point!()` with 'pause', which occasionally
caused index uploads to hang on a stuck executor thread, resulting
in timeouts waiting for remote_consistent_lsn.

That is one of several failure modes, but by far the most frequent.

## Summary of changes

- Replace the `fail_point!` with a `sleep_millis_async!`, which is not only
async but also supports clean shutdown.
- Improve debugging: log the consistent LSN when scheduling an index
upload
- Tidy: remove an unnecessary checkpoint in the test code, where
last_flush_lsn_upload had just been called (this does a checkpoint
internally)
This commit is contained in:
John Spray
2024-04-05 18:01:31 +01:00
committed by GitHub
parent ec01292b55
commit 534c099b42
3 changed files with 10 additions and 9 deletions

View File

@@ -12,7 +12,7 @@ use pageserver_api::{
use serde::{de::DeserializeOwned, Serialize};
use tokio_util::sync::CancellationToken;
use url::Url;
use utils::{backoff, generation::Generation, id::NodeId};
use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
use crate::{
config::{NodeMetadata, PageServerConf},
@@ -210,7 +210,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
.collect(),
};
fail::fail_point!("control-plane-client-validate");
failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel);
if self.cancel.is_cancelled() {
return Err(RetryForeverError::ShuttingDown);
}
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;

View File

@@ -593,14 +593,14 @@ impl RemoteTimelineClient {
upload_queue: &mut UploadQueueInitialized,
metadata: TimelineMetadata,
) {
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
info!(
"scheduling metadata upload with {} files ({} changed)",
"scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)",
upload_queue.latest_files.len(),
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
);
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
let index_part = IndexPart::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,

View File

@@ -111,7 +111,6 @@ def generate_uploads_and_deletions(
last_flush_lsn_upload(
env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
)
ps_http.timeline_checkpoint(tenant_id, timeline_id)
# Compaction should generate some GC-eligible layers
for i in range(0, 2):
@@ -385,9 +384,8 @@ def test_deletion_queue_recovery(
if validate_before == ValidateBefore.NO_VALIDATE:
failpoints.append(
# Prevent deletion lists from being validated, we will test that they are
# dropped properly during recovery. 'pause' is okay here because we kill
# the pageserver with immediate=true
("control-plane-client-validate", "pause")
# dropped properly during recovery. This is such a long sleep as to be equivalent to "never"
("control-plane-client-validate", "return(3600000)")
)
ps_http.configure_failpoints(failpoints)