mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-05 20:42:54 +00:00
tests: improve stability of test_deletion_queue_recovery (#7325)
## Problem As https://github.com/neondatabase/neon/issues/6092 points out, this test was (ab)using a failpoint!() with 'pause', which was occasionally causing index uploads to get hung on a stuck executor thread, resulting in timeouts waiting for remote_consistent_lsn. That is one of several failure modes, but by far the most frequent. ## Summary of changes - Replace the failpoint! with a `sleep_millis_async`, which is not only async but also supports clean shutdown. - Improve debugging: log the consistent LSN when scheduling an index upload - Tidy: remove an unnecessary checkpoint in the test code, where last_flush_lsn_upload had just been called (this does a checkpoint internally)
This commit is contained in:
@@ -12,7 +12,7 @@ use pageserver_api::{
|
||||
use serde::{de::DeserializeOwned, Serialize};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use url::Url;
|
||||
use utils::{backoff, generation::Generation, id::NodeId};
|
||||
use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
|
||||
|
||||
use crate::{
|
||||
config::{NodeMetadata, PageServerConf},
|
||||
@@ -210,7 +210,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
.collect(),
|
||||
};
|
||||
|
||||
fail::fail_point!("control-plane-client-validate");
|
||||
failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel);
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(RetryForeverError::ShuttingDown);
|
||||
}
|
||||
|
||||
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
|
||||
|
||||
|
||||
@@ -593,14 +593,14 @@ impl RemoteTimelineClient {
|
||||
upload_queue: &mut UploadQueueInitialized,
|
||||
metadata: TimelineMetadata,
|
||||
) {
|
||||
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
|
||||
|
||||
info!(
|
||||
"scheduling metadata upload with {} files ({} changed)",
|
||||
"scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)",
|
||||
upload_queue.latest_files.len(),
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
|
||||
);
|
||||
|
||||
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
|
||||
|
||||
let index_part = IndexPart::new(
|
||||
upload_queue.latest_files.clone(),
|
||||
disk_consistent_lsn,
|
||||
|
||||
@@ -111,7 +111,6 @@ def generate_uploads_and_deletions(
|
||||
last_flush_lsn_upload(
|
||||
env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
|
||||
)
|
||||
ps_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
|
||||
# Compaction should generate some GC-elegible layers
|
||||
for i in range(0, 2):
|
||||
@@ -385,9 +384,8 @@ def test_deletion_queue_recovery(
|
||||
if validate_before == ValidateBefore.NO_VALIDATE:
|
||||
failpoints.append(
|
||||
# Prevent deletion lists from being validated, we will test that they are
|
||||
# dropped properly during recovery. 'pause' is okay here because we kill
|
||||
# the pageserver with immediate=true
|
||||
("control-plane-client-validate", "pause")
|
||||
# dropped properly during recovery. This is such a long sleep as to be equivalent to "never"
|
||||
("control-plane-client-validate", "return(3600000)")
|
||||
)
|
||||
|
||||
ps_http.configure_failpoints(failpoints)
|
||||
|
||||
Reference in New Issue
Block a user