diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index ae66236499..cccc64685b 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -212,7 +212,7 @@ where /// Files ending with this suffix will be ignored and erased /// during recovery as startup. -const TEMP_SUFFIX: &str = ".tmp"; +const TEMP_SUFFIX: &str = "tmp"; #[serde_as] #[derive(Debug, Serialize, Deserialize)] diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs index 75fc0524b7..e846340373 100644 --- a/pageserver/src/deletion_queue/list_writer.rs +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -230,6 +230,7 @@ impl ListWriter { let list_name_pattern = Regex::new("(?[a-zA-Z0-9]{16})-(?[a-zA-Z0-9]{2}).list").unwrap(); + let temp_extension = format!(".{TEMP_SUFFIX}"); let header_path = self.conf.deletion_header_path(); let mut seqs: Vec = Vec::new(); while let Some(dentry) = dir.next_entry().await? { @@ -241,7 +242,7 @@ impl ListWriter { continue; } - if dentry_str.ends_with(TEMP_SUFFIX) { + if dentry_str.ends_with(&temp_extension) { info!("Cleaning up temporary file {dentry_str}"); let absolute_path = deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path")); diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 64c471fb92..1b9f48706f 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -10,6 +10,7 @@ of the pageserver are: """ +import enum import re import time from typing import Optional @@ -276,15 +277,28 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): assert get_deletion_queue_unexpected_errors(ps_http) == 0 -@pytest.mark.parametrize("keep_attachment", [True, False]) -@pytest.mark.parametrize("validate_before", [True, False]) +class KeepAttachment(str, enum.Enum): + KEEP = "keep" + LOSE = "lose" + + +class ValidateBefore(str, enum.Enum): + VALIDATE = "validate" + NO_VALIDATE = "no-validate" + + +@pytest.mark.parametrize("keep_attachment", [KeepAttachment.KEEP, KeepAttachment.LOSE]) +@pytest.mark.parametrize("validate_before", [ValidateBefore.VALIDATE, ValidateBefore.NO_VALIDATE]) def test_deletion_queue_recovery( - neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, keep_attachment: bool, validate_before: bool + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + keep_attachment: KeepAttachment, + validate_before: ValidateBefore, ): """ - :param keep_attachment: If true, we re-attach after restart. Else, we act as if some other + :param keep_attachment: whether to re-attach after restart. Else, we act as if some other node took the attachment while we were restarting. - :param validate_before: If true, we wait for deletions to be validated before restart. This + :param validate_before: whether to wait for deletions to be validated before restart. This makes them elegible to be executed after restart, if the same node keeps the attachment. """ neon_env_builder.enable_generations = True @@ -300,7 +314,7 @@ def test_deletion_queue_recovery( ("deletion-queue-before-execute", "return"), ] - if not validate_before: + if validate_before == ValidateBefore.NO_VALIDATE: failpoints.append( # Prevent deletion lists from being validated, we will test that they are # dropped properly during recovery. 'pause' is okay here because we kill @@ -320,20 +334,25 @@ def test_deletion_queue_recovery( assert get_deletion_queue_unexpected_errors(ps_http) == 0 assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 - if validate_before: + if validate_before == ValidateBefore.VALIDATE: def assert_validation_complete(): assert get_deletion_queue_submitted(ps_http) == get_deletion_queue_validated(ps_http) wait_until(20, 1, assert_validation_complete) - # A short wait to let the DeletionHeader get written out, as this happens after - # the validated count gets incremented. - time.sleep(1) + + # The validatated keys statistic advances before the header is written, so we + # also wait to see the header hit the disk: this seems paranoid but the race + # can really happen on a heavily overloaded test machine. + def assert_header_written(): + assert (env.pageserver.workdir / "deletion" / "header-01").exists() + + wait_until(20, 1, assert_header_written) log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued") env.pageserver.stop(immediate=True) - if not keep_attachment: + if keep_attachment == KeepAttachment.LOSE: some_other_pageserver = 101010 assert env.attachment_service is not None env.attachment_service.attach_hook(env.initial_tenant, some_other_pageserver) @@ -352,7 +371,7 @@ def test_deletion_queue_recovery( ps_http.deletion_queue_flush(execute=True) wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0)) - if keep_attachment or validate_before: + if keep_attachment == KeepAttachment.KEEP or validate_before == ValidateBefore.VALIDATE: # - If we kept the attachment, then our pre-restart deletions should execute # because on re-attach they were from the immediately preceding generation # - If we validated before restart, then the deletions should execute because the