From 89ee8f202835a3889dfb327bd275f6df9ebd5cc4 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 31 Jul 2023 10:43:12 +0300 Subject: [PATCH] fix: demote warnings, fix flakyness (#4837) `WARN ... found future (image|delta) layer` are not actionable log lines. They don't need to be warnings. `info!` is enough. This also fixes some known but not tracked flakyness in [`test_remote_timeline_client_calls_started_metric`][evidence]. [evidence]: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4829/5683495367/index.html#/testresult/34fe79e24729618b Closes #3369. Closes #4473. --- pageserver/src/tenant/timeline.rs | 8 ++++---- test_runner/regress/test_gc_cutoff.py | 4 ---- test_runner/regress/test_pageserver_restart.py | 4 ---- test_runner/regress/test_recovery.py | 4 ---- test_runner/regress/test_remote_storage.py | 3 --- 5 files changed, 4 insertions(+), 19 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4f9c0b08b8..c7974e9c00 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1600,7 +1600,7 @@ impl Timeline { if let Some(imgfilename) = ImageFileName::parse_str(&fname) { // create an ImageLayer struct for each image file. if imgfilename.lsn > disk_consistent_lsn { - warn!( + info!( "found future image layer {} on timeline {} disk_consistent_lsn is {}", imgfilename, self.timeline_id, disk_consistent_lsn ); @@ -1632,7 +1632,7 @@ impl Timeline { // is 102, then it might not have been fully flushed to disk // before crash. if deltafilename.lsn_range.end > disk_consistent_lsn + 1 { - warn!( + info!( "found future delta layer {} on timeline {} disk_consistent_lsn is {}", deltafilename, self.timeline_id, disk_consistent_lsn ); @@ -1774,7 +1774,7 @@ impl Timeline { match remote_layer_name { LayerFileName::Image(imgfilename) => { if imgfilename.lsn > up_to_date_disk_consistent_lsn { - warn!( + info!( "found future image layer {} on timeline {} remote_consistent_lsn is {}", imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn ); @@ -1799,7 +1799,7 @@ impl Timeline { // is 102, then it might not have been fully flushed to disk // before crash. if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 { - warn!( + info!( "found future delta layer {} on timeline {} remote_consistent_lsn is {}", deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn ); diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py index 6e2a0622f1..f58abb4575 100644 --- a/test_runner/regress/test_gc_cutoff.py +++ b/test_runner/regress/test_gc_cutoff.py @@ -14,10 +14,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): env = neon_env_builder.init_start() - # These warnings are expected, when the pageserver is restarted abruptly - env.pageserver.allowed_errors.append(".*found future image layer.*") - env.pageserver.allowed_errors.append(".*found future delta layer.*") - pageserver_http = env.pageserver.http_client() # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 6da5503fb1..b8ddbe3ec1 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -72,10 +72,6 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - # These warnings are expected, when the pageserver is restarted abruptly - env.pageserver.allowed_errors.append(".*found future image layer.*") - env.pageserver.allowed_errors.append(".*found future delta layer.*") - # Use a tiny checkpoint distance, to create a lot of layers quickly. # That allows us to stress the compaction and layer flushing logic more. tenant, _ = env.neon_cli.create_tenant( diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 552825cf08..9d7a4a8fd6 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -15,10 +15,6 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.pageserver.is_testing_enabled_or_skip() - # These warnings are expected, when the pageserver is restarted abruptly - env.pageserver.allowed_errors.append(".*found future delta layer.*") - env.pageserver.allowed_errors.append(".*found future image layer.*") - # Create a branch for us env.neon_cli.create_branch("test_pageserver_recovery", "main") diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 43f0fb718f..7c04ed9017 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -348,9 +348,6 @@ def test_remote_storage_upload_queue_retries( # XXX: should vary this test to selectively fail just layer uploads, index uploads, deletions # but how do we validate the result after restore? - # these are always possible when we do an immediate stop. perhaps something with compacting has changed since. - env.pageserver.allowed_errors.append(r".*found future (delta|image) layer.*") - env.pageserver.stop(immediate=True) env.endpoints.stop_all()