diff --git a/pageserver/src/tenant_tasks.rs b/pageserver/src/tenant_tasks.rs index a24bdd5812..5a9c5aa3a5 100644 --- a/pageserver/src/tenant_tasks.rs +++ b/pageserver/src/tenant_tasks.rs @@ -72,8 +72,6 @@ async fn compaction_loop(tenant_id: TenantId) { if let Err(e) = tenant.compaction_iteration() { sleep_duration = wait_duration; error!("Compaction failed, retrying in {:?}: {e:#}", sleep_duration); - #[cfg(feature = "testing")] - std::process::abort(); } // Sleep @@ -123,8 +121,6 @@ async fn gc_loop(tenant_id: TenantId) { { sleep_duration = wait_duration; error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration); - #[cfg(feature = "testing")] - std::process::abort(); } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index f68c6a25db..051c140836 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -784,6 +784,8 @@ class NeonEnvBuilder: self.cleanup_remote_storage() + self.env.pageserver.assert_no_errors() + class NeonEnv: """ @@ -1723,6 +1725,43 @@ class NeonPageserver(PgProtocol): self.config_override = config_override self.version = env.get_pageserver_version() + # After a test finishes, we will scrape the log to see if there are any + # unexpected error messages. If your test expects an error, add it to + # 'allowed_errors' in the test with something like: + # + # env.pageserver.allowed_errors.append(".*could not open garage door.*") + # + # The entries in the list are regular expressions. 
+ self.allowed_errors = [ + # All tests print these, when starting up or shutting down + ".*wal receiver task finished with an error: walreceiver connection handling failure.*", + ".*Shutdown task error: walreceiver connection handling failure.*", + ".*Etcd client error: grpc request error: status: Unavailable.*", + ".*query handler for .* failed: Connection reset by peer.*", + ".*serving compute connection task.*exited with error: Broken pipe.*", + ".*Connection aborted: error communicating with the server: Broken pipe.*", + ".*Connection aborted: error communicating with the server: Transport endpoint is not connected.*", + ".*Connection aborted: error communicating with the server: Connection reset by peer.*", + ".*kill_and_wait_impl.*: wait successful.*", + ".*end streaming to Some.*", + # safekeeper connection can fail with this, in the window between timeline creation + # and streaming start + ".*Failed to process query for timeline .*: state uninitialized, no data to read.*", + # Tests related to authentication and authorization print these + ".*Error processing HTTP request: Forbidden", + # intentional failpoints + ".*failpoint ", + # FIXME: there is a race condition between GC and detach, see + # https://github.com/neondatabase/neon/issues/2442 + ".*could not remove ephemeral file.*No such file or directory.*", + # FIXME: These need investigation + ".*gc_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*", + ".*compaction_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*", + ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*", + ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*", + ".*Removing intermediate uninit mark file.*", + ] + def start(self, overrides: Tuple[str, ...] = ()) -> "NeonPageserver": """ Start the page server. 
@@ -1771,6 +1810,26 @@ class NeonPageserver(PgProtocol): is_testing_enabled_or_skip=self.is_testing_enabled_or_skip, ) + def assert_no_errors(self): + logfile = open(os.path.join(self.env.repo_dir, "pageserver.log"), "r") + + error_or_warn = re.compile("ERROR|WARN") + errors = [] + while True: + line = logfile.readline() + if not line: + break + + if error_or_warn.search(line): + # It's an ERROR or WARN. Is it in the allow-list? + for a in self.allowed_errors: + if re.match(a, line): + break + else: + errors.append(line) + + assert not errors + def append_pageserver_param_overrides( params_to_update: List[str], diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index 12debe50eb..fad4b4c79e 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -116,6 +116,13 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): env = neon_simple_env pageserver_http_client = env.pageserver.http_client() + env.pageserver.allowed_errors.extend( + [ + ".*invalid branch start lsn: less than latest GC cutoff.*", + ".*invalid branch start lsn: less than planned GC cutoff.*", + ] + ) + # Disable background GC but set the `pitr_interval` to be small, so GC can delete something tenant, _ = env.neon_cli.create_tenant( conf={ diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index 0e2a8b346b..a841e3ced2 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -13,6 +13,9 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = neon_env_builder.init_start() + env.pageserver.allowed_errors.append(".*invalid branch start lsn.*") + env.pageserver.allowed_errors.append(".*invalid start lsn .* for ancestor timeline.*") + # Branch at the point where only 100 rows were inserted 
env.neon_cli.create_branch("test_branch_behind") pgmain = env.postgres.create_start("test_branch_behind") diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index b747af4d09..cf7f4b8289 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -11,10 +11,17 @@ from fixtures.types import TenantId, TimelineId # Test restarting page server, while safekeeper and compute node keep # running. def test_broken_timeline(neon_env_builder: NeonEnvBuilder): - # One safekeeper is enough for this test. - neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend( + [ + ".*No timelines to attach received.*", + ".*Failed to process timeline dir contents.*", + ".*Failed to load delta layer.*", + ".*Timeline .* was not found.*", + ] + ) + tenant_timelines: List[Tuple[TenantId, TimelineId, Postgres]] = [] for n in range(4): @@ -111,6 +118,13 @@ def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv): env = neon_simple_env pageserver_http = env.pageserver.http_client() + env.pageserver.allowed_errors.extend( + [ + ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*", + ".*Timeline got dropped without initializing, cleaning its files.*", + ] + ) + tenant_id, _ = env.neon_cli.create_tenant() timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 306aa84040..9ad8cd393f 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -50,6 +50,12 @@ def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_o env = neon_env_builder.init_start() pg = env.postgres.create_start("main") + + # FIXME: Is this expected? 
+ env.pageserver.allowed_errors.append( + ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" + ) + pg_bin.run(["pgbench", "--initialize", "--scale=10", pg.connstr()]) pg_bin.run(["pgbench", "--time=60", "--progress=2", pg.connstr()]) pg_bin.run(["pg_dumpall", f"--dbname={pg.connstr()}", f"--file={test_output_dir / 'dump.sql'}"]) diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py index 22b77d2cf1..7fe77a7e85 100644 --- a/test_runner/regress/test_gc_cutoff.py +++ b/test_runner/regress/test_gc_cutoff.py @@ -9,6 +9,11 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin # test anyway, so it doesn't need any special attention here. def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): env = neon_env_builder.init_start() + + # These warnings are expected, when the pageserver is restarted abruptly + env.pageserver.allowed_errors.append(".*found future image layer.*") + env.pageserver.allowed_errors.append(".*found future delta layer.*") + pageserver_http = env.pageserver.http_client() # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index ced5e18406..fbc893f312 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -76,6 +76,26 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build env = neon_env_builder.init_start() env.pageserver.http_client().tenant_create(tenant) + env.pageserver.allowed_errors.extend( + [ + ".*error importing base backup .*", + ".*Timeline got dropped without initializing, cleaning its files.*", + ".*Removing intermediate uninit mark file.*", + ".*InternalServerError.*timeline not found.*", + ".*InternalServerError.*Tenant .* not found.*", + ".*InternalServerError.*Timeline .* not found.*", + ".*InternalServerError.*Cannot delete timeline which has child 
timelines.*", + ] + ) + + # FIXME: we should clean up pageserver to not print this + env.pageserver.allowed_errors.append(".*exited with error: unexpected message type: CopyData.*") + + # FIXME: Is this expected? + env.pageserver.allowed_errors.append( + ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" + ) + def import_tar(base, wal): env.neon_cli.raw_cli( [ @@ -122,6 +142,11 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu neon_env_builder.enable_local_fs_remote_storage() env = neon_env_builder.init_start() + # FIXME: Is this expected? + env.pageserver.allowed_errors.append( + ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" + ) + timeline = env.neon_cli.create_branch("test_import_from_pageserver_small") pg = env.postgres.create_start("test_import_from_pageserver_small") diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index eac5e6e61d..ad06634ae9 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -67,6 +67,10 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() + # These warnings are expected, when the pageserver is restarted abruptly + env.pageserver.allowed_errors.append(".*found future image layer.*") + env.pageserver.allowed_errors.append(".*found future delta layer.*") + # Use a tiny checkpoint distance, to create a lot of layers quickly. # That allows us to stress the compaction and layer flushing logic more. 
tenant, _ = env.neon_cli.create_tenant( diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index beaae0351b..1e49c3b69f 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -143,6 +143,8 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_read_validation_neg", "empty") + env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*") + pg = env.postgres.create_start("test_read_validation_neg") log.info("postgres is running on 'test_read_validation_neg' branch") diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index dfa57aec25..62c3ead0a7 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -17,6 +17,8 @@ def test_readonly_node(neon_simple_env: NeonEnv): pgmain = env.postgres.create_start("test_readonly_node") log.info("postgres is running on 'test_readonly_node' branch") + env.pageserver.allowed_errors.append(".*basebackup .* failed: invalid basebackup lsn.*") + main_pg_conn = pgmain.connect() main_cur = main_pg_conn.cursor() diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index e70b1351ba..1e93958e98 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -17,6 +17,10 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): neon_env_builder.start() + # These warnings are expected, when the pageserver is restarted abruptly + env.pageserver.allowed_errors.append(".*found future delta layer.*") + env.pageserver.allowed_errors.append(".*found future image layer.*") + # Create a branch for us env.neon_cli.create_branch("test_pageserver_recovery", "main") diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 4fb5a5406d..ecca496c7c 100644 --- 
a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -56,6 +56,17 @@ def test_remote_storage_backup_and_restore( ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() + + # FIXME: Is this expected? + env.pageserver.allowed_errors.append( + ".*marking .* as locally complete, while it doesnt exist in remote index.*" + ) + env.pageserver.allowed_errors.append(".*No timelines to attach received.*") + + env.pageserver.allowed_errors.append(".*Tenant download is already in progress.*") + env.pageserver.allowed_errors.append(".*Failed to get local tenant state.*") + env.pageserver.allowed_errors.append(".*No metadata file found in the timeline directory.*") + pageserver_http = env.pageserver.http_client() pg = env.postgres.create_start("main") diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index dc4cd2e37e..f66bacc4f7 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -20,6 +20,8 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() + env.pageserver.allowed_errors.append(".*NotFound\\(Tenant .* not found in the local state") + # first check for non existing tenant tenant_id = TenantId.generate() with pytest.raises( @@ -28,6 +30,9 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): ): pageserver_http.tenant_detach(tenant_id) + # the error will be printed to the log too + env.pageserver.allowed_errors.append(".*Tenant not found for id.*") + # create new nenant tenant_id, timeline_id = env.neon_cli.create_tenant() @@ -50,6 +55,9 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): bogus_timeline_id = TimelineId.generate() pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0) + # the error will be printed to the log too + 
env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*") + # try to concurrently run gc and detach gc_thread = Thread(target=lambda: do_gc_target(pageserver_http, tenant_id, timeline_id)) gc_thread.start() diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index aec45307f7..c4b3b28f34 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -259,6 +259,11 @@ def test_tenant_relocation( env = neon_env_builder.init_start() + # FIXME: Is this expected? + env.pageserver.allowed_errors.append( + ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" + ) + # create folder for remote storage mock remote_storage_mock_path = env.repo_dir / "local_fs_remote_storage" diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 4ffea60950..6d153b42bc 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -25,6 +25,13 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): ) initial_tenant_dirs = [d for d in tenants_dir.iterdir()] + neon_simple_env.pageserver.allowed_errors.extend( + [ + ".*Failed to create directory structure for tenant .*, cleaning tmp data.*", + ".*Failed to fsync removed temporary tenant directory .*", + ] + ) + pageserver_http = neon_simple_env.pageserver.http_client() pageserver_http.configure_failpoints(("tenant-creation-before-tmp-rename", "return")) with pytest.raises(Exception, match="tenant-creation-before-tmp-rename"): @@ -206,6 +213,13 @@ def test_pageserver_with_empty_tenants( ) env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.append( + ".*marking .* as locally complete, while it doesnt exist in remote index.*" + ) + env.pageserver.allowed_errors.append(".*Tenant .* has no timelines directory.*") + env.pageserver.allowed_errors.append(".*No timelines to attach received.*") + client = 
env.pageserver.http_client() tenant_without_timelines_dir = env.initial_tenant diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 9a4cbe135b..8fd28cf53e 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -66,6 +66,11 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem env = neon_env_builder.init_start() + # FIXME: Is this expected? + env.pageserver.allowed_errors.append( + ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" + ) + tenants_pgs: List[Tuple[TenantId, Postgres]] = [] for _ in range(1, 5): @@ -117,6 +122,13 @@ def test_tenants_attached_after_download( ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() + + # FIXME: Are these expected? + env.pageserver.allowed_errors.append(".*No timelines to attach received.*") + env.pageserver.allowed_errors.append( + ".*marking .* as locally complete, while it doesnt exist in remote index.*" + ) + pageserver_http = env.pageserver.http_client() pg = env.postgres.create_start("main") @@ -209,6 +221,16 @@ def test_tenant_upgrades_index_json_from_v0( # launch pageserver, populate the default tenants timeline, wait for it to be uploaded, # then go ahead and modify the "remote" version as if it was downgraded, needing upgrade env = neon_env_builder.init_start() + + # FIXME: Are these expected? 
+ env.pageserver.allowed_errors.append( + ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" + ) + env.pageserver.allowed_errors.append(".*No timelines to attach received.*") + env.pageserver.allowed_errors.append( + ".*Failed to get local tenant state: Tenant .* not found in the local state.*" + ) + pageserver_http = env.pageserver.http_client() pg = env.postgres.create_start("main") @@ -315,6 +337,20 @@ def test_tenant_redownloads_truncated_file_on_startup( ) env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.append( + ".*Redownloading locally existing .* due to size mismatch.*" + ) + env.pageserver.allowed_errors.append( + ".*Downloaded layer exists already but layer file metadata mismatches.*" + ) + + # FIXME: Are these expected? + env.pageserver.allowed_errors.append( + ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" + ) + env.pageserver.allowed_errors.append(".*No timelines to attach received.*") + pageserver_http = env.pageserver.http_client() pg = env.postgres.create_start("main") diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 450f7f2381..d8f9ef2f89 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -7,6 +7,11 @@ from fixtures.utils import wait_until def test_timeline_delete(neon_simple_env: NeonEnv): env = neon_simple_env + env.pageserver.allowed_errors.append(".*Timeline .* was not found.*") + env.pageserver.allowed_errors.append(".*timeline not found.*") + env.pageserver.allowed_errors.append(".*Cannot delete timeline which has child timelines.*") + env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*") + ps_http = env.pageserver.http_client() # first try to delete non existing timeline diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 
2b43f46fd3..3945376e5e 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -263,6 +263,12 @@ def test_broker(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_broker", "main") + + # FIXME: Is this expected? + env.pageserver.allowed_errors.append( + ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" + ) + pg = env.postgres.create_start("test_broker") pg.safe_psql("CREATE TABLE t(key int primary key, value text)") @@ -306,6 +312,11 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled env = neon_env_builder.init_start() + # FIXME: Is this expected? + env.pageserver.allowed_errors.append( + ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" + ) + env.neon_cli.create_branch("test_safekeepers_wal_removal") pg = env.postgres.create_start("test_safekeepers_wal_removal") @@ -1081,6 +1092,14 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled env = neon_env_builder.init_start() + # FIXME: are these expected? + env.pageserver.allowed_errors.extend( + [ + ".*Failed to process query for timeline .*: Timeline .* was not found in global map.*", + ".*end streaming to Some.*", + ] + ) + # Create two tenants: one will be deleted, other should be preserved. 
tenant_id = env.initial_tenant timeline_id_1 = env.neon_cli.create_branch("br1") # Active, delete explicitly diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index c79aea35da..aaaa8893a5 100644 --- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -22,6 +22,8 @@ def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_prese # as a zombie process. def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() + # We intentionally test for a non-existent tenant. + env.pageserver.allowed_errors.append(".*Tenant not found.*") pageserver_http = env.pageserver.http_client() pagserver_pid = int((env.repo_dir / "pageserver.pid").read_text())