From 99726495c79941efdf5ccc695ccb6a6ad046ac7e Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 30 May 2025 13:14:36 +0100 Subject: [PATCH] test: allow list overly eager storcon finalization (#12055) ## Problem I noticed a small percentage of flakes on some import tests. They were all instances of the storage controller being too eager to finalize the import. As a refresher: the import task on the pageserver notifies the storage controller that it is done, and the storage controller then has to call back into the pageserver in order to finalize the import. The pageserver checks that the import task is done before serving that request. Hence, we can get this race: if the finalization request arrives before the import task has exited, the pageserver rejects it and both sides log an error. In practice, this has no impact since the storage controller will simply retry. ## Summary of changes Allow-list the errors produced by such cases in the affected tests. --- test_runner/regress/test_import_pgdata.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index ba60c3caa6..8d4f908cc0 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -287,6 +287,17 @@ def test_pgdata_import_smoke( with pytest.raises(psycopg2.errors.UndefinedTable): br_initdb_endpoint.safe_psql(f"select * from {workload.table}") + # The storage controller might be overly eager and attempt to finalize + # the import before the task got a chance to exit. 
+ env.storage_controller.allowed_errors.extend( + [ + ".*Call to node.*management API.*failed.*Import task still running.*", + ] + ) + + for ps in env.pageservers: + ps.allowed_errors.extend([".*Error processing HTTP request.*Import task not done yet.*"]) + @run_only_on_default_postgres(reason="PG version is irrelevant here") def test_import_completion_on_restart( @@ -471,6 +482,17 @@ def test_import_respects_timeline_lifecycle( else: raise RuntimeError(f"{action} param not recognized") + # The storage controller might be overly eager and attempt to finalize + # the import before the task got a chance to exit. + env.storage_controller.allowed_errors.extend( + [ + ".*Call to node.*management API.*failed.*Import task still running.*", + ] + ) + + for ps in env.pageservers: + ps.allowed_errors.extend([".*Error processing HTTP request.*Import task not done yet.*"]) + @skip_in_debug_build("Validation query takes too long in debug builds") def test_import_chaos(