From a55a78a4535179d0241c49fd8cb3fdd9a0a5f045 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 11 Sep 2023 11:42:49 +0300 Subject: [PATCH] Misc test flakyness fixes (#5233) Assorted flakyness fixes from #5198, might not be flaky on `main`. Migrate some tests using neon_simple_env to just neon_env_builder and using initial_tenant to make flakyness understanding easier. (Did not understand the flakyness of `test_timeline_create_break_after_uninit_mark`.) `test_download_remote_layers_api` is flaky because we have no atomic "wait for WAL, checkpoint, wait for upload and do not receive any more WAL". `test_tenant_size` fixes are just boilerplate which should had always existed; we should wait for the tenant to be active. similarly for `test_timeline_delete`. `test_timeline_size_post_checkpoint` fails often for me with reading zero from metrics. Give it a few attempts. --- test_runner/regress/test_broken_timeline.py | 12 ++++++------ test_runner/regress/test_ondemand_download.py | 12 +++++++++++- test_runner/regress/test_tenant_size.py | 9 ++++++++- test_runner/regress/test_timeline_delete.py | 2 ++ test_runner/regress/test_timeline_size.py | 11 +++++++---- 5 files changed, 34 insertions(+), 12 deletions(-) diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index d0f180c866..c446f865cb 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -122,8 +122,8 @@ def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv): future.result() -def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv): - env = neon_simple_env +def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() env.pageserver.allowed_errors.extend( @@ -133,7 +133,7 @@ def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv): ] ) - tenant_id, _ = env.neon_cli.create_tenant() + tenant_id = env.initial_tenant timelines_dir = env.pageserver.workdir / "tenants" / str(tenant_id) / "timelines" old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) @@ -160,11 +160,11 @@ def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv): ), "pageserver should clean its temp timeline files on timeline creation failure" -def test_timeline_create_break_after_uninit_mark(neon_simple_env: NeonEnv): - env = neon_simple_env +def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() - tenant_id, _ = env.neon_cli.create_tenant() + tenant_id = env.initial_tenant timelines_dir = env.pageserver.workdir / "tenants" / str(tenant_id) / "timelines" old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 259c6412ef..fd9f9d631d 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -390,9 +390,19 @@ def test_download_remote_layers_api( wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active")) ###### Phase 1: exercise download error code path + + # comparison here is requiring the size to be at least the previous size, because it's possible received WAL after last_flush_lsn_upload + # witnessed for example difference of 29827072 (filled_current_physical) to 29868032 (here) is no good reason to fail a test. + this_time = get_api_current_physical_size() assert ( - filled_current_physical == get_api_current_physical_size() + filled_current_physical <= this_time ), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote" + if filled_current_physical != this_time: + log.info( + f"fixing up filled_current_physical from {filled_current_physical} to {this_time} ({this_time - filled_current_physical})" + ) + filled_current_physical = this_time + post_unlink_size = get_resident_physical_size() log.info(f"post_unlink_size: {post_unlink_size}") assert ( diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 25c6634108..49a6ca5a53 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -11,7 +11,10 @@ from fixtures.neon_fixtures import ( wait_for_wal_insert_lsn, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pageserver.utils import timeline_delete_wait_completed +from fixtures.pageserver.utils import ( + timeline_delete_wait_completed, + wait_until_tenant_active, +) from fixtures.pg_version import PgVersion, xfail_on_postgres from fixtures.types import Lsn, TenantId, TimelineId @@ -517,6 +520,8 @@ def test_single_branch_get_tenant_size_grows( env.pageserver.stop() env.pageserver.start() + wait_until_tenant_active(http_client, tenant_id) + size_after = http_client.tenant_size(tenant_id) size_debug = http_client.tenant_size_debug(tenant_id) size_debug_file.write(size_debug) @@ -624,6 +629,8 @@ def test_get_tenant_size_with_multiple_branches( env.pageserver.stop() env.pageserver.start() + wait_until_tenant_active(http_client, tenant_id) + # chance of compaction and gc on startup might have an effect on the # tenant_size but so far this has been reliable, even though at least gc # and tenant_size race for the same locks diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 1e38e0534c..0b79383442 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -128,6 +128,8 @@ def test_timeline_delete(neon_simple_env: NeonEnv): env.pageserver.stop(immediate=True) env.pageserver.start() + wait_until_tenant_active(ps_http, env.initial_tenant) + with pytest.raises( PageserverApiException, match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} was not found", diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index ccdfc18de8..3f379a173c 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -367,10 +367,13 @@ def test_timeline_physical_size_post_checkpoint( wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_timeline_id) pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id) - assert_physical_size_invariants( - get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind), - remote_storage_kind, - ) + def check(): + assert_physical_size_invariants( + get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind), + remote_storage_kind, + ) + + wait_until(10, 1, check) @pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])