Misc test flakyness fixes (#5233)

Assorted flakyness fixes from #5198, might not be flaky on `main`. Migrate some tests using neon_simple_env to just neon_env_builder and using initial_tenant to make flakyness understanding easier. (Did not understand the flakyness of `test_timeline_create_break_after_uninit_mark`.) `test_download_remote_layers_api` is flaky because we have no atomic "wait for WAL, checkpoint, wait for upload and do not receive any more WAL". `test_tenant_size` fixes are just boilerplate which should had always existed; we should wait for the tenant to be active. similarly for `test_timeline_delete`. `test_timeline_size_post_checkpoint` fails often for me with reading zero from metrics. Give it a few attempts.
2026-06-03 21:40:39 +00:00 · 2023-09-11 11:42:49 +03:00
parent 999fe668e7
commit a55a78a453
5 changed files with 34 additions and 12 deletions
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -122,8 +122,8 @@ def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv):
            future.result()


-def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv):
-    env = neon_simple_env
+def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

    env.pageserver.allowed_errors.extend(
@@ -133,7 +133,7 @@ def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv):
        ]
    )

-    tenant_id, _ = env.neon_cli.create_tenant()
+    tenant_id = env.initial_tenant

    timelines_dir = env.pageserver.workdir / "tenants" / str(tenant_id) / "timelines"
    old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
@@ -160,11 +160,11 @@ def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv):
    ), "pageserver should clean its temp timeline files on timeline creation failure"


-def test_timeline_create_break_after_uninit_mark(neon_simple_env: NeonEnv):
-    env = neon_simple_env
+def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

-    tenant_id, _ = env.neon_cli.create_tenant()
+    tenant_id = env.initial_tenant

    timelines_dir = env.pageserver.workdir / "tenants" / str(tenant_id) / "timelines"
    old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -390,9 +390,19 @@ def test_download_remote_layers_api(
    wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active"))

    ###### Phase 1: exercise download error code path
+
+    # comparison here is requiring the size to be at least the previous size, because it's possible received WAL after last_flush_lsn_upload
+    # witnessed for example difference of 29827072 (filled_current_physical) to 29868032 (here) is no good reason to fail a test.
+    this_time = get_api_current_physical_size()
    assert (
-        filled_current_physical == get_api_current_physical_size()
+        filled_current_physical <= this_time
    ), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote"
+    if filled_current_physical != this_time:
+        log.info(
+            f"fixing up filled_current_physical from {filled_current_physical} to {this_time} ({this_time - filled_current_physical})"
+        )
+        filled_current_physical = this_time
+
    post_unlink_size = get_resident_physical_size()
    log.info(f"post_unlink_size: {post_unlink_size}")
    assert (
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -11,7 +11,10 @@ from fixtures.neon_fixtures import (
    wait_for_wal_insert_lsn,
 )
 from fixtures.pageserver.http import PageserverHttpClient
-from fixtures.pageserver.utils import timeline_delete_wait_completed
+from fixtures.pageserver.utils import (
+    timeline_delete_wait_completed,
+    wait_until_tenant_active,
+)
 from fixtures.pg_version import PgVersion, xfail_on_postgres
 from fixtures.types import Lsn, TenantId, TimelineId

@@ -517,6 +520,8 @@ def test_single_branch_get_tenant_size_grows(
    env.pageserver.stop()
    env.pageserver.start()

+    wait_until_tenant_active(http_client, tenant_id)
+
    size_after = http_client.tenant_size(tenant_id)
    size_debug = http_client.tenant_size_debug(tenant_id)
    size_debug_file.write(size_debug)
@@ -624,6 +629,8 @@ def test_get_tenant_size_with_multiple_branches(
    env.pageserver.stop()
    env.pageserver.start()

+    wait_until_tenant_active(http_client, tenant_id)
+
    # chance of compaction and gc on startup might have an effect on the
    # tenant_size but so far this has been reliable, even though at least gc
    # and tenant_size race for the same locks
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -128,6 +128,8 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
    env.pageserver.stop(immediate=True)
    env.pageserver.start()

+    wait_until_tenant_active(ps_http, env.initial_tenant)
+
    with pytest.raises(
        PageserverApiException,
        match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} was not found",
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -367,10 +367,13 @@ def test_timeline_physical_size_post_checkpoint(
    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_timeline_id)
    pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)

-    assert_physical_size_invariants(
-        get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
-        remote_storage_kind,
-    )
+    def check():
+        assert_physical_size_invariants(
+            get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
+            remote_storage_kind,
+        )
+
+    wait_until(10, 1, check)


@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])