Misc test flakyness fixes (#5233)

Assorted flakyness fixes from #5198, might not be flaky on `main`.

Migrate some tests using neon_simple_env to just neon_env_builder and
using initial_tenant to make flakyness understanding easier. (Did not
understand the flakyness of
`test_timeline_create_break_after_uninit_mark`.)

`test_download_remote_layers_api` is flaky because we have no atomic
"wait for WAL, checkpoint, wait for upload and do not receive any more
WAL".

`test_tenant_size` fixes are just boilerplate which should had always
existed; we should wait for the tenant to be active. similarly for
`test_timeline_delete`.

`test_timeline_size_post_checkpoint` fails often for me with reading
zero from metrics. Give it a few attempts.
This commit is contained in:
Joonas Koivunen
2023-09-11 11:42:49 +03:00
committed by GitHub
parent 999fe668e7
commit a55a78a453
5 changed files with 34 additions and 12 deletions

View File

@@ -122,8 +122,8 @@ def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv):
future.result()
def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv):
env = neon_simple_env
def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
env.pageserver.allowed_errors.extend(
@@ -133,7 +133,7 @@ def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv):
]
)
tenant_id, _ = env.neon_cli.create_tenant()
tenant_id = env.initial_tenant
timelines_dir = env.pageserver.workdir / "tenants" / str(tenant_id) / "timelines"
old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
@@ -160,11 +160,11 @@ def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv):
), "pageserver should clean its temp timeline files on timeline creation failure"
def test_timeline_create_break_after_uninit_mark(neon_simple_env: NeonEnv):
env = neon_simple_env
def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
tenant_id, _ = env.neon_cli.create_tenant()
tenant_id = env.initial_tenant
timelines_dir = env.pageserver.workdir / "tenants" / str(tenant_id) / "timelines"
old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)

View File

@@ -390,9 +390,19 @@ def test_download_remote_layers_api(
wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active"))
###### Phase 1: exercise download error code path
# comparison here is requiring the size to be at least the previous size, because it's possible received WAL after last_flush_lsn_upload
# witnessed for example difference of 29827072 (filled_current_physical) to 29868032 (here) is no good reason to fail a test.
this_time = get_api_current_physical_size()
assert (
filled_current_physical == get_api_current_physical_size()
filled_current_physical <= this_time
), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote"
if filled_current_physical != this_time:
log.info(
f"fixing up filled_current_physical from {filled_current_physical} to {this_time} ({this_time - filled_current_physical})"
)
filled_current_physical = this_time
post_unlink_size = get_resident_physical_size()
log.info(f"post_unlink_size: {post_unlink_size}")
assert (

View File

@@ -11,7 +11,10 @@ from fixtures.neon_fixtures import (
wait_for_wal_insert_lsn,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import timeline_delete_wait_completed
from fixtures.pageserver.utils import (
timeline_delete_wait_completed,
wait_until_tenant_active,
)
from fixtures.pg_version import PgVersion, xfail_on_postgres
from fixtures.types import Lsn, TenantId, TimelineId
@@ -517,6 +520,8 @@ def test_single_branch_get_tenant_size_grows(
env.pageserver.stop()
env.pageserver.start()
wait_until_tenant_active(http_client, tenant_id)
size_after = http_client.tenant_size(tenant_id)
size_debug = http_client.tenant_size_debug(tenant_id)
size_debug_file.write(size_debug)
@@ -624,6 +629,8 @@ def test_get_tenant_size_with_multiple_branches(
env.pageserver.stop()
env.pageserver.start()
wait_until_tenant_active(http_client, tenant_id)
# chance of compaction and gc on startup might have an effect on the
# tenant_size but so far this has been reliable, even though at least gc
# and tenant_size race for the same locks

View File

@@ -128,6 +128,8 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
env.pageserver.stop(immediate=True)
env.pageserver.start()
wait_until_tenant_active(ps_http, env.initial_tenant)
with pytest.raises(
PageserverApiException,
match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} was not found",

View File

@@ -367,10 +367,13 @@ def test_timeline_physical_size_post_checkpoint(
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_timeline_id)
pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
assert_physical_size_invariants(
get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
remote_storage_kind,
)
def check():
assert_physical_size_invariants(
get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
remote_storage_kind,
)
wait_until(10, 1, check)
@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])