mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-03 21:40:39 +00:00
Misc test flakiness fixes (#5233)
Assorted flakiness fixes from #5198; these tests might not be flaky on `main`. Migrate some tests from neon_simple_env to plain neon_env_builder, and use initial_tenant, to make the flakiness easier to understand. (Did not understand the flakiness of `test_timeline_create_break_after_uninit_mark`.) `test_download_remote_layers_api` is flaky because we have no atomic "wait for WAL, checkpoint, wait for upload and do not receive any more WAL" operation. The `test_tenant_size` fixes are just boilerplate which should have always existed: we should wait for the tenant to be active. Similarly for `test_timeline_delete`. `test_timeline_physical_size_post_checkpoint` often fails for me with reading zero from metrics. Give it a few attempts.
This commit is contained in:
@@ -122,8 +122,8 @@ def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv):
|
||||
future.result()
|
||||
|
||||
|
||||
def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
@@ -133,7 +133,7 @@ def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv):
|
||||
]
|
||||
)
|
||||
|
||||
tenant_id, _ = env.neon_cli.create_tenant()
|
||||
tenant_id = env.initial_tenant
|
||||
|
||||
timelines_dir = env.pageserver.workdir / "tenants" / str(tenant_id) / "timelines"
|
||||
old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
|
||||
@@ -160,11 +160,11 @@ def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv):
|
||||
), "pageserver should clean its temp timeline files on timeline creation failure"
|
||||
|
||||
|
||||
def test_timeline_create_break_after_uninit_mark(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
tenant_id, _ = env.neon_cli.create_tenant()
|
||||
tenant_id = env.initial_tenant
|
||||
|
||||
timelines_dir = env.pageserver.workdir / "tenants" / str(tenant_id) / "timelines"
|
||||
old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
|
||||
|
||||
@@ -390,9 +390,19 @@ def test_download_remote_layers_api(
|
||||
wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active"))
|
||||
|
||||
###### Phase 1: exercise download error code path
|
||||
|
||||
# the comparison here only requires the size to be at least the previous size, because it's possible that more WAL was received after last_flush_lsn_upload
|
||||
# for example, a witnessed difference of 29827072 (filled_current_physical) vs. 29868032 (here) is no good reason to fail a test.
|
||||
this_time = get_api_current_physical_size()
|
||||
assert (
|
||||
filled_current_physical == get_api_current_physical_size()
|
||||
filled_current_physical <= this_time
|
||||
), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote"
|
||||
if filled_current_physical != this_time:
|
||||
log.info(
|
||||
f"fixing up filled_current_physical from {filled_current_physical} to {this_time} ({this_time - filled_current_physical})"
|
||||
)
|
||||
filled_current_physical = this_time
|
||||
|
||||
post_unlink_size = get_resident_physical_size()
|
||||
log.info(f"post_unlink_size: {post_unlink_size}")
|
||||
assert (
|
||||
|
||||
@@ -11,7 +11,10 @@ from fixtures.neon_fixtures import (
|
||||
wait_for_wal_insert_lsn,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.pageserver.utils import timeline_delete_wait_completed
|
||||
from fixtures.pageserver.utils import (
|
||||
timeline_delete_wait_completed,
|
||||
wait_until_tenant_active,
|
||||
)
|
||||
from fixtures.pg_version import PgVersion, xfail_on_postgres
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
|
||||
@@ -517,6 +520,8 @@ def test_single_branch_get_tenant_size_grows(
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
|
||||
wait_until_tenant_active(http_client, tenant_id)
|
||||
|
||||
size_after = http_client.tenant_size(tenant_id)
|
||||
size_debug = http_client.tenant_size_debug(tenant_id)
|
||||
size_debug_file.write(size_debug)
|
||||
@@ -624,6 +629,8 @@ def test_get_tenant_size_with_multiple_branches(
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
|
||||
wait_until_tenant_active(http_client, tenant_id)
|
||||
|
||||
# chance of compaction and gc on startup might have an effect on the
|
||||
# tenant_size but so far this has been reliable, even though at least gc
|
||||
# and tenant_size race for the same locks
|
||||
|
||||
@@ -128,6 +128,8 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
|
||||
env.pageserver.stop(immediate=True)
|
||||
env.pageserver.start()
|
||||
|
||||
wait_until_tenant_active(ps_http, env.initial_tenant)
|
||||
|
||||
with pytest.raises(
|
||||
PageserverApiException,
|
||||
match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} was not found",
|
||||
|
||||
@@ -367,10 +367,13 @@ def test_timeline_physical_size_post_checkpoint(
|
||||
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_timeline_id)
|
||||
pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
|
||||
|
||||
assert_physical_size_invariants(
|
||||
get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
|
||||
remote_storage_kind,
|
||||
)
|
||||
def check():
|
||||
assert_physical_size_invariants(
|
||||
get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
|
||||
remote_storage_kind,
|
||||
)
|
||||
|
||||
wait_until(10, 1, check)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
|
||||
|
||||
Reference in New Issue
Block a user