From 7e55497e131f2f26a16ae22bff80cac11951cdd4 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Thu, 8 May 2025 14:00:45 +0400 Subject: [PATCH] tests: flush wal before waiting for last record lsn (#11726) ## Problem Compute may flush WAL on page boundaries, leaving some records partially flushed for a long time. This leaves `wait_for_last_flush_lsn` stuck waiting for an LSN that points into a partially flushed record. - Closes: https://github.com/neondatabase/cloud/issues/27876 ## Summary of changes - Flush WAL via CHECKPOINT after reading `pg_current_wal_flush_lsn()` to make sure that the record this LSN points into is flushed in full - Use proper endpoint in `test_timeline_detach_with_aux_files_with_detach_v1` --- test_runner/fixtures/neon_fixtures.py | 7 +++++++ test_runner/regress/test_timeline_detach_ancestor.py | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 85ad49bb4f..370eca5130 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -5477,6 +5477,13 @@ def wait_for_last_flush_lsn( if last_flush_lsn is None: last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + # The last_flush_lsn may not correspond to a record boundary. + # For example, if the compute flushed WAL on a page boundary, + # the remaining part of the record might not be flushed for a long time. + # This would prevent the pageserver from reaching last_flush_lsn promptly. + # To ensure the rest of the record reaches the pageserver quickly, + # we forcibly flush the WAL by using CHECKPOINT. 
+ endpoint.safe_psql("CHECKPOINT") results = [] for tenant_shard_id, pageserver in shards: diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index a71652af8a..d42c5d403e 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1822,7 +1822,7 @@ def test_timeline_detach_with_aux_files_with_detach_v1( endpoint2.safe_psql( "SELECT pg_create_logical_replication_slot('test_slot_restore', 'pgoutput')" ) - lsn3 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + lsn3 = wait_for_last_flush_lsn(env, endpoint2, env.initial_tenant, branch_timeline_id) assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([]) assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set( ["pg_replslot/test_slot_restore/state"] @@ -1839,7 +1839,7 @@ def test_timeline_detach_with_aux_files_with_detach_v1( assert all_reparented == set([]) # We need to ensure all safekeeper data are ingested before checking aux files: the API does not wait for LSN. - wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + wait_for_last_flush_lsn(env, endpoint2, env.initial_tenant, branch_timeline_id) assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set( ["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"] ), "main branch unaffected"