tests: try to make restored-datadir comparison tests not flaky v2

This test occasionally fails with a difference in "pg_xact/0000" file
between the local and restored datadirs. My hypothesis is that
something changed in the database between the last explicit checkpoint
and the shutdown. I suspect autovacuum, it could certainly create
transactions.

To fix, be more precise about the point in time that we compare. Shut
down the endpoint first, then read the last LSN (i.e. the shutdown
checkpoint's LSN), from the local disk with pg_controldata. And use
exactly that LSN in the basebackup.

Closes #559
This commit is contained in:
Heikki Linnakangas
2024-02-07 19:58:08 +02:00
committed by Arseny Sher
parent 74d09b78c7
commit 621ea2ec44

View File

@@ -3890,24 +3890,21 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:
# pg is the existing and running compute node, that we want to compare with a basebackup # pg is the existing and running compute node, that we want to compare with a basebackup
def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint): def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint):
pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
# Get the timeline ID. We need it for the 'basebackup' command # Get the timeline ID. We need it for the 'basebackup' command
timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0]) timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])
# many tests already checkpoint, but do it just in case
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("CHECKPOINT")
# wait for pageserver to catch up
wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id)
# stop postgres to ensure that files won't change # stop postgres to ensure that files won't change
endpoint.stop() endpoint.stop()
# Read the shutdown checkpoint's LSN
checkpoint_lsn = pg_bin.get_pg_controldata_checkpoint_lsn(endpoint.pg_data_dir_path())
# Take a basebackup from pageserver # Take a basebackup from pageserver
restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir" restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir"
restored_dir_path.mkdir(exist_ok=True) restored_dir_path.mkdir(exist_ok=True)
pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
psql_path = os.path.join(pg_bin.pg_bin_path, "psql") psql_path = os.path.join(pg_bin.pg_bin_path, "psql")
pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"] pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"]
@@ -3915,7 +3912,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint
{psql_path} \ {psql_path} \
--no-psqlrc \ --no-psqlrc \
postgres://localhost:{env.get_pageserver(pageserver_id).service_port.pg} \ postgres://localhost:{env.get_pageserver(pageserver_id).service_port.pg} \
-c 'basebackup {endpoint.tenant_id} {timeline_id}' \ -c 'basebackup {endpoint.tenant_id} {timeline_id} {checkpoint_lsn}' \
| tar -x -C {restored_dir_path} | tar -x -C {restored_dir_path}
""" """