From 702382e99a7d71ab5c4e2f4d4e33a0e0dc9e42d8 Mon Sep 17 00:00:00 2001
From: Arseny Sher
Date: Wed, 20 Sep 2023 13:34:44 +0300
Subject: [PATCH] Add check that WAL segments are identical after recovery.

---
 test_runner/fixtures/neon_fixtures.py    | 14 +++++++++++
 test_runner/regress/test_wal_acceptor.py | 32 +++++++++++++++++++++++-
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index a9fe5e376e..2d73972eba 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2758,6 +2758,20 @@ class Safekeeper:
     def data_dir(self) -> str:
         return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}")
 
+    def timeline_dir(self, tenant_id, timeline_id) -> str:
+        return os.path.join(self.data_dir(), str(tenant_id), str(timeline_id))
+
+    def list_segments(self, tenant_id, timeline_id) -> List[str]:
+        """
+        Get list of segment names of the given timeline.
+        """
+        tli_dir = self.timeline_dir(tenant_id, timeline_id)
+        segments = []
+        for _, _, filenames in os.walk(tli_dir):
+            segments.extend([f for f in filenames if f != "safekeeper.control"])
+        segments.sort()
+        return segments
+
 
 @dataclass
 class SafekeeperTimelineStatus:
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 0b97ddf048..fd6baa22f3 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1,3 +1,4 @@
+import filecmp
 import os
 import pathlib
 import random
@@ -1060,7 +1061,36 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder):
         partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id),
         "flush_lsn to get aligned",
     )
-    # stop one of safekeepers which weren't recovering and insert a bit more
+
+    # check that WALs are identic after recovery
+    segs = sk1.list_segments(tenant_id, timeline_id)
+    log.info(f"segs are {segs}")
+
+    (_, mismatch, not_regular) = filecmp.cmpfiles(
+        sk1.timeline_dir(tenant_id, timeline_id),
+        sk2.timeline_dir(tenant_id, timeline_id),
+        segs,
+        shallow=False,
+    )
+    log.info(
+        f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}"
+    )
+
+    for f in mismatch:
+        f1 = os.path.join(sk1.timeline_dir(tenant_id, timeline_id), f)
+        f2 = os.path.join(sk2.timeline_dir(tenant_id, timeline_id), f)
+        stdout_filename = "{}.filediff".format(f2)
+
+        with open(stdout_filename, "w") as stdout_f:
+            subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True)
+            subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True)
+
+            cmd = "diff {}.hex {}.hex".format(f1, f2)
+            subprocess.run([cmd], stdout=stdout_f, shell=True)
+
+    assert (mismatch, not_regular) == ([], [])
+
+    # stop one of safekeepers which weren't recovering and insert a bit more to check we can commit
     env.safekeepers[2].stop()
     endpoint = env.endpoints.create_start("test_peer_recovery")
     endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'")