From be8a6bcdb4bb96699b24a541a4487c215a507778 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 20 Jan 2024 19:04:27 +0200 Subject: [PATCH] Fix the test --- pageserver/src/walingest.rs | 2 +- test_runner/fixtures/neon_fixtures.py | 17 +++++++++++++++++ test_runner/regress/test_next_xid.py | 23 +++++++++++++---------- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 83d41187d4..3a27d6effe 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -133,7 +133,7 @@ impl WalIngest { // was fixed fn reintroduce_bug_failpoint_activated() -> bool { fail::fail_point!("reintroduce-nextxid-update-bug", |_| { true }); - return false; + false } if decoded.xl_xid == pg_constants::INVALID_TRANSACTION_ID && reintroduce_bug_failpoint_activated() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d98aedf4d0..bda0145a2c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2998,6 +2998,23 @@ class Endpoint(PgProtocol): ): self.stop() + def log_contains(self, pattern: str) -> Optional[str]: + """Check that the compute log contains a line that matches the given regex""" + logfile = self.endpoint_path() / "compute.log" + if not logfile.exists(): + log.warning(f"Skipping log check: {logfile} does not exist") + return None + + contains_re = re.compile(pattern) + + with logfile.open("r") as f: + for line in f: + if contains_re.search(line): + # found it! + return line + + return None + # Checkpoints running endpoint and returns pg_wal size in MB. def get_pg_wal_size(self): log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}') diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index b7388f8f6e..681216063c 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -8,6 +8,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_wal_insert_lsn from fixtures.pageserver.utils import ( wait_for_last_record_lsn, + wait_for_upload, ) from fixtures.remote_storage import RemoteStorageKind from fixtures.types import Lsn, TenantId, TimelineId @@ -377,18 +378,19 @@ def test_one_off_hack_for_nextxid_bug( # A checkpoint writes a WAL record with xl_xid=0. Many other WAL # records would have the same effect. cur.execute("checkpoint") - cur.execute("INSERT INTO t VALUES ('before fix')") - wait_for_wal_insert_lsn(env, endpoint, tenant, timeline) # Ok, the nextXid in the pageserver at this LSN should now be incorrectly # set to 1:1024. Remember this LSN. broken_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) + # Ensure that the broken checkpoint data has reached permanent storage + ps_http.timeline_checkpoint(tenant, timeline) + wait_for_upload(ps_http, tenant, timeline, broken_lsn) + # Now fix the bug, and generate some WAL with XIDs ps_http.configure_failpoints(("reintroduce-nextxid-update-bug", "off")) cur.execute("INSERT INTO t VALUES ('after fix')") fixed_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) - wait_for_wal_insert_lsn(env, endpoint, tenant, timeline) log.info(f"nextXid was broken by {broken_lsn}, and fixed again by {fixed_lsn}") @@ -399,13 +401,14 @@ def test_one_off_hack_for_nextxid_bug( env.neon_cli.create_branch( "at-broken-lsn", branch_name, ancestor_start_lsn=broken_lsn, tenant_id=tenant ) - with pytest.raises(RuntimeError, match="compute startup timed out; still in Init state"): - env.endpoints.create_start( - "at-broken-lsn", - endpoint_id="ep-at-broken-lsn", - tenant_id=tenant, - ) - log.error("starting endpoint at broken LSN succeeded unexpectedly") + endpoint_broken = env.endpoints.create( + "at-broken-lsn", + endpoint_id="ep-at-broken-lsn", + tenant_id=tenant, + ) + with pytest.raises(RuntimeError, match="Postgres exited unexpectedly with code 1"): + endpoint_broken.start() + assert endpoint_broken.log_contains('Could not open file "pg_xact/0000": No such file or directory') # But after the bug was fixed, the one-off hack fixed the timeline, # and a later LSN works.