Fix the test

This commit is contained in:
Heikki Linnakangas
2024-01-20 19:04:27 +02:00
parent 71c7ba756d
commit be8a6bcdb4
3 changed files with 31 additions and 11 deletions

View File

@@ -133,7 +133,7 @@ impl WalIngest {
// was fixed
fn reintroduce_bug_failpoint_activated() -> bool {
fail::fail_point!("reintroduce-nextxid-update-bug", |_| { true });
return false;
false
}
if decoded.xl_xid == pg_constants::INVALID_TRANSACTION_ID
&& reintroduce_bug_failpoint_activated()

View File

@@ -2998,6 +2998,23 @@ class Endpoint(PgProtocol):
):
self.stop()
def log_contains(self, pattern: str) -> Optional[str]:
    """Return the first line of the compute log that matches a regex.

    The log is read from ``compute.log`` under ``self.endpoint_path()``.

    Args:
        pattern: Regular expression applied to each line with ``re.search``,
            so it may match anywhere in the line.

    Returns:
        The first matching line exactly as read (including its trailing
        newline, if any), or ``None`` when no line matches or when the log
        file does not exist. In the latter case a warning is logged.

    Raises:
        re.error: If ``pattern`` is not a valid regular expression.
    """
    logfile = self.endpoint_path() / "compute.log"
    if not logfile.exists():
        log.warning(f"Skipping log check: {logfile} does not exist")
        return None

    contains_re = re.compile(pattern)

    # Decode explicitly and substitute undecodable bytes. With the
    # locale-default encoding and strict error handling, a single stray byte
    # anywhere in the log would abort the search with UnicodeDecodeError
    # before the interesting line is reached.
    with logfile.open("r", encoding="utf-8", errors="replace") as f:
        for line in f:
            if contains_re.search(line):
                return line

    return None
# Checkpoints the running endpoint and returns the pg_wal size in MB.
def get_pg_wal_size(self):
log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}')

View File

@@ -8,6 +8,7 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_wal_insert_lsn
from fixtures.pageserver.utils import (
wait_for_last_record_lsn,
wait_for_upload,
)
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import Lsn, TenantId, TimelineId
@@ -377,18 +378,19 @@ def test_one_off_hack_for_nextxid_bug(
# A checkpoint writes a WAL record with xl_xid=0. Many other WAL
# records would have the same effect.
cur.execute("checkpoint")
cur.execute("INSERT INTO t VALUES ('before fix')")
wait_for_wal_insert_lsn(env, endpoint, tenant, timeline)
# Ok, the nextXid in the pageserver at this LSN should now be incorrectly
# set to 1:1024. Remember this LSN.
broken_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()"))
# Ensure that the broken checkpoint data has reached permanent storage
ps_http.timeline_checkpoint(tenant, timeline)
wait_for_upload(ps_http, tenant, timeline, broken_lsn)
# Now fix the bug, and generate some WAL with XIDs
ps_http.configure_failpoints(("reintroduce-nextxid-update-bug", "off"))
cur.execute("INSERT INTO t VALUES ('after fix')")
fixed_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()"))
wait_for_wal_insert_lsn(env, endpoint, tenant, timeline)
log.info(f"nextXid was broken by {broken_lsn}, and fixed again by {fixed_lsn}")
@@ -399,13 +401,14 @@ def test_one_off_hack_for_nextxid_bug(
env.neon_cli.create_branch(
"at-broken-lsn", branch_name, ancestor_start_lsn=broken_lsn, tenant_id=tenant
)
with pytest.raises(RuntimeError, match="compute startup timed out; still in Init state"):
env.endpoints.create_start(
"at-broken-lsn",
endpoint_id="ep-at-broken-lsn",
tenant_id=tenant,
)
log.error("starting endpoint at broken LSN succeeded unexpectedly")
endpoint_broken = env.endpoints.create(
"at-broken-lsn",
endpoint_id="ep-at-broken-lsn",
tenant_id=tenant,
)
with pytest.raises(RuntimeError, match="Postgres exited unexpectedly with code 1"):
endpoint_broken.start()
assert endpoint_broken.log_contains('Could not open file "pg_xact/0000": No such file or directory')
# But after the bug was fixed, the one-off hack fixed the timeline,
# and a later LSN works.