From 4ee11d9dfcc97a71e07db73cdea70f8d39eb0d48 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Sat, 20 Jan 2024 17:58:35 +0200
Subject: [PATCH] Retroactively fix the nextXid on a known broken timeline

One particular timeline in production hit the nextXid bug. Add a one-off
hack that fixes the nextXid on that particular timeline.
---
 pageserver/src/walingest.rs          |  53 ++++++-
 test_runner/regress/test_next_xid.py | 210 ++++++++++++++++++++++++++-
 2 files changed, 258 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 5a6f9a590f..af97905202 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -26,8 +26,11 @@ use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
 use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
 use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
 
+use std::str::FromStr;
+
 use anyhow::{bail, Context, Result};
 use bytes::{Buf, Bytes, BytesMut};
+use hex::FromHex;
 use tracing::*;
 use utils::failpoint_support;
 
@@ -44,10 +47,11 @@ use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment;
 use postgres_ffi::v14::xlog_utils::*;
-use postgres_ffi::v14::CheckPoint;
+use postgres_ffi::v14::{CheckPoint, bindings::FullTransactionId};
 use postgres_ffi::TransactionId;
 use postgres_ffi::BLCKSZ;
 use utils::lsn::Lsn;
+use utils::id::{TenantId, TimelineId};
 
 pub struct WalIngest {
     shard: ShardIdentity,
@@ -109,6 +113,53 @@ impl WalIngest {
             self.checkpoint_modified = true;
         }
 
+        // BEGIN ONE-OFF HACK
+        //
+        // We had a bug where we incorrectly passed 0 to update_next_xid(). That was
+        // harmless as long as nextXid was < 2^31, because 0 looked like a very old
+        // XID. But once nextXid reached 2^31, 0 started to look like a very new XID,
+        // and we incorrectly bumped up nextXid to the next epoch, to value '1:1024'.
+        //
+        // We have one known timeline in production where that happened. This is a
+        // one-off fix for that damage. The last WAL record on that timeline as of
+        // this writing is this:
+        //
+        // rmgr: Standby len (rec/tot): 50/ 50, tx: 0, lsn: 35A/E32D86D8, prev 35A/E32D86B0, desc: RUNNING_XACTS nextXid 2325447052 latestCompletedXid 2325447051 oldestRunningXid 2325447052
+        //
+        // So on that particular timeline, before that LSN, fix the incorrectly set
+        // nextXid to the nextXid value from that record, plus 1000 to give some
+        // safety margin.
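+        //
+        // Roughly why the bogus value is '1:1024': update_next_xid() rounds its
+        // argument up to the next XID_CHECKPOINT_INTERVAL boundary (1024) and
+        // compares that against the old nextXid with 32-bit circular arithmetic.
+        // With an argument of 0 the candidate becomes 1024; once the old nextXid is
+        // >= 2^31, 1024 compares as both "newer" and "wrapped around", so the epoch
+        // was incremented, yielding epoch 1, XID 1024, i.e. 1 * 2^32 + 1024 =
+        // 4294968320, the value tested for below.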
+
+        // For testing this hack, this failpoint temporarily re-introduces the bug
+        // that was fixed.
+        fn reintroduce_bug_failpoint_activated() -> bool {
+            fail::fail_point!("reintroduce-nextxid-update-bug", |_| { true });
+            false
+        }
+        if decoded.xl_xid == pg_constants::INVALID_TRANSACTION_ID
+            && reintroduce_bug_failpoint_activated()
+            && self.checkpoint.update_next_xid(decoded.xl_xid)
+        {
+            info!(
+                "failpoint: Incorrectly updated nextXid at LSN {} to {}",
+                lsn, self.checkpoint.nextXid.value
+            );
+            self.checkpoint_modified = true;
+        }
+
+        if self.checkpoint.nextXid.value == 4294968320 && // 1:1024, the incorrect value
+            modification.tline.tenant_shard_id.tenant_id == TenantId::from_hex("df254570a4f603805528b46b0d45a76c").unwrap() &&
+            modification.tline.timeline_id == TimelineId::from_hex("e592ec1cc0e5a41f69060d0c9efb07ed").unwrap() &&
+            lsn < Lsn::from_str("35A/E32D9000").unwrap() &&
+            !reintroduce_bug_failpoint_activated()
+        {
+            // This is the nextXid value from the last RUNNING_XACTS record at the
+            // end of the WAL as of this writing.
+            self.checkpoint.nextXid = FullTransactionId {
+                value: 2325447052 + 1000,
+            };
+            self.checkpoint_modified = true;
+            warn!("nextXid fixed by one-off hack at LSN {}", lsn);
+        }
+        // END ONE-OFF HACK
+
         match decoded.xl_rmid {
             pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
                 // Heap AM records need some special handling, because they modify VM pages
diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py
index da2580dbf9..b5d109f45e 100644
--- a/test_runner/regress/test_next_xid.py
+++ b/test_runner/regress/test_next_xid.py
@@ -1,5 +1,6 @@
 import json
 import os
+import pytest
 import time
 from pathlib import Path
 
@@ -98,7 +99,7 @@
     vanilla_pg.safe_psql("CREATE TABLE t (t text);")
     vanilla_pg.safe_psql("INSERT INTO t VALUES ('inserted in vanilla')")
 
-    endpoint_id = "ep-import_from_vanilla"
+    branch_name = "import_from_vanilla"
     tenant = TenantId.generate()
     timeline = TimelineId.generate()
 
@@ -138,7 +139,7 @@
                 "--timeline-id",
                 str(timeline),
                 "--node-name",
-                endpoint_id,
+                branch_name,
                 "--base-lsn",
                 start_lsn,
                 "--base-tarfile",
@@ -157,7 +158,8 @@
     wait_for_last_record_lsn(ps_http, tenant, timeline, Lsn(end_lsn))
 
     endpoint = env.endpoints.create_start(
-        endpoint_id,
+        branch_name,
+        endpoint_id="ep-import_from_vanilla",
         tenant_id=tenant,
         config_lines=[
             "log_autovacuum_min_duration = 0",
@@ -166,7 +168,6 @@
     )
     assert endpoint.safe_psql("select count(*) from t") == [(1,)]
 
-    # Ok, consume
     conn = endpoint.connect()
     cur = conn.cursor()
 
@@ -218,3 +219,204 @@
     cur = conn.cursor()
     cur.execute("SELECT count(*) from t")
     assert cur.fetchone() == (10000 + 1,)
+
+
+# This is a follow-up to the test_import_at_2bil test.
+#
+# Use a failpoint to re-introduce the bug that test_import_at_2bil also
+# tests. Then, after the damage has been done, clear the failpoint to
+# fix the bug. Check that the one-off hack that we added for a particular
+# timeline that hit this in production fixes the broken timeline.
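+#
+# Outline of the test below: import a vanilla cluster under the hard-coded
+# tenant/timeline IDs that the hack targets, at an LSN just below the hack's
+# cutoff. With the failpoint enabled, burn XIDs past 2^31 so that nextXid gets
+# corrupted to 1:1024. Then disable the failpoint, generate a little more WAL,
+# and check that an endpoint at the broken LSN fails to start, while an
+# endpoint at a later LSN (after the hack has repaired nextXid) starts and
+# sees the data.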
+def test_one_off_hack_for_nextxid_bug(
+    neon_env_builder: NeonEnvBuilder,
+    test_output_dir: Path,
+    pg_distrib_dir: Path,
+    pg_bin,
+    vanilla_pg,
+):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+    env = neon_env_builder.init_start()
+    ps_http = env.pageserver.http_client()
+
+    # We begin with the old bug still present, to create a broken timeline.
+    ps_http.configure_failpoints(("reintroduce-nextxid-update-bug", "return(true)"))
+
+    # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
+    # PgBin sets it automatically, but here we need to pipe psql output to the tar command.
+    psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
+
+    # Reset the vanilla Postgres instance to somewhat before 2 billion transactions,
+    # and to around the same LSN as on the broken production timeline.
+    pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal")
+    cmd = [
+        pg_resetwal_path,
+        "--next-transaction-id=2129920000",
+        "-l",
+        "000000010000035A000000E0",
+        "-D",
+        str(vanilla_pg.pgdatadir),
+    ]
+    pg_bin.run_capture(cmd, env=psql_env)
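+
+    # (Note on the values above: 2129920000 is roughly 17.5 million XIDs short of
+    # 2^31, and WAL segment 000000010000035A000000E0 starts at LSN 35A/E0000000,
+    # about 50 MB below the hack's cutoff of 35A/E32D9000, so the WAL generated by
+    # this test should stay within the range that the one-off hack covers.)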
+
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")
+    vanilla_pg.safe_psql(
+        """create table tt as select 'long string to consume some space' || g
+        from generate_series(1,300000) g"""
+    )
+    assert vanilla_pg.safe_psql("select count(*) from tt") == [(300000,)]
+    vanilla_pg.safe_psql("CREATE TABLE t (t text);")
+    vanilla_pg.safe_psql("INSERT INTO t VALUES ('inserted in vanilla')")
+
+    branch_name = "import_from_vanilla"
+    # This is the tenant/timeline that the one-off hack targets.
+    tenant = TenantId("df254570a4f603805528b46b0d45a76c")
+    timeline = TimelineId("e592ec1cc0e5a41f69060d0c9efb07ed")
+
+    env.pageserver.tenant_create(tenant)
+
+    # Take basebackup
+    basebackup_dir = os.path.join(test_output_dir, "basebackup")
+    base_tar = os.path.join(basebackup_dir, "base.tar")
+    wal_tar = os.path.join(basebackup_dir, "pg_wal.tar")
+    os.mkdir(basebackup_dir)
+    vanilla_pg.safe_psql("CHECKPOINT")
+    pg_bin.run(
+        [
+            "pg_basebackup",
+            "-F",
+            "tar",
+            "-d",
+            vanilla_pg.connstr(),
+            "-D",
+            basebackup_dir,
+        ]
+    )
+
+    # Get start_lsn and end_lsn
+    with open(os.path.join(basebackup_dir, "backup_manifest")) as f:
+        manifest = json.load(f)
+        start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"]
+        end_lsn = manifest["WAL-Ranges"][0]["End-LSN"]
+
+    def import_tar(base, wal):
+        env.neon_cli.raw_cli(
+            [
+                "timeline",
+                "import",
+                "--tenant-id",
+                str(tenant),
+                "--timeline-id",
+                str(timeline),
+                "--node-name",
+                branch_name,
+                "--base-lsn",
+                start_lsn,
+                "--base-tarfile",
+                base,
+                "--end-lsn",
+                end_lsn,
+                "--wal-tarfile",
+                wal,
+                "--pg-version",
+                env.pg_version,
+            ]
+        )
+
+    # Importing the backup works
+    import_tar(base_tar, wal_tar)
+    wait_for_last_record_lsn(ps_http, tenant, timeline, Lsn(end_lsn))
+
+    endpoint = env.endpoints.create_start(
+        branch_name,
+        endpoint_id="ep-import_from_vanilla",
+        tenant_id=tenant,
+        config_lines=[
+            "log_autovacuum_min_duration = 0",
+            "autovacuum_naptime='5 s'",
+        ],
+    )
+    assert endpoint.safe_psql("select count(*) from t") == [(1,)]
+
+    conn = endpoint.connect()
+    cur = conn.cursor()
+
+    # Install extension containing function needed for test
+    cur.execute("CREATE EXTENSION neon_test_utils")
+
+    # Advance nextXid close to 2^31
+    while True:
+        xid = int(query_scalar(cur, "SELECT txid_current()"))
+        log.info(f"xid now {xid}")
+        # Consume 50k transactions at a time until we are within 100k of 2^31,
+        # then 5k at a time until we are within 10k of it.
+        if xid < 2 * 1024 * 1024 * 1024 - 100000:
+            cur.execute("select test_consume_xids(50000);")
+        elif xid < 2 * 1024 * 1024 * 1024 - 10000:
+            cur.execute("select test_consume_xids(5000);")
+        else:
+            break
+
+    # Run a bunch of real INSERTs to cross over the 2^31 mark.
+    # Use a begin-exception block to have a separate sub-XID for each insert.
+    cur.execute(
+        """
+        do $$
+        begin
+          for i in 1..10000 loop
+            -- Use a begin-exception block to generate a new subtransaction on each iteration
+            begin
+              insert into t values (i);
+            exception when others then
+              raise 'not expected %', sqlerrm;
+            end;
+          end loop;
+        end;
+        $$;
+        """
+    )
+    # A checkpoint writes a WAL record with xl_xid=0. Many other WAL
+    # records would have the same effect.
+    cur.execute("checkpoint")
+    cur.execute("INSERT INTO t VALUES ('before fix')")
+    wait_for_wal_insert_lsn(env, endpoint, tenant, timeline)
+
+    # Ok, the nextXid in the pageserver at this LSN should now be incorrectly
+    # set to 1:1024. Remember this LSN.
+    broken_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()"))
+
+    # Now fix the bug, and generate some more WAL with XIDs.
+    ps_http.configure_failpoints(("reintroduce-nextxid-update-bug", "off"))
+    cur.execute("INSERT INTO t VALUES ('after fix')")
+    fixed_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()"))
+    wait_for_wal_insert_lsn(env, endpoint, tenant, timeline)
+
+    log.info(f"nextXid was broken by {broken_lsn}, and fixed again by {fixed_lsn}")
+
+    # Stop the original endpoint, we don't need it anymore.
+    endpoint.stop()
+
+    # Test that we cannot start a new endpoint at the broken LSN.
+    env.neon_cli.create_branch(
+        "at-broken-lsn", branch_name, ancestor_start_lsn=broken_lsn, tenant_id=tenant
+    )
+    with pytest.raises(RuntimeError, match="compute startup timed out; still in Init state"):
+        env.endpoints.create_start(
+            "at-broken-lsn",
+            endpoint_id="ep-at-broken-lsn",
+            tenant_id=tenant,
+        )
+        log.error("starting endpoint at broken LSN succeeded unexpectedly")
+
+    # But after the bug was fixed, the one-off hack fixed the timeline,
+    # and a later LSN works.
+    env.neon_cli.create_branch(
+        "at-fixed-lsn", branch_name, ancestor_start_lsn=fixed_lsn, tenant_id=tenant
+    )
+    endpoint_fixed = env.endpoints.create_start(
+        "at-fixed-lsn", endpoint_id="ep-at-fixed-lsn", tenant_id=tenant
+    )
+
+    conn = endpoint_fixed.connect()
+    cur = conn.cursor()
+    cur.execute("SELECT count(*) from t")
+    assert cur.fetchone() == (10000 + 1,)