Retroactively fix the nextXid on a known broken timeline

This one particular timeline in production hit the nextXid bug. Add a
one-off hack that will fix the nextXid on that particular timeline.
Heikki Linnakangas
2024-01-20 17:58:35 +02:00
committed by Stas Kelvich
parent 734755eaca
commit 4ee11d9dfc
2 changed files with 258 additions and 5 deletions


@@ -26,8 +26,11 @@ use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
use std::str::FromStr;
use anyhow::{bail, Context, Result};
use bytes::{Buf, Bytes, BytesMut};
use hex::FromHex;
use tracing::*;
use utils::failpoint_support;
@@ -44,10 +47,11 @@ use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment;
use postgres_ffi::v14::xlog_utils::*;
use postgres_ffi::v14::CheckPoint;
use postgres_ffi::v14::{CheckPoint, bindings::FullTransactionId};
use postgres_ffi::TransactionId;
use postgres_ffi::BLCKSZ;
use utils::lsn::Lsn;
use utils::id::{TenantId, TimelineId};
pub struct WalIngest {
shard: ShardIdentity,
@@ -109,6 +113,53 @@ impl WalIngest {
self.checkpoint_modified = true;
}
// BEGIN ONE-OFF HACK
//
// We had a bug where we incorrectly passed 0 to update_next_xid(). That was
// harmless as long as nextXid was < 2^31, because 0 looked like a very old
// XID. But once nextXid reaches 2^31, 0 starts to look like a very new XID, and
// we incorrectly bumped up nextXid to the next epoch, to value '1:1024'
//
// We have one known timeline in production where that happened. This is a one-off
// hack to repair that damage. The last WAL record on that timeline as of this
// writing is this:
//
// rmgr: Standby len (rec/tot): 50/ 50, tx: 0, lsn: 35A/E32D86D8, prev 35A/E32D86B0, desc: RUNNING_XACTS nextXid 2325447052 latestCompletedXid 2325447051 oldestRunningXid 2325447052
//
// So on that particular timeline, before that LSN, fix the incorrectly set
// nextXid to the nextXid value from that record, plus 1000 to give some safety
// margin.
// For testing this hack, this failpoint temporarily re-introduces the bug that
// was fixed.
fn reintroduce_bug_failpoint_activated() -> bool {
fail::fail_point!("reintroduce-nextxid-update-bug", |_| { true });
return false;
}
if decoded.xl_xid == pg_constants::INVALID_TRANSACTION_ID
&& reintroduce_bug_failpoint_activated()
&& self.checkpoint.update_next_xid(decoded.xl_xid)
{
info!("failpoint: Incorrectly updated nextXid at LSN {} to {}", lsn, self.checkpoint.nextXid.value);
self.checkpoint_modified = true;
}
if self.checkpoint.nextXid.value == 4294968320 && // 1:1024, the incorrect value
modification.tline.tenant_shard_id.tenant_id == TenantId::from_hex("df254570a4f603805528b46b0d45a76c").unwrap() &&
modification.tline.timeline_id == TimelineId::from_hex("e592ec1cc0e5a41f69060d0c9efb07ed").unwrap() &&
lsn < Lsn::from_str("35A/E32D9000").unwrap() &&
!reintroduce_bug_failpoint_activated()
{
// This is the last nextXid value from the last RUNNING_XACTS record, at the
// end of the WAL as of this writing.
self.checkpoint.nextXid = FullTransactionId {
value: 2325447052 + 1000
};
self.checkpoint_modified = true;
warn!("nextXid fixed by one-off hack at LSN {}", lsn);
}
// END ONE-OFF HACK
match decoded.xl_rmid {
pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
// Heap AM records need some special handling, because they modify VM pages
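To make the '1:1024' value and the 2^31 threshold in the hack above easier to follow, here is a small standalone Python sketch of the failure mode. It is not the pageserver's actual update_next_xid implementation and the helper names are made up for illustration; it only assumes the usual circular 32-bit XID comparison and that the updated nextXid is rounded up to a multiple of 1024, which is what the observed 1:1024 value suggests.

# Hypothetical sketch of the nextXid bug; not pageserver code.

def xid_follows(a: int, b: int) -> bool:
    # Circular 32-bit XID comparison: 'a' is ahead of 'b' if the signed
    # 32-bit difference (a - b) is positive.
    diff = (a - b) & 0xFFFFFFFF
    return 0 < diff < 0x80000000

def buggy_update_next_xid(next_full_xid: int, record_xid: int) -> int:
    # Emulates feeding record_xid (incorrectly 0) into a nextXid update.
    epoch, next_xid = next_full_xid >> 32, next_full_xid & 0xFFFFFFFF
    if xid_follows(record_xid, next_xid):
        if record_xid < next_xid:
            epoch += 1  # looks like the XID counter wrapped around: bump the epoch
        # assumed granularity: round up to the next multiple of 1024
        next_xid = (record_xid // 1024 + 1) * 1024
    return (epoch << 32) | next_xid

# While nextXid is still below 2^31, a stray 0 looks like a very old XID
# and nothing happens:
assert buggy_update_next_xid(2_000_000_000, 0) == 2_000_000_000

# Past 2^31 (e.g. the 2325447052 from the RUNNING_XACTS record above),
# 0 looks like a very new XID, so nextXid jumps to epoch 1, xid 1024:
broken = buggy_update_next_xid(2_325_447_052, 0)
assert broken == 4_294_968_320  # the '1:1024' value the one-off hack checks for
# The hack then rewrites nextXid to 2325447052 + 1000 on the affected timeline.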


@@ -1,5 +1,6 @@
import json
import os
import pytest
import time
from pathlib import Path
@@ -98,7 +99,7 @@ def test_import_at_2bil(
vanilla_pg.safe_psql("CREATE TABLE t (t text);")
vanilla_pg.safe_psql("INSERT INTO t VALUES ('inserted in vanilla')")
endpoint_id = "ep-import_from_vanilla"
branch_name = "import_from_vanilla"
tenant = TenantId.generate()
timeline = TimelineId.generate()
@@ -138,7 +139,7 @@ def test_import_at_2bil(
"--timeline-id",
str(timeline),
"--node-name",
endpoint_id,
branch_name,
"--base-lsn",
start_lsn,
"--base-tarfile",
@@ -157,7 +158,8 @@ def test_import_at_2bil(
wait_for_last_record_lsn(ps_http, tenant, timeline, Lsn(end_lsn))
endpoint = env.endpoints.create_start(
endpoint_id,
branch_name,
endpoint_id="ep-import_from_vanilla",
tenant_id=tenant,
config_lines=[
"log_autovacuum_min_duration = 0",
@@ -166,7 +168,6 @@ def test_import_at_2bil(
)
assert endpoint.safe_psql("select count(*) from t") == [(1,)]
# Ok, consume
conn = endpoint.connect()
cur = conn.cursor()
@@ -218,3 +219,204 @@ def test_import_at_2bil(
cur = conn.cursor()
cur.execute("SELECT count(*) from t")
assert cur.fetchone() == (10000 + 1,)
# This is a followup to the test_import_at_2bil test.
#
# Use a failpoint to reintroduce the bug that test_import_at_2bil also
# tests. Then, after the damage has been done, clear the failpoint to
# fix the bug. Check that the one-off hack that we added for a particular
# timeline that hit this in production fixes the broken timeline.
def test_one_off_hack_for_nextxid_bug(
neon_env_builder: NeonEnvBuilder,
test_output_dir: Path,
pg_distrib_dir: Path,
pg_bin,
vanilla_pg,
):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
ps_http = env.pageserver.http_client()
# We begin with the old bug still present, to create a broken timeline
ps_http.configure_failpoints(("reintroduce-nextxid-update-bug", "return(true)"))
# Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
# PgBin sets it automatically, but here we need to pipe psql output to the tar command.
psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
# Reset the vanilla Postgres instance to a nextXid somewhat below 2^31, and to
# roughly the same LSN as the production timeline.
pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal")
cmd = [
pg_resetwal_path,
"--next-transaction-id=2129920000",
"-l",
"000000010000035A000000E0",
"-D",
str(vanilla_pg.pgdatadir),
]
pg_bin.run_capture(cmd, env=psql_env)
vanilla_pg.start()
vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")
vanilla_pg.safe_psql(
"""create table tt as select 'long string to consume some space' || g
from generate_series(1,300000) g"""
)
assert vanilla_pg.safe_psql("select count(*) from tt") == [(300000,)]
vanilla_pg.safe_psql("CREATE TABLE t (t text);")
vanilla_pg.safe_psql("INSERT INTO t VALUES ('inserted in vanilla')")
branch_name = "import_from_vanilla"
# This is the tenant/timeline that the one-off hack targets
tenant = "df254570a4f603805528b46b0d45a76c"
timeline = "e592ec1cc0e5a41f69060d0c9efb07ed"
env.pageserver.tenant_create(tenant)
# Take basebackup
basebackup_dir = os.path.join(test_output_dir, "basebackup")
base_tar = os.path.join(basebackup_dir, "base.tar")
wal_tar = os.path.join(basebackup_dir, "pg_wal.tar")
os.mkdir(basebackup_dir)
vanilla_pg.safe_psql("CHECKPOINT")
pg_bin.run(
[
"pg_basebackup",
"-F",
"tar",
"-d",
vanilla_pg.connstr(),
"-D",
basebackup_dir,
]
)
# Get start_lsn and end_lsn
with open(os.path.join(basebackup_dir, "backup_manifest")) as f:
manifest = json.load(f)
start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"]
end_lsn = manifest["WAL-Ranges"][0]["End-LSN"]
def import_tar(base, wal):
env.neon_cli.raw_cli(
[
"timeline",
"import",
"--tenant-id",
str(tenant),
"--timeline-id",
str(timeline),
"--node-name",
branch_name,
"--base-lsn",
start_lsn,
"--base-tarfile",
base,
"--end-lsn",
end_lsn,
"--wal-tarfile",
wal,
"--pg-version",
env.pg_version,
]
)
# Importing correct backup works
import_tar(base_tar, wal_tar)
wait_for_last_record_lsn(ps_http, tenant, timeline, Lsn(end_lsn))
endpoint = env.endpoints.create_start(
branch_name,
endpoint_id="ep-import_from_vanilla",
tenant_id=tenant,
config_lines=[
"log_autovacuum_min_duration = 0",
"autovacuum_naptime='5 s'",
],
)
assert endpoint.safe_psql("select count(*) from t") == [(1,)]
conn = endpoint.connect()
cur = conn.cursor()
# Install extension containing function needed for test
cur.execute("CREATE EXTENSION neon_test_utils")
# Advance nextXid close to 2^31 XIDs
while True:
xid = int(query_scalar(cur, "SELECT txid_current()"))
log.info(f"xid now {xid}")
# Consume 50k XIDs at a time until we are within 100k of 2^31, then 5k at a time
if xid < 2 * 1024 * 1024 * 1024 - 100000:
cur.execute("select test_consume_xids(50000);")
elif xid < 2 * 1024 * 1024 * 1024 - 10000:
cur.execute("select test_consume_xids(5000);")
else:
break
# Run a bunch of real INSERTs to cross over the 2^31 mark
# Use a begin-exception block to have a separate sub-XID for each insert.
cur.execute(
"""
do $$
begin
for i in 1..10000 loop
-- Use a begin-exception block to generate a new subtransaction on each iteration
begin
insert into t values (i);
exception when others then
raise 'not expected %', sqlerrm;
end;
end loop;
end;
$$;
"""
)
# A checkpoint writes a WAL record with xl_xid=0. Many other WAL
# records would have the same effect.
cur.execute("checkpoint")
cur.execute("INSERT INTO t VALUES ('before fix')")
wait_for_wal_insert_lsn(env, endpoint, tenant, timeline)
# Ok, the nextXid in the pageserver at this LSN should now be incorrectly
# set to 1:1024. Remember this LSN.
broken_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()"))
# Now fix the bug, and generate some WAL with XIDs
ps_http.configure_failpoints(("reintroduce-nextxid-update-bug", "off"))
cur.execute("INSERT INTO t VALUES ('after fix')")
fixed_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()"))
wait_for_wal_insert_lsn(env, endpoint, tenant, timeline)
log.info(f"nextXid was broken by {broken_lsn}, and fixed again by {fixed_lsn}")
# Stop the original endpoint, we don't need it anymore.
endpoint.stop()
# Test that we cannot start a new endpoint at the broken LSN.
env.neon_cli.create_branch(
"at-broken-lsn", branch_name, ancestor_start_lsn=broken_lsn, tenant_id=tenant
)
with pytest.raises(RuntimeError, match="compute startup timed out; still in Init state"):
env.endpoints.create_start(
"at-broken-lsn",
endpoint_id="ep-at-broken-lsn",
tenant_id=tenant,
)
log.error("starting endpoint at broken LSN succeeded unexpectedly")
# But after the bug was fixed, the one-off hack fixed the timeline,
# and a later LSN works.
env.neon_cli.create_branch(
"at-fixed-lsn", branch_name, ancestor_start_lsn=fixed_lsn, tenant_id=tenant
)
endpoint_fixed = env.endpoints.create_start(
"at-fixed-lsn", endpoint_id="ep-at-fixed-lsn", tenant_id=tenant
)
conn = endpoint_fixed.connect()
cur = conn.cursor()
cur.execute("SELECT count(*) from t")
assert cur.fetchone() == (10000 + 1,)