From e4898a6e605e791a00ce21bf49d4cc0d9a10534a Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Sat, 20 Jan 2024 18:04:16 +0200
Subject: [PATCH] Don't pass InvalidTransactionId to update_next_xid. (#6410)

update_next_xid() doesn't have any special treatment for the invalid or
other special XIDs, so it will treat InvalidTransactionId (0) as a
regular XID. If the old nextXid is smaller than 2^31, 0 will look like a
very old XID, and nothing happens. But if nextXid is greater than 2^31,
0 will look like a very new XID, and update_next_xid() will incorrectly
bump up nextXid.
---
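Note, kept below the "---" so it stays out of the commit message: a minimal
Rust sketch of the modulo-2^32 XID comparison described above. The
xid_follows() helper is hypothetical and for illustration only; it is not the
actual update_next_xid() implementation, but it uses the same
wrapping-subtraction-interpreted-as-signed comparison that PostgreSQL-style
XID code relies on.

    // Hypothetical helper: `a` counts as newer than `b` when the wrapping
    // difference, interpreted as a signed 32-bit value, is positive.
    fn xid_follows(a: u32, b: u32) -> bool {
        (a.wrapping_sub(b) as i32) > 0
    }

    fn main() {
        let invalid_xid: u32 = 0; // InvalidTransactionId

        // nextXid below 2^31: 0 looks like a very old XID, so nothing happens.
        assert!(!xid_follows(invalid_xid, 1_000_000_000));

        // nextXid above 2^31: 0 suddenly looks like a very new XID, so a
        // naive update_next_xid(0) would bump nextXid (and eventually the
        // epoch). The fix below simply skips the call when xl_xid is invalid.
        assert!(xid_follows(invalid_xid, 2_200_000_000));
    }
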
 pageserver/src/walingest.rs          |  11 +-
 test_runner/regress/test_next_xid.py | 172 ++++++++++++++++++++++++++-
 test_runner/regress/test_vm_bits.py  |   3 +
 3 files changed, 181 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 26de229e4d..f2c35436db 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -102,7 +102,9 @@ impl WalIngest {
         buf.advance(decoded.main_data_offset);
 
         assert!(!self.checkpoint_modified);
-        if self.checkpoint.update_next_xid(decoded.xl_xid) {
+        if decoded.xl_xid != pg_constants::INVALID_TRANSACTION_ID
+            && self.checkpoint.update_next_xid(decoded.xl_xid)
+        {
             self.checkpoint_modified = true;
         }
 
@@ -330,8 +332,13 @@ impl WalIngest {
                         < 0
                     {
                         self.checkpoint.oldestXid = xlog_checkpoint.oldestXid;
-                        self.checkpoint_modified = true;
                     }
+
+                    // Write a new checkpoint key-value pair on every checkpoint record, even
+                    // if nothing really changed. Not strictly required, but it seems nice to
+                    // have some trace of the checkpoint records in the layer files at the same
+                    // LSNs.
+                    self.checkpoint_modified = true;
                 }
             }
             pg_constants::RM_LOGICALMSG_ID => {
diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py
index 6e94e15227..da2580dbf9 100644
--- a/test_runner/regress/test_next_xid.py
+++ b/test_runner/regress/test_next_xid.py
@@ -1,10 +1,18 @@
+import json
+import os
 import time
+from pathlib import Path
 
-from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_wal_insert_lsn
+from fixtures.pageserver.utils import (
+    wait_for_last_record_lsn,
+)
+from fixtures.remote_storage import RemoteStorageKind
+from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.utils import query_scalar
 
 
-# Test restarting page server, while safekeeper and compute node keep
-# running.
 def test_next_xid(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
 
@@ -52,3 +60,161 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder):
         cur = conn.cursor()
         cur.execute("SELECT count(*) FROM t")
         assert cur.fetchone() == (iterations,)
+
+
+# Test for a bug we had, where nextXid was incorrectly updated when the
+# XID counter reached 2 billion. The nextXid tracking logic incorrectly
+# treated 0 (InvalidTransactionId) as a regular XID, and after reaching
+# 2 billion, it started to look like a very new XID, which caused nextXid
+# to be immediately advanced to the next epoch.
+#
+def test_import_at_2bil(
+    neon_env_builder: NeonEnvBuilder,
+    test_output_dir: Path,
+    pg_distrib_dir: Path,
+    pg_bin,
+    vanilla_pg,
+):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+    env = neon_env_builder.init_start()
+    ps_http = env.pageserver.http_client()
+
+    # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
+    # PgBin sets it automatically, but here we need to pipe psql output to the tar command.
+    psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
+
+    # Reset the vanilla Postgres instance to somewhat before 2 billion transactions.
+    pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal")
+    cmd = [pg_resetwal_path, "--next-transaction-id=2129920000", "-D", str(vanilla_pg.pgdatadir)]
+    pg_bin.run_capture(cmd, env=psql_env)
+
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")
+    vanilla_pg.safe_psql(
+        """create table tt as select 'long string to consume some space' || g
+        from generate_series(1,300000) g"""
+    )
+    assert vanilla_pg.safe_psql("select count(*) from tt") == [(300000,)]
+    vanilla_pg.safe_psql("CREATE TABLE t (t text);")
+    vanilla_pg.safe_psql("INSERT INTO t VALUES ('inserted in vanilla')")
+
+    endpoint_id = "ep-import_from_vanilla"
+    tenant = TenantId.generate()
+    timeline = TimelineId.generate()
+
+    env.pageserver.tenant_create(tenant)
+
+    # Take basebackup
+    basebackup_dir = os.path.join(test_output_dir, "basebackup")
+    base_tar = os.path.join(basebackup_dir, "base.tar")
+    wal_tar = os.path.join(basebackup_dir, "pg_wal.tar")
+    os.mkdir(basebackup_dir)
+    vanilla_pg.safe_psql("CHECKPOINT")
+    pg_bin.run(
+        [
+            "pg_basebackup",
+            "-F",
+            "tar",
+            "-d",
+            vanilla_pg.connstr(),
+            "-D",
+            basebackup_dir,
+        ]
+    )
+
+    # Get start_lsn and end_lsn
+    with open(os.path.join(basebackup_dir, "backup_manifest")) as f:
+        manifest = json.load(f)
+        start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"]
+        end_lsn = manifest["WAL-Ranges"][0]["End-LSN"]
+
+    def import_tar(base, wal):
+        env.neon_cli.raw_cli(
+            [
+                "timeline",
+                "import",
+                "--tenant-id",
+                str(tenant),
+                "--timeline-id",
+                str(timeline),
+                "--node-name",
+                endpoint_id,
+                "--base-lsn",
+                start_lsn,
+                "--base-tarfile",
+                base,
+                "--end-lsn",
+                end_lsn,
+                "--wal-tarfile",
+                wal,
+                "--pg-version",
+                env.pg_version,
+            ]
+        )
+
+    # Importing the correct backup works
+    import_tar(base_tar, wal_tar)
+    wait_for_last_record_lsn(ps_http, tenant, timeline, Lsn(end_lsn))
+
+    endpoint = env.endpoints.create_start(
+        endpoint_id,
+        tenant_id=tenant,
+        config_lines=[
+            "log_autovacuum_min_duration = 0",
+            "autovacuum_naptime='5 s'",
+        ],
+    )
+    assert endpoint.safe_psql("select count(*) from t") == [(1,)]
+
+    # Ok, consume
+    conn = endpoint.connect()
+    cur = conn.cursor()
+
+    # Install extension containing function needed for test
+    cur.execute("CREATE EXTENSION neon_test_utils")
+
+    # Advance nextXid close to 2 billion XIDs
+    while True:
+        xid = int(query_scalar(cur, "SELECT txid_current()"))
+        log.info(f"xid now {xid}")
+        # Consume XIDs in batches of 50k until we get close to 2^31, then in
+        # batches of 5k for the last stretch.
+        if xid < 2 * 1024 * 1024 * 1024 - 100000:
+            cur.execute("select test_consume_xids(50000);")
+        elif xid < 2 * 1024 * 1024 * 1024 - 10000:
+            cur.execute("select test_consume_xids(5000);")
+        else:
+            break
+
+    # Run a bunch of real INSERTs to cross over the 2 billion mark.
+    # Use a begin-exception block to have a separate sub-XID for each insert.
+    cur.execute(
+        """
+        do $$
+        begin
+          for i in 1..10000 loop
+            -- Use a begin-exception block to generate a new subtransaction on each iteration
+            begin
+              insert into t values (i);
+            exception when others then
+              raise 'not expected %', sqlerrm;
+            end;
+          end loop;
+        end;
+        $$;
+        """
+    )
+    # A checkpoint writes a WAL record with xl_xid=0. Many other WAL
+    # records would have the same effect.
+ cur.execute("checkpoint") + + # wait until pageserver receives that data + wait_for_wal_insert_lsn(env, endpoint, tenant, timeline) + + # Restart endpoint + endpoint.stop() + endpoint.start() + + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("SELECT count(*) from t") + assert cur.fetchone() == (10000 + 1,) diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index bc810ceb09..415f086bd3 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -1,3 +1,4 @@ +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn @@ -117,6 +118,8 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): # Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK # record. # +# FIXME: This test is broken +@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/6412#issuecomment-1902072541") def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv): env = neon_simple_env