Retroactively fix the nextXid on a known broken timeline

One particular timeline in production hit the nextXid bug. Add a one-off hack that fixes the nextXid on that particular timeline.
committed by Stas Kelvich
parent 734755eaca
commit 4ee11d9dfc
@@ -26,8 +26,11 @@ use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
 use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
 use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
+use std::str::FromStr;
 use anyhow::{bail, Context, Result};
 use bytes::{Buf, Bytes, BytesMut};
+use hex::FromHex;
 use tracing::*;
 use utils::failpoint_support;
@@ -44,10 +47,11 @@ use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment;
 use postgres_ffi::v14::xlog_utils::*;
-use postgres_ffi::v14::CheckPoint;
+use postgres_ffi::v14::{CheckPoint, bindings::FullTransactionId};
 use postgres_ffi::TransactionId;
 use postgres_ffi::BLCKSZ;
 use utils::lsn::Lsn;
+use utils::id::{TenantId, TimelineId};
 
 pub struct WalIngest {
     shard: ShardIdentity,
@@ -109,6 +113,53 @@ impl WalIngest {
             self.checkpoint_modified = true;
         }
 
+        // BEGIN ONE-OFF HACK
+        //
+        // We had a bug where we incorrectly passed 0 to update_next_xid(). That was
+        // harmless as long as nextXid was < 2^31, because 0 looked like a very old
+        // XID. But once nextXid reached 2^31, 0 started to look like a very new XID,
+        // and we incorrectly bumped up nextXid to the next epoch, to the value '1:1024'.
+        //
+        // We have one known timeline in production where that happened. This is a
+        // one-off fix for that damage. The last WAL record on that timeline as of
+        // this writing is this:
+        //
+        // rmgr: Standby len (rec/tot): 50/ 50, tx: 0, lsn: 35A/E32D86D8, prev 35A/E32D86B0, desc: RUNNING_XACTS nextXid 2325447052 latestCompletedXid 2325447051 oldestRunningXid 2325447052
+        //
+        // So on that particular timeline, before that LSN, fix the incorrectly set
+        // nextXid to the nextXid value from that record, plus 1000 to give some
+        // safety margin.
+
+        // For testing this hack, this failpoint temporarily re-introduces the bug
+        // that was fixed.
+        fn reintroduce_bug_failpoint_activated() -> bool {
+            fail::fail_point!("reintroduce-nextxid-update-bug", |_| { true });
+            return false;
+        }
+        if decoded.xl_xid == pg_constants::INVALID_TRANSACTION_ID
+            && reintroduce_bug_failpoint_activated()
+            && self.checkpoint.update_next_xid(decoded.xl_xid)
+        {
+            info!("failpoint: incorrectly updated nextXid at LSN {} to {}", lsn, self.checkpoint.nextXid.value);
+            self.checkpoint_modified = true;
+        }
+
+        if self.checkpoint.nextXid.value == 4294968320 && // 1:1024, the incorrect value
+            modification.tline.tenant_shard_id.tenant_id == TenantId::from_hex("df254570a4f603805528b46b0d45a76c").unwrap() &&
+            modification.tline.timeline_id == TimelineId::from_hex("e592ec1cc0e5a41f69060d0c9efb07ed").unwrap() &&
+            lsn < Lsn::from_str("35A/E32D9000").unwrap() &&
+            !reintroduce_bug_failpoint_activated()
+        {
+            // This is the last nextXid value from the last RUNNING_XACTS record, at
+            // the end of the WAL as of this writing.
+            self.checkpoint.nextXid = FullTransactionId {
+                value: 2325447052 + 1000,
+            };
+            self.checkpoint_modified = true;
+            warn!("nextXid fixed by one-off hack at LSN {}", lsn);
+        }
+        // END ONE-OFF HACK
+
         match decoded.xl_rmid {
             pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
                 // Heap AM records need some special handling, because they modify VM pages
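For illustration: a minimal, self-contained Rust sketch of the modulo-2^32 XID comparison described in the comment above. The helper below is hypothetical, not the actual update_next_xid() implementation; it shows why xid 0 flips from "very old" to "very new" once nextXid passes 2^31, and why the bogus bump lands exactly on '1:1024', i.e. (1 << 32) + 1024 = 4294968320.

// Hypothetical sketch, assuming a PostgreSQL-style wraparound comparison;
// not the actual update_next_xid() implementation.
fn xid_newer(a: u32, b: u32) -> bool {
    // `a` is "newer" than `b` if the wrapped difference, read as i32, is positive.
    (a.wrapping_sub(b) as i32) > 0
}

fn main() {
    // While nextXid is below 2^31, xid 0 compares as very old: no bump happens.
    assert!(!xid_newer(0, 1_000_000_000));
    // Once nextXid crosses 2^31, xid 0 suddenly compares as newer...
    assert!(xid_newer(0, 2_325_447_052));
    // ...so a naive updater concludes the XID counter wrapped around and bumps
    // the epoch, producing epoch 1, xid 1024: the value the hack checks for.
    assert_eq!((1u64 << 32) + 1024, 4294968320);
}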
@@ -1,5 +1,6 @@
 import json
 import os
+import pytest
 import time
 from pathlib import Path
 
@@ -98,7 +99,7 @@ def test_import_at_2bil(
     vanilla_pg.safe_psql("CREATE TABLE t (t text);")
     vanilla_pg.safe_psql("INSERT INTO t VALUES ('inserted in vanilla')")
 
-    endpoint_id = "ep-import_from_vanilla"
+    branch_name = "import_from_vanilla"
     tenant = TenantId.generate()
     timeline = TimelineId.generate()
 
@@ -138,7 +139,7 @@ def test_import_at_2bil(
             "--timeline-id",
             str(timeline),
             "--node-name",
-            endpoint_id,
+            branch_name,
             "--base-lsn",
             start_lsn,
             "--base-tarfile",
@@ -157,7 +158,8 @@ def test_import_at_2bil(
     wait_for_last_record_lsn(ps_http, tenant, timeline, Lsn(end_lsn))
 
     endpoint = env.endpoints.create_start(
-        endpoint_id,
+        branch_name,
+        endpoint_id="ep-import_from_vanilla",
         tenant_id=tenant,
         config_lines=[
             "log_autovacuum_min_duration = 0",
@@ -166,7 +168,6 @@ def test_import_at_2bil(
     )
     assert endpoint.safe_psql("select count(*) from t") == [(1,)]
 
-    # Ok, consume
     conn = endpoint.connect()
     cur = conn.cursor()
 
@@ -218,3 +219,204 @@ def test_import_at_2bil(
     cur = conn.cursor()
     cur.execute("SELECT count(*) from t")
     assert cur.fetchone() == (10000 + 1,)
+
+
+# This is a follow-up to the test_import_at_2bil test.
+#
+# Use a failpoint to reintroduce the bug that test_import_at_2bil also
+# tests. Then, after the damage has been done, clear the failpoint to
+# fix the bug. Check that the one-off hack that we added for a particular
+# timeline that hit this in production fixes the broken timeline.
+def test_one_off_hack_for_nextxid_bug(
+    neon_env_builder: NeonEnvBuilder,
+    test_output_dir: Path,
+    pg_distrib_dir: Path,
+    pg_bin,
+    vanilla_pg,
+):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+    env = neon_env_builder.init_start()
+    ps_http = env.pageserver.http_client()
+
+    # We begin with the old bug still present, to create a broken timeline
+    ps_http.configure_failpoints(("reintroduce-nextxid-update-bug", "return(true)"))
+
+    # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
+    # PgBin sets it automatically, but here we need to pipe psql output to the tar command.
+    psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
+
+    # Reset the vanilla Postgres instance to somewhat before 2 billion transactions,
+    # and around the same LSN as on the production timeline.
+    pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal")
+    cmd = [
+        pg_resetwal_path,
+        "--next-transaction-id=2129920000",
+        "-l",
+        "000000010000035A000000E0",
+        "-D",
+        str(vanilla_pg.pgdatadir),
+    ]
+    pg_bin.run_capture(cmd, env=psql_env)
+
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")
+    vanilla_pg.safe_psql(
+        """create table tt as select 'long string to consume some space' || g
+        from generate_series(1,300000) g"""
+    )
+    assert vanilla_pg.safe_psql("select count(*) from tt") == [(300000,)]
+    vanilla_pg.safe_psql("CREATE TABLE t (t text);")
+    vanilla_pg.safe_psql("INSERT INTO t VALUES ('inserted in vanilla')")
+
+    branch_name = "import_from_vanilla"
+    # This is the tenant/timeline that the one-off hack targets
+    tenant = TenantId("df254570a4f603805528b46b0d45a76c")
+    timeline = TimelineId("e592ec1cc0e5a41f69060d0c9efb07ed")
+
+    env.pageserver.tenant_create(tenant)
+
+    # Take basebackup
+    basebackup_dir = os.path.join(test_output_dir, "basebackup")
+    base_tar = os.path.join(basebackup_dir, "base.tar")
+    wal_tar = os.path.join(basebackup_dir, "pg_wal.tar")
+    os.mkdir(basebackup_dir)
+    vanilla_pg.safe_psql("CHECKPOINT")
+    pg_bin.run(
+        [
+            "pg_basebackup",
+            "-F",
+            "tar",
+            "-d",
+            vanilla_pg.connstr(),
+            "-D",
+            basebackup_dir,
+        ]
+    )
+
+    # Get start_lsn and end_lsn
+    with open(os.path.join(basebackup_dir, "backup_manifest")) as f:
+        manifest = json.load(f)
+        start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"]
+        end_lsn = manifest["WAL-Ranges"][0]["End-LSN"]
+
+    def import_tar(base, wal):
+        env.neon_cli.raw_cli(
+            [
+                "timeline",
+                "import",
+                "--tenant-id",
+                str(tenant),
+                "--timeline-id",
+                str(timeline),
+                "--node-name",
+                branch_name,
+                "--base-lsn",
+                start_lsn,
+                "--base-tarfile",
+                base,
+                "--end-lsn",
+                end_lsn,
+                "--wal-tarfile",
+                wal,
+                "--pg-version",
+                env.pg_version,
+            ]
+        )
+
+    # Importing correct backup works
+    import_tar(base_tar, wal_tar)
+    wait_for_last_record_lsn(ps_http, tenant, timeline, Lsn(end_lsn))
+
+    endpoint = env.endpoints.create_start(
+        branch_name,
+        endpoint_id="ep-import_from_vanilla",
+        tenant_id=tenant,
+        config_lines=[
+            "log_autovacuum_min_duration = 0",
+            "autovacuum_naptime='5 s'",
+        ],
+    )
+    assert endpoint.safe_psql("select count(*) from t") == [(1,)]
+
+    conn = endpoint.connect()
+    cur = conn.cursor()
+
+    # Install extension containing function needed for test
+    cur.execute("CREATE EXTENSION neon_test_utils")
+
+    # Advance nextXid close to 2 billion XIDs
+    while True:
+        xid = int(query_scalar(cur, "SELECT txid_current()"))
+        log.info(f"xid now {xid}")
+        # Consume XIDs 50k at a time, then 5k at a time, until we get close to 2^31
+        if xid < 2 * 1024 * 1024 * 1024 - 100000:
+            cur.execute("select test_consume_xids(50000);")
+        elif xid < 2 * 1024 * 1024 * 1024 - 10000:
+            cur.execute("select test_consume_xids(5000);")
+        else:
+            break
+
+    # Run a bunch of real INSERTs to cross over the 2 billion mark.
+    # Use a begin-exception block to have a separate sub-XID for each insert.
+    cur.execute(
+        """
+        do $$
+        begin
+          for i in 1..10000 loop
+            -- Use a begin-exception block to generate a new subtransaction on each iteration
+            begin
+              insert into t values (i);
+            exception when others then
+              raise 'not expected %', sqlerrm;
+            end;
+          end loop;
+        end;
+        $$;
+        """
+    )
+    # A checkpoint writes a WAL record with xl_xid=0. Many other WAL
+    # records would have the same effect.
+    cur.execute("checkpoint")
+    cur.execute("INSERT INTO t VALUES ('before fix')")
+    wait_for_wal_insert_lsn(env, endpoint, tenant, timeline)
+
+    # Ok, the nextXid in the pageserver at this LSN should now be incorrectly
+    # set to 1:1024. Remember this LSN.
+    broken_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()"))
+
+    # Now fix the bug, and generate some WAL with XIDs
+    ps_http.configure_failpoints(("reintroduce-nextxid-update-bug", "off"))
+    cur.execute("INSERT INTO t VALUES ('after fix')")
+    fixed_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()"))
+    wait_for_wal_insert_lsn(env, endpoint, tenant, timeline)
+
+    log.info(f"nextXid was broken by {broken_lsn}, and fixed again by {fixed_lsn}")
+
+    # Stop the original endpoint, we don't need it anymore.
+    endpoint.stop()
+
+    # Test that we cannot start a new endpoint at the broken LSN.
+    env.neon_cli.create_branch(
+        "at-broken-lsn", branch_name, ancestor_start_lsn=broken_lsn, tenant_id=tenant
+    )
+    with pytest.raises(RuntimeError, match="compute startup timed out; still in Init state"):
+        env.endpoints.create_start(
+            "at-broken-lsn",
+            endpoint_id="ep-at-broken-lsn",
+            tenant_id=tenant,
+        )
+        log.error("starting endpoint at broken LSN succeeded unexpectedly")
+
+    # But after the bug was fixed, the one-off hack fixed the timeline,
+    # and a later LSN works.
+    env.neon_cli.create_branch(
+        "at-fixed-lsn", branch_name, ancestor_start_lsn=fixed_lsn, tenant_id=tenant
+    )
+    endpoint_fixed = env.endpoints.create_start(
+        "at-fixed-lsn", endpoint_id="ep-at-fixed-lsn", tenant_id=tenant
+    )
+
+    conn = endpoint_fixed.connect()
+    cur = conn.cursor()
+    cur.execute("SELECT count(*) from t")
+    assert cur.fetchone() == (10000 + 1,)