Compare commits

...

38 Commits

Author SHA1 Message Date
Konstantin Knizhnik
b4973ecdfa Add 'failed get control bytes' to list of expected errors 2024-12-05 16:24:31 +02:00
Konstantin Knizhnik
27403e6c3b Add 'failed to get checkpoint bytes' to list of expected errors 2024-12-05 15:32:02 +02:00
Konstantin Knizhnik
2c10e8a966 Remove unnecessary reset of redo field 2024-12-05 14:28:34 +02:00
Heikki Linnakangas
0128948d8d Abuse Checkpoint.redo field to distinguish shutdown checkpoints 2024-12-05 10:04:51 +02:00
Konstantin Knizhnik
7a92454fbe Reset checkpoint redo field after storing it to correctly detect normal shutdown 2024-12-05 09:04:59 +02:00
Konstantin Knizhnik
7fed72e951 Fix compile warnings 2024-12-04 22:03:51 +02:00
Konstantin Knizhnik
19510f3315 Remove calculate_walrecord_end_lsn function 2024-12-04 17:47:49 +02:00
Konstantin Knizhnik
13dd2f11c6 Use redo field to distinguish shutdown and online checkpoints 2024-12-04 15:33:42 +02:00
Konstantin Knizhnik
4dbb469539 Remove AuxFileStore from test_pgstat.py 2024-12-04 10:19:30 +02:00
Konstantin Knizhnik
3114a9f992 Use new version of test_wal_receiver test 2024-12-04 09:51:48 +02:00
Konstantin Knizhnik
668845fcb7 Yet another attempt to make test_wal_receiver test pass 2024-12-04 09:50:09 +02:00
Konstantin Knizhnik
0f63037345 Increase timeout to 10 seconds 2024-12-04 09:50:09 +02:00
Konstantin Knizhnik
dd826c4c81 Increase wait LSN timeout to 2 seconds 2024-12-04 09:50:09 +02:00
Konstantin Knizhnik
8782018438 Add SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT 2024-12-04 09:50:09 +02:00
Konstantin Knizhnik
f4196983d2 Undo changes test_import.py 2024-12-04 09:50:09 +02:00
Konstantin Knizhnik
293f22056f Undo debug trace 2024-12-04 09:50:09 +02:00
Konstantin Knizhnik
cfcb197d85 Add calculate_walrecord_end_lsn function 2024-12-04 09:50:08 +02:00
Konstantin Knizhnik
fa3dc91fb5 Fix is_aux_file_key 2024-12-04 09:50:07 +02:00
Konstantin Knizhnik
0841fb9b7b Do not import AUX files 2024-12-04 09:48:55 +02:00
Konstantin Knizhnik
b5802abab9 Disable importing AUX files 2024-12-04 09:44:49 +02:00
Konstantin Knizhnik
5d12e5a72d Update pageserver/src/basebackup.rs
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2024-12-04 09:44:49 +02:00
Konstantin Knizhnik
3179b9bed2 Add test that pgstat information is dropped in case of abnormal termination 2024-12-04 09:44:49 +02:00
Konstantin Knizhnik
e48a8415ff Add comments 2024-12-04 09:44:49 +02:00
Konstantin Knizhnik
fc041a213b Update test_runner/regress/test_pgstat.py
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2024-12-04 09:44:48 +02:00
Konstantin Knizhnik
3890f5846c Update pageserver/src/basebackup.rs
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2024-12-04 09:44:48 +02:00
Konstantin Knizhnik
585672f5fc Update test_runner/regress/test_pgstat.py
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2024-12-04 09:44:48 +02:00
Konstantin Knizhnik
92ae912ebe Handle lack of checkpoint in basebackup 2024-12-04 09:44:48 +02:00
Konstantin Knizhnik
953ad21104 Make clippy happy 2024-12-04 09:44:27 +02:00
Konstantin Knizhnik
0fcf20075a Drop pg_stat data in case of abnormal Postgres termination 2024-12-04 09:43:57 +02:00
Konstantin Knizhnik
c4affc7859 Add neon_pgstat_file_size_limit GUC to limit size of AUX file for pg_stat 2024-12-04 09:43:55 +02:00
Konstantin Knizhnik
368fc2c6c3 Do not import AUX files while initializing fresh timeline from pgdatadir 2024-12-04 09:40:47 +02:00
Konstantin Knizhnik
dfed7029e1 Make clippy happy 2024-12-04 09:40:47 +02:00
Konstantin Knizhnik
6c0e44be4c Fix bug in AUX v1 delta optimization 2024-12-04 09:40:47 +02:00
Konstantin Knizhnik
dc780c9e1e Fix bug in including AUX files in basebackup 2024-12-04 09:40:47 +02:00
Konstantin Knizhnik
45dcc66bf4 Fix python formatting 2024-12-04 09:40:46 +02:00
Konstantin Knizhnik
d91c5c3523 Add AUX_DIR_PG_STAT 2024-12-04 09:40:46 +02:00
Konstantin Knizhnik
b79e2062bd Add test for restoring pgstat file 2024-12-04 09:40:46 +02:00
Konstantin Knizhnik
ffc24d6d8d Persist pg_stat information in PS 2024-12-04 09:40:44 +02:00
16 changed files with 221 additions and 39 deletions

View File

@@ -278,7 +278,7 @@ pub fn generate_pg_control(
     checkpoint_bytes: &[u8],
     lsn: Lsn,
     pg_version: u32,
-) -> anyhow::Result<(Bytes, u64)> {
+) -> anyhow::Result<(Bytes, u64, bool)> {
    dispatch_pgversion!(
        pg_version,
        pgv::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),

View File

@@ -124,23 +124,64 @@ pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn {
     }
 }
 
+/// Generate a pg_control file, for a basebackup for starting up Postgres at the given LSN
+///
+/// 'pg_control_bytes' and 'checkpoint_bytes' are the contents of those keys persisted in
+/// the pageserver. They use the same format as the PostgreSQL control file and the
+/// checkpoint record, but see walingest.rs for how exactly they are kept up to date.
+/// 'lsn' is the LSN at which we're starting up.
+///
+/// Returns:
+/// - pg_control file contents
+/// - system_identifier, extracted from the persisted information
+/// - true, if we're starting up from a "clean shutdown", i.e. if there was a shutdown
+///   checkpoint at the given LSN
 pub fn generate_pg_control(
     pg_control_bytes: &[u8],
     checkpoint_bytes: &[u8],
     lsn: Lsn,
-) -> anyhow::Result<(Bytes, u64)> {
+) -> anyhow::Result<(Bytes, u64, bool)> {
     let mut pg_control = ControlFileData::decode(pg_control_bytes)?;
     let mut checkpoint = CheckPoint::decode(checkpoint_bytes)?;
+    let was_shutdown;
 
     // Generate new pg_control needed for bootstrap
-    checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0;
+    //
+    // NB: In the checkpoint struct that we persist in the pageserver, we have a different
+    // convention for the 'redo' field than in PostgreSQL: On a shutdown checkpoint,
+    // 'redo' points to the *end* of the checkpoint WAL record. In PostgreSQL, it points
+    // to the beginning. Furthermore, on an online checkpoint, 'redo' is set to 0.
+    //
+    // We didn't always have this convention, however, and old persisted records will
+    // have old REDO values that point to some old LSN.
+    //
+    // The upshot is that if 'redo' is equal to the "current" LSN, there was a shutdown
+    // checkpoint record at that point in the WAL, with no new WAL records after it. That
+    // case can be treated as starting from a clean shutdown. All other cases are treated
+    // as a non-clean shutdown. In Neon, we don't do WAL replay at startup in either
+    // case, so that distinction doesn't matter very much. As of this writing, it only
+    // affects whether the persisted pg_stats information can be used or not.
+    //
+    // In the Checkpoint struct in the returned pg_control file, the redo pointer is
+    // always set to the LSN we're starting at, to hint that no WAL replay is required.
+    // (There's some neon-specific code in Postgres startup to make that work, though.
+    // Just setting the redo pointer is not sufficient.)
+    if Lsn(checkpoint.redo) == lsn {
+        was_shutdown = true;
+    } else {
+        checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0;
+        was_shutdown = false;
+    }
 
     //save new values in pg_control
+    // We use DBState_DB_SHUTDOWNED even if it was not a clean shutdown. The
+    // neon-specific code at postgres startup ignores the state stored in the control
+    // file, similar to archive recovery in standalone PostgreSQL. Similarly, the
+    // checkPoint pointer is ignored, so just set it to 0.
     pg_control.checkPoint = 0;
     pg_control.checkPointCopy = checkpoint;
     pg_control.state = DBState_DB_SHUTDOWNED;
 
-    Ok((pg_control.encode(), pg_control.system_identifier))
+    Ok((pg_control.encode(), pg_control.system_identifier, was_shutdown))
 }
 
 pub fn get_current_timestamp() -> TimestampTz {
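
To make the redo-field convention concrete, here is a minimal, self-contained sketch of the detection logic. The Lsn newtype and the two helper functions are simplified stand-ins for illustration, not the pageserver's actual types:

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct Lsn(u64);

const INVALID_LSN: Lsn = Lsn(0); // stand-in for Lsn::INVALID

// What walingest stores in the persisted Checkpoint.redo field.
fn persisted_redo(is_shutdown_checkpoint: bool, record_end_lsn: Lsn) -> Lsn {
    if is_shutdown_checkpoint {
        record_end_lsn // shutdown checkpoint: the record's *end* LSN
    } else {
        INVALID_LSN // online checkpoint: zero
    }
}

// What generate_pg_control checks at basebackup time.
fn was_clean_shutdown(redo: Lsn, basebackup_lsn: Lsn) -> bool {
    redo == basebackup_lsn
}

fn main() {
    let end = Lsn(0x16B_2DA8); // hypothetical end LSN of a shutdown checkpoint record
    // Basebackup taken exactly at the record's end: clean shutdown.
    assert!(was_clean_shutdown(persisted_redo(true, end), end));
    // Online checkpoint, or any WAL written after the shutdown record: not clean.
    assert!(!was_clean_shutdown(persisted_redo(false, end), end));
    assert!(!was_clean_shutdown(persisted_redo(true, end), Lsn(0x16B_3000)));
}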

View File

@@ -345,6 +345,7 @@ impl AuxFileV2 {
                 AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash)
             }
             (2, 1) => AuxFileV2::Recognized("pg_replslot/", hash),
+            (3, 1) => AuxFileV2::Recognized("pg_stat/pgstat.stat", hash),
             (1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash),
             (0xff, 0xff) => AuxFileV2::Other(hash),
             _ => return None,

View File

@@ -39,6 +39,7 @@ fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key
 const AUX_DIR_PG_LOGICAL: u8 = 0x01;
 const AUX_DIR_PG_REPLSLOT: u8 = 0x02;
+const AUX_DIR_PG_STAT: u8 = 0x03;
 const AUX_DIR_PG_UNKNOWN: u8 = 0xFF;
 
 /// Encode the aux file into a fixed-size key.
@@ -53,6 +54,7 @@ const AUX_DIR_PG_UNKNOWN: u8 = 0xFF;
 /// * pg_logical/replorigin_checkpoint -> 0x0103
 /// * pg_logical/others -> 0x01FF
 /// * pg_replslot/ -> 0x0201
+/// * pg_stat/pgstat.stat -> 0x0301
 /// * others -> 0xFFFF
 ///
 /// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`.
@@ -75,6 +77,8 @@ pub fn encode_aux_file_key(path: &str) -> Key {
         aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes())
     } else if let Some(fname) = path.strip_prefix("pg_replslot/") {
         aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes())
+    } else if let Some(fname) = path.strip_prefix("pg_stat/pgstat.stat") {
+        aux_hash_to_metadata_key(AUX_DIR_PG_STAT, 0x01, fname.as_bytes())
     } else {
         if cfg!(debug_assertions) {
             warn!(
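
For illustration, the prefix selection boils down to something like the sketch below. It models only the mappings visible in this hunk; the real encode_aux_file_key also hashes the remaining file name into the rest of the fixed-size key and recognizes more pg_logical/ entries than shown here:

// Hypothetical, simplified stand-in for the directory-prefix part of
// encode_aux_file_key; the byte pairs match the table above.
fn aux_dir_prefix(path: &str) -> (u8, u8) {
    if path == "pg_logical/replorigin_checkpoint" {
        (0x01, 0x03)
    } else if path.starts_with("pg_logical/") {
        (0x01, 0xFF)
    } else if path.starts_with("pg_replslot/") {
        (0x02, 0x01)
    } else if path == "pg_stat/pgstat.stat" {
        (0x03, 0x01) // the new mapping added by this PR
    } else {
        (0xFF, 0xFF)
    }
}

fn main() {
    assert_eq!(aux_dir_prefix("pg_stat/pgstat.stat"), (0x03, 0x01));
    assert_eq!(aux_dir_prefix("pg_replslot/my_slot/state"), (0x02, 0x01));
    assert_eq!(aux_dir_prefix("some/other/file"), (0xFF, 0xFF));
}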

View File

@@ -255,6 +255,31 @@
     async fn send_tarball(mut self) -> Result<(), BasebackupError> {
         // TODO include checksum
 
+        // Construct the pg_control file from the persisted checkpoint and pg_control
+        // information. But we only add this to the tarball at the end, so that if the
+        // writing is interrupted half-way through, the resulting incomplete tarball will
+        // be missing the pg_control file, which prevents PostgreSQL from starting up on
+        // it. With proper error handling, you should never try to start up from an
+        // incomplete basebackup in the first place, of course, but this is a nice little
+        // extra safety measure.
+        let checkpoint_bytes = self
+            .timeline
+            .get_checkpoint(self.lsn, self.ctx)
+            .await
+            .context("failed to get checkpoint bytes")?;
+        let pg_control_bytes = self
+            .timeline
+            .get_control_file(self.lsn, self.ctx)
+            .await
+            .context("failed get control bytes")?;
+        let (pg_control_bytes, system_identifier, was_shutdown) =
+            postgres_ffi::generate_pg_control(
+                &pg_control_bytes,
+                &checkpoint_bytes,
+                self.lsn,
+                self.timeline.pg_version,
+            )?;
+
         let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
 
         let pgversion = self.timeline.pg_version;
@@ -392,6 +417,10 @@
                     // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all,
                     // but now we should handle (skip) it for backward compatibility.
                     continue;
+                } else if path == "pg_stat/pgstat.stat" && !was_shutdown {
+                    // Drop statistics in case of abnormal termination, i.e. if we're not
+                    // starting from the exact LSN of a shutdown checkpoint.
+                    continue;
                 }
                 let header = new_tar_header(&path, content.len() as u64)?;
                 self.ar
@@ -453,8 +482,9 @@
             )))
         });
 
-        // Generate pg_control and bootstrap WAL segment.
-        self.add_pgcontrol_file().await?;
+        // Last, add the pg_control file and bootstrap WAL segment.
+        self.add_pgcontrol_file(pg_control_bytes, system_identifier)
+            .await?;
 
         self.ar.finish().await.map_err(BasebackupError::Client)?;
         debug!("all tarred up!");
         Ok(())
@@ -657,7 +687,11 @@
     // Add generated pg_control file and bootstrap WAL segment.
     // Also send zenith.signal file with extra bootstrap data.
     //
-    async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> {
+    async fn add_pgcontrol_file(
+        &mut self,
+        pg_control_bytes: Bytes,
+        system_identifier: u64,
+    ) -> Result<(), BasebackupError> {
         // add zenith.signal file
         let mut zenith_signal = String::new();
         if self.prev_record_lsn == Lsn(0) {
@@ -680,24 +714,6 @@
             .await
             .map_err(BasebackupError::Client)?;
 
-        let checkpoint_bytes = self
-            .timeline
-            .get_checkpoint(self.lsn, self.ctx)
-            .await
-            .context("failed to get checkpoint bytes")?;
-        let pg_control_bytes = self
-            .timeline
-            .get_control_file(self.lsn, self.ctx)
-            .await
-            .context("failed get control bytes")?;
-        let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control(
-            &pg_control_bytes,
-            &checkpoint_bytes,
-            self.lsn,
-            self.timeline.pg_version,
-        )?;
-
         //send pg_control
         let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
         self.ar
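
The ordering guarantee described in the comment above can be summarized with a toy sketch. This is a plain in-memory stand-in, not the pageserver's async tar Builder; the struct and function names are hypothetical:

struct Tarball {
    entries: Vec<(String, Vec<u8>)>,
}

fn build_basebackup(
    files: Vec<(String, Vec<u8>)>,
    pg_control: Vec<u8>,
    was_shutdown: bool,
) -> Tarball {
    let mut tar = Tarball { entries: Vec::new() };
    for (path, content) in files {
        // Statistics are only trustworthy when starting exactly at a
        // shutdown checkpoint; otherwise skip the file.
        if path == "pg_stat/pgstat.stat" && !was_shutdown {
            continue;
        }
        tar.entries.push((path, content));
    }
    // pg_control goes in last: a tarball truncated anywhere before this
    // point has no pg_control file, and PostgreSQL won't start from it.
    tar.entries.push(("global/pg_control".to_string(), pg_control));
    tar
}

fn main() {
    let files = vec![("pg_stat/pgstat.stat".to_string(), vec![1, 2, 3])];
    let tar = build_basebackup(files, vec![0; 8192], false);
    // The stats file was dropped; pg_control is the final entry.
    assert_eq!(tar.entries.len(), 1);
    assert_eq!(tar.entries.last().unwrap().0, "global/pg_control");
}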

View File

@@ -52,8 +52,8 @@ use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
 use pageserver_api::config::MaxVectoredReadBytes;
-use pageserver_api::key::DBDIR_KEY;
 use pageserver_api::key::{Key, KEY_SIZE};
+use pageserver_api::key::{AUX_FILES_KEY, DBDIR_KEY};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
@@ -969,7 +969,11 @@ impl DeltaLayerInner {
             .as_slice()
             .iter()
             .filter_map(|(_, blob_meta)| {
-                if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY {
+                if blob_meta.key.is_rel_dir_key()
+                    || blob_meta.key == DBDIR_KEY
+                    || blob_meta.key == AUX_FILES_KEY
+                    || blob_meta.key.is_aux_file_key()
+                {
                     // The size of values for these keys is unbounded and can
                     // grow very large in pathological cases.
                     None

View File

@@ -49,8 +49,8 @@ use camino::{Utf8Path, Utf8PathBuf};
 use hex;
 use itertools::Itertools;
 use pageserver_api::config::MaxVectoredReadBytes;
-use pageserver_api::key::DBDIR_KEY;
 use pageserver_api::key::{Key, KEY_SIZE};
+use pageserver_api::key::{AUX_FILES_KEY, DBDIR_KEY};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use pageserver_api::value::Value;
@@ -591,7 +591,11 @@ impl ImageLayerInner {
             .as_slice()
             .iter()
             .filter_map(|(_, blob_meta)| {
-                if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY {
+                if blob_meta.key.is_rel_dir_key()
+                    || blob_meta.key == DBDIR_KEY
+                    || blob_meta.key == AUX_FILES_KEY
+                    || blob_meta.key.is_aux_file_key()
+                {
                     // The size of values for these keys is unbounded and can
                     // grow very large in pathological cases.
                     None

View File

@@ -4239,10 +4239,12 @@ impl Timeline {
                 // Normal path: we have written some data into the new image layer for this
                 // partition, so flush it to disk.
                 let (desc, path) = image_layer_writer.finish(ctx).await?;
+                let file_size = desc.file_size;
                 let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
                 info!(
-                    "created image layer for metadata {}",
-                    image_layer.local_path()
+                    "created image layer for metadata {} size {}",
+                    image_layer.local_path(),
+                    file_size,
                 );
                 Ok(ImageLayerCreationOutcome {
                     image: Some(image_layer),

View File

@@ -1187,6 +1187,50 @@ impl WalIngest {
         } else {
             cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid;
         }
 
+        // NB: We abuse the Checkpoint.redo field:
+        //
+        // - In PostgreSQL, the Checkpoint struct doesn't store the information
+        //   of whether this is an online checkpoint or a shutdown checkpoint. It's
+        //   stored in the XLOG info field of the WAL record, shutdown checkpoints
+        //   use record type XLOG_CHECKPOINT_SHUTDOWN and online checkpoints use
+        //   XLOG_CHECKPOINT_ONLINE. We don't store the original WAL record headers
+        //   in the pageserver, however.
+        //
+        // - In PostgreSQL, the Checkpoint.redo field stores the *start* of the
+        //   checkpoint record, if it's a shutdown checkpoint. But when we are
+        //   starting from a shutdown checkpoint, the basebackup LSN is the *end*
+        //   of the shutdown checkpoint WAL record. That makes it difficult to
+        //   correctly detect whether we're starting from a shutdown record or
+        //   not.
+        //
+        // To address both of those issues, we store 0 in the redo field if it's
+        // an online checkpoint record, and the record's *end* LSN if it's a
+        // shutdown checkpoint. We don't need the original redo pointer in neon,
+        // because we don't perform WAL replay at startup anyway, so we can get
+        // away with abusing the redo field like this.
+        //
+        // XXX: Ideally, we would persist the extra information in a more
+        // explicit format, rather than repurpose the fields of the Postgres
+        // struct like this. However, we already have persisted data like this,
+        // so we need to maintain backwards compatibility.
+        //
+        // NB: We didn't originally have this convention, so there are still old
+        // persisted records that didn't do this. Before, we didn't update the
+        // persisted redo field at all. That means that old records have a bogus
+        // redo pointer that points to some old value, from the checkpoint record
+        // that was originally imported from the data directory. If it was a
+        // project created in Neon, that means it points to the first checkpoint
+        // after initdb. That's OK for our purposes: all such old checkpoints are
+        // treated as old online checkpoints when the basebackup is created.
+        cp.redo = if info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN {
+            // Store the *end* LSN of the checkpoint record. Or to be precise,
+            // the start LSN of the *next* record, i.e. if the record ends
+            // exactly at page boundary, the redo LSN points to just after the
+            // page header on the next page.
+            lsn.into()
+        } else {
+            Lsn::INVALID.into()
+        };
+
         // Write a new checkpoint key-value pair on every checkpoint record, even
         // if nothing really changed. Not strictly required, but it seems nice to
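
A small worked example of the page-boundary case mentioned in the comment above. The constants are the stock PostgreSQL values, assumed here; the longer page header that appears at segment boundaries is ignored for brevity:

const WAL_PAGE_SIZE: u64 = 8192; // XLOG_BLCKSZ default
const SIZE_OF_SHORT_PAGE_HEADER: u64 = 24; // MAXALIGN'd XLogPageHeaderData

// If a record ends exactly on a page boundary, the next record starts just
// after the next page's header, so that is the LSN stored in 'redo'.
fn shutdown_checkpoint_redo(record_end: u64) -> u64 {
    if record_end % WAL_PAGE_SIZE == 0 {
        record_end + SIZE_OF_SHORT_PAGE_HEADER
    } else {
        record_end
    }
}

fn main() {
    assert_eq!(shutdown_checkpoint_redo(0x16B_2DA8), 0x16B_2DA8); // ends mid-page
    assert_eq!(shutdown_checkpoint_redo(0x16B_4000), 0x16B_4000 + 24); // page boundary
}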

View File

@@ -29,6 +29,8 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
             ".*failed to load metadata.*",
             ".*load failed.*load local timeline.*",
             ".*: layer load failed, assuming permanent failure:.*",
+            ".*failed to get checkpoint bytes.*",
+            ".*failed get control bytes.*",
         ]
     )
@@ -75,7 +77,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
     # (We don't check layer file contents on startup, when loading the timeline)
     #
     # This will change when we implement checksums for layers
-    with pytest.raises(Exception, match="get_values_reconstruct_data for layer ") as err:
+    with pytest.raises(Exception, match="failed to get checkpoint bytes") as err:
         pg1.start()
     log.info(
         f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}"

View File

@@ -0,0 +1,62 @@
+import pytest
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.pg_version import PgVersion
+
+
+#
+# Test that pgstat statistics are preserved across sessions
+#
+def test_pgstat(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    if env.pg_version == PgVersion.V14:
+        pytest.skip("PG14 doesn't support pgstat statistics persistence")
+
+    env.pageserver.allowed_errors.append(".*this timeline is using deprecated aux file policy V1.*")
+
+    n = 10000
+    endpoint = env.endpoints.create_start("main")
+
+    con = endpoint.connect()
+    cur = con.cursor()
+    cur.execute("create table t(x integer)")
+    cur.execute(f"insert into t values (generate_series(1,{n}))")
+    cur.execute("vacuum analyze t")
+    cur.execute("select sum(x) from t")
+    cur.execute("update t set x=x+1")
+    cur.execute("select pg_stat_force_next_flush()")
+
+    cur.execute(
+        "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables"
+    )
+    rec = cur.fetchall()[0]
+    assert rec == (2, n * 2, n, n, n * 2, n, 1, 1)
+
+    endpoint.stop()
+    endpoint.start()
+
+    con = endpoint.connect()
+    cur = con.cursor()
+    cur.execute(
+        "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables"
+    )
+    rec = cur.fetchall()[0]
+    assert rec == (2, n * 2, n, n, n * 2, n, 1, 1)
+
+    cur.execute("update t set x=x+1")
+
+    # stop without checkpoint
+    endpoint.stop(mode="immediate")
+    endpoint.start()
+
+    con = endpoint.connect()
+    cur = con.cursor()
+    cur.execute(
+        "select seq_scan,seq_tup_read,n_tup_ins,n_tup_upd,n_live_tup,n_dead_tup, vacuum_count,analyze_count from pg_stat_user_tables"
+    )
+    rec = cur.fetchall()[0]
+    # pgstat information should be discarded in case of abnormal termination
+    assert rec == (0, 0, 0, 0, 0, 0, 0, 0)

View File

@@ -790,6 +790,8 @@ def test_timeline_retain_lsn(
         [
             ".*initial size calculation failed: PageRead.MissingKey.could not find data for key.*",
             ".*page_service_conn_main.*could not find data for key.*",
+            ".*failed to get checkpoint bytes.*",
+            ".*failed get control bytes.*",
         ]
     )
     if offload_child is None or "no-restart" not in offload_child:

View File

@@ -1,15 +1,15 @@
 {
   "v17": [
     "17.2",
-    "a10d95be67265e0f10a422ba0457f5a7af01de71"
+    "7864df7b68fa7e0a7b0234b2e5dd2cdb7772aa08"
   ],
   "v16": [
     "16.6",
-    "dff6615a8e48a10bb17a03fa3c00635f1ace7a92"
+    "b6b298e88848f0dbb7d4a077fe70bcd4573ee7ca"
   ],
   "v15": [
     "15.10",
-    "972e325e62b455957adbbdd8580e31275bb5b8c9"
+    "b352942e9c08e5a5350f5c1662c118ce96ea11c5"
   ],
   "v14": [
     "14.15",