Use last record LSN as flush position reported by safekeepers to walproposer to prevent moving VCL backward on compute node restart

This commit is contained in:
Konstantin Knizhnik
2021-08-26 18:08:29 +03:00
parent a2e135b404
commit 815528e0ce
10 changed files with 267 additions and 213 deletions

View File

@@ -27,8 +27,8 @@ use crate::replication::HotStandbyFeedback;
use crate::send_wal::SendWalHandler;
use crate::timeline::{Timeline, TimelineTools};
use crate::WalAcceptorConf;
use postgres_ffi::xlog_utils::{TimeLineID, XLogFileName, MAX_SEND_SIZE, XLOG_BLCKSZ};
use pageserver::waldecoder::WalStreamDecoder;
use postgres_ffi::xlog_utils::{TimeLineID, XLogFileName, MAX_SEND_SIZE, XLOG_BLCKSZ};
pub const SK_MAGIC: u32 = 0xcafeceefu32;
pub const SK_FORMAT_VERSION: u32 = 1;
@@ -85,8 +85,6 @@ pub struct SafeKeeperInfo {
pub commit_lsn: Lsn,
/// locally flushed part of WAL
pub flush_lsn: Lsn,
/// locally flushed last record LSN
pub safe_lsn: Lsn,
/// minimal LSN which may be needed for recovery of some safekeeper: min(commit_lsn) for all safekeepers
pub restart_lsn: Lsn,
}
@@ -113,7 +111,6 @@ impl SafeKeeperInfo {
},
commit_lsn: Lsn(0), /* part of WAL acknowledged by quorum */
flush_lsn: Lsn(0), /* locally flushed part of WAL */
safe_lsn: Lsn(0), /* locally flushed last record LSN */
restart_lsn: Lsn(0), /* minimal LSN which may be needed for recovery of some safekeeper */
}
}
@@ -139,7 +136,6 @@ struct SafeKeeperRequest {
struct SafeKeeperResponse {
epoch: u64,
flush_lsn: Lsn,
safe_lsn: Lsn,
hs_feedback: HotStandbyFeedback,
}
@@ -280,7 +276,6 @@ impl<'pg> ReceiveWalConn<'pg> {
/* Calculate WAL end based on local data */
let (flush_lsn, timeline_id) = this_timeline.find_end_of_wal(&swh.conf.data_dir, true);
my_info.flush_lsn = flush_lsn;
my_info.safe_lsn = flush_lsn; // end_of_wal actually returns last record LSN
my_info.server.timeline = timeline_id;
info!(
@@ -328,11 +323,14 @@ impl<'pg> ReceiveWalConn<'pg> {
}
info!(
"Start accepting WAL for timeline {} tenant {} address {:?} flush_lsn={} safe_lsn={}",
server_info.timeline_id, server_info.tenant_id, self.peer_addr, my_info.flush_lsn, my_info.safe_lsn
"Start accepting WAL for timeline {} tenant {} address {:?} flush_lsn={}",
server_info.timeline_id,
server_info.tenant_id,
self.peer_addr,
my_info.flush_lsn,
);
let mut last_rec_lsn = Lsn(0);
let mut decoder = WalStreamDecoder::new(last_rec_lsn, false);
let mut last_rec_lsn = Lsn(0);
let mut decoder = WalStreamDecoder::new(last_rec_lsn, false);
// Main loop
loop {
@@ -355,47 +353,52 @@ impl<'pg> ReceiveWalConn<'pg> {
let end_pos = req.end_lsn;
let rec_size = end_pos.checked_sub(start_pos).unwrap().0 as usize;
assert!(rec_size <= MAX_SEND_SIZE);
if rec_size != 0 {
debug!(
"received for {} bytes between {} and {}",
rec_size, start_pos, end_pos,
);
if rec_size != 0 {
debug!(
"received for {} bytes between {} and {}",
rec_size, start_pos, end_pos,
);
/* Receive message body (from the rest of the message) */
let mut buf = Vec::with_capacity(rec_size);
msg_reader.read_to_end(&mut buf)?;
assert_eq!(buf.len(), rec_size);
/* Receive message body (from the rest of the message) */
let mut buf = Vec::with_capacity(rec_size);
msg_reader.read_to_end(&mut buf)?;
assert_eq!(buf.len(), rec_size);
if decoder.available() != start_pos {
info!("Restart decoder from {} to {}", decoder.available(), start_pos);
decoder = WalStreamDecoder::new(start_pos, false);
}
decoder.feed_bytes(&buf);
//while let Ok(Some((lsn,_rec))) = decoder.poll_decode() {
loop {
match decoder.poll_decode() {
Err(e) => info!("Decode error {}", e),
Ok(None) => info!("Decode end"),
Ok(Some((lsn,_rec))) => {
last_rec_lsn = lsn;
continue;
}
}
break;
}
last_rec_lsn = Lsn((last_rec_lsn.0 + 7) & !7); // align record start position on 8
info!("Receive WAL {}..{} last_rec_lsn={}", start_pos, end_pos, last_rec_lsn);
if decoder.available() != start_pos {
info!(
"Restart decoder from {} to {}",
decoder.available(),
start_pos
);
decoder = WalStreamDecoder::new(start_pos, false);
}
decoder.feed_bytes(&buf);
loop {
match decoder.poll_decode() {
Err(e) => info!("Decode error {}", e),
Ok(None) => {},
Ok(Some((lsn, _rec))) => {
last_rec_lsn = lsn;
continue;
}
}
break;
}
info!(
"Receive WAL {}..{} last_rec_lsn={}",
start_pos, end_pos, last_rec_lsn
);
/* Save message in file */
Self::write_wal_file(
swh,
start_pos,
timeline_id,
this_timeline.get(),
wal_seg_size,
&buf,
)?;
}
/* Save message in file */
Self::write_wal_file(
swh,
start_pos,
timeline_id,
this_timeline.get(),
wal_seg_size,
&buf,
)?;
}
my_info.restart_lsn = req.restart_lsn;
my_info.commit_lsn = req.commit_lsn;
@@ -410,11 +413,8 @@ impl<'pg> ReceiveWalConn<'pg> {
my_info.epoch = prop.epoch; /* bump epoch */
sync_control_file = true;
}
if end_pos > my_info.flush_lsn {
my_info.flush_lsn = end_pos;
}
if last_rec_lsn > my_info.safe_lsn {
my_info.safe_lsn = last_rec_lsn;
if last_rec_lsn > my_info.flush_lsn {
my_info.flush_lsn = last_rec_lsn;
}
/*
* Update restart LSN in control file.
@@ -434,7 +434,6 @@ impl<'pg> ReceiveWalConn<'pg> {
let resp = SafeKeeperResponse {
epoch: my_info.epoch,
flush_lsn: my_info.flush_lsn,
safe_lsn: my_info.safe_lsn,
hs_feedback: this_timeline.get().get_hs_feedback(),
};
self.write_msg(&resp)?;
@@ -443,10 +442,15 @@ impl<'pg> ReceiveWalConn<'pg> {
* Ping wal sender that new data is available.
* FlushLSN (end_pos) can be smaller than commitLSN in case we are at catching-up safekeeper.
*/
info!("Notify WAL senders min({}, {})={}", req.commit_lsn, my_info.safe_lsn, min(req.commit_lsn, my_info.safe_lsn));
trace!(
"Notify WAL senders min({}, {})={}",
req.commit_lsn,
my_info.flush_lsn,
min(req.commit_lsn, my_info.flush_lsn)
);
this_timeline
.get()
.notify_wal_senders(min(req.commit_lsn, my_info.safe_lsn));
.notify_wal_senders(min(req.commit_lsn, my_info.flush_lsn));
}
Ok(())

View File

@@ -217,7 +217,12 @@ impl ReplicationConn {
data: &file_buf,
}))?;
info!("Sent WAL to page server {}..{}, end_pos={}", start_pos, start_pos + send_size as u64, end_pos);
debug!(
"Sent WAL to page server {}..{}, end_pos={}",
start_pos,
start_pos + send_size as u64,
end_pos
);
start_pos += send_size as u64;