diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py
index 57cf379a96..120e6c769b 100644
--- a/test_runner/batch_others/test_wal_acceptor.py
+++ b/test_runner/batch_others/test_wal_acceptor.py
@@ -392,6 +392,7 @@ def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, pg_bin: PgBin):
"lm_prefix": "prefix",
"lm_message": "message",
"set_commit_lsn": True,
+ "send_proposer_elected": True,
"term": 2,
"begin_lsn": begin_lsn,
"epoch_start_lsn": epoch_start_lsn,
diff --git a/vendor/postgres b/vendor/postgres
index 08878b19d3..a70d892bb9 160000
--- a/vendor/postgres
+++ b/vendor/postgres
@@ -1 +1 @@
-Subproject commit 08878b19d3cae5a1bd765bf7396187b6b806c6ac
+Subproject commit a70d892bb93e0a8a6cda8a8fccd4e4fbf408ea79
diff --git a/walkeeper/src/http/routes.rs b/walkeeper/src/http/routes.rs
index 159ec17a9b..dc1905a5a7 100644
--- a/walkeeper/src/http/routes.rs
+++ b/walkeeper/src/http/routes.rs
@@ -7,7 +7,8 @@ use std::fmt::Display;
use std::sync::Arc;
use zenith_utils::lsn::Lsn;
-use crate::safekeeper::AcceptorState;
+use crate::safekeeper::Term;
+use crate::safekeeper::TermHistory;
use crate::timeline::CreateControlFile;
use crate::timeline::GlobalTimelines;
use crate::SafeKeeperConf;
@@ -29,6 +30,7 @@ fn get_conf(request: &Request
) -> &SafeKeeperConf {
.as_ref()
}
+/// Serialize through Display trait.
fn display_serialize(z: &F, s: S) -> Result
where
S: Serializer,
@@ -37,6 +39,14 @@ where
s.serialize_str(&format!("{}", z))
}
+/// Augment AcceptorState with epoch for convenience
+#[derive(Debug, Serialize)]
+struct AcceptorStateStatus {
+ term: Term,
+ epoch: Term,
+ term_history: TermHistory,
+}
+
/// Info about timeline on safekeeper ready for reporting.
#[derive(Debug, Serialize)]
struct TimelineStatus {
@@ -44,7 +54,7 @@ struct TimelineStatus {
tenant_id: ZTenantId,
#[serde(serialize_with = "display_serialize")]
timeline_id: ZTimelineId,
- acceptor_state: AcceptorState,
+ acceptor_state: AcceptorStateStatus,
#[serde(serialize_with = "display_serialize")]
commit_lsn: Lsn,
#[serde(serialize_with = "display_serialize")]
@@ -68,10 +78,16 @@ async fn timeline_status_handler(request: Request) -> Result Result<()> {
let response = swh.timeline.get().process_msg(&greeting_request)?;
match response {
- AcceptorProposerMessage::Greeting(_) => Ok(()),
+ Some(AcceptorProposerMessage::Greeting(_)) => Ok(()),
_ => anyhow::bail!("not GreetingResponse"),
}
}
+fn send_proposer_elected(swh: &mut SendWalHandler, term: Term, lsn: Lsn) -> Result<()> {
+ // Append the new term to the existing history, first dropping any switches at or after lsn.
+ let history = swh.timeline.get().get_info().acceptor_state.term_history;
+ let history = history.up_to(lsn.checked_sub(1u64).unwrap());
+ let mut history_entries = history.0;
+ history_entries.push(TermSwitchEntry { term, lsn });
+ let history = TermHistory(history_entries);
+
+ let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected {
+ term,
+ start_streaming_at: lsn,
+ term_history: history,
+ });
+
+ swh.timeline.get().process_msg(&proposer_elected_request)?;
+ Ok(())
+}
+
#[derive(Serialize, Deserialize)]
struct InsertedWAL {
begin_lsn: Lsn,
@@ -150,7 +176,7 @@ fn append_logical_message(
let response = swh.timeline.get().process_msg(&append_request)?;
let append_response = match response {
- AcceptorProposerMessage::AppendResponse(resp) => resp,
+ Some(AcceptorProposerMessage::AppendResponse(resp)) => resp,
_ => anyhow::bail!("not AppendResponse"),
};
diff --git a/walkeeper/src/receive_wal.rs b/walkeeper/src/receive_wal.rs
index 9498980802..a653c41922 100644
--- a/walkeeper/src/receive_wal.rs
+++ b/walkeeper/src/receive_wal.rs
@@ -4,6 +4,7 @@
use anyhow::{bail, Context, Result};
use bytes::Bytes;
+use bytes::BytesMut;
use log::*;
use postgres::{Client, Config, NoTls};
@@ -98,7 +99,7 @@ impl<'pg> ReceiveWalConn<'pg> {
// Send message to the postgres
fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> Result<()> {
- let mut buf = Vec::new();
+ let mut buf = BytesMut::with_capacity(128);
msg.serialize(&mut buf)?;
self.pg_backend.write_message(&BeMessage::CopyData(&buf))?;
Ok(())
@@ -147,7 +148,9 @@ impl<'pg> ReceiveWalConn<'pg> {
.get()
.process_msg(&msg)
.with_context(|| "failed to process ProposerAcceptorMessage")?;
- self.write_msg(&reply)?;
+ if let Some(reply) = reply {
+ self.write_msg(&reply)?;
+ }
msg = self.read_msg()?;
}
}
diff --git a/walkeeper/src/safekeeper.rs b/walkeeper/src/safekeeper.rs
index 0b25241165..2a15bb3fc6 100644
--- a/walkeeper/src/safekeeper.rs
+++ b/walkeeper/src/safekeeper.rs
@@ -4,16 +4,16 @@ use anyhow::Context;
use anyhow::{anyhow, bail, Result};
use byteorder::LittleEndian;
use byteorder::ReadBytesExt;
-use byteorder::WriteBytesExt;
use bytes::Buf;
+use bytes::BufMut;
use bytes::Bytes;
+use bytes::BytesMut;
use log::*;
use pageserver::waldecoder::WalStreamDecoder;
use postgres_ffi::xlog_utils::TimeLineID;
use serde::{Deserialize, Serialize};
-use std::cmp::max;
use std::cmp::min;
-use std::io;
+use std::fmt;
use std::io::Read;
use lazy_static::lazy_static;
@@ -37,6 +37,70 @@ const UNKNOWN_SERVER_VERSION: u32 = 0;
/// Consensus logical timestamp.
pub type Term = u64;
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+pub struct TermSwitchEntry {
+ pub term: Term,
+ pub lsn: Lsn,
+}
+#[derive(Clone, Serialize, Deserialize)]
+pub struct TermHistory(pub Vec);
+
+impl TermHistory {
+ pub fn empty() -> TermHistory {
+ TermHistory(Vec::new())
+ }
+
+ // Parse TermHistory: a little-endian u32 n_entries followed by n_entries (u64 term, u64 lsn) pairs.
+ pub fn from_bytes(mut bytes: Bytes) -> Result {
+ if bytes.remaining() < 4 {
+ bail!("TermHistory misses len");
+ }
+ let n_entries = bytes.get_u32_le();
+ let mut res = Vec::with_capacity(n_entries as usize);
+ for _ in 0..n_entries {
+ if bytes.remaining() < 16 {
+ bail!("TermHistory is incomplete");
+ }
+ res.push(TermSwitchEntry {
+ term: bytes.get_u64_le(),
+ lsn: bytes.get_u64_le().into(),
+ })
+ }
+ Ok(TermHistory(res))
+ }
+
+ /// Return a copy of self cut at the first switch whose LSN is strictly
+ /// greater than up_to; all switches before it are kept.
+ pub fn up_to(&self, up_to: Lsn) -> TermHistory {
+ let mut res = Vec::with_capacity(self.0.len());
+ for e in &self.0 {
+ if e.lsn > up_to {
+ break;
+ }
+ res.push(*e);
+ }
+ TermHistory(res)
+ }
+}
+
+/// Debug prints at most the 20 latest entries, newest first; a leading "... " marks omitted ones.
+impl fmt::Debug for TermHistory {
+ fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
+ let n_printed = 20;
+ write!(
+ fmt,
+ "{}{:?}",
+ if self.0.len() > n_printed { "... " } else { "" },
+ self.0
+ .iter()
+ .rev()
+ .take(n_printed)
+ .map(|&e| (e.term, e.lsn)) // omit TermSwitchEntry
+ .collect::>()
+ )
+ }
+}
+
/// Unique id of proposer. Not needed for correctness, used for monitoring.
type PgUuid = [u8; 16];
@@ -45,8 +109,21 @@ type PgUuid = [u8; 16];
pub struct AcceptorState {
/// acceptor's last term it voted for (advanced in 1 phase)
pub term: Term,
- /// acceptor's epoch (advanced, i.e. bumped to 'term' when VCL is reached).
- pub epoch: Term,
+ /// History of term switches for safekeeper's WAL.
+ /// Actually it often goes *beyond* WAL contents as we adopt term history
+ /// from the proposer before recovery.
+ pub term_history: TermHistory,
+}
+
+impl AcceptorState {
+ /// Acceptor's epoch: the term of the last history entry not beyond flush_lsn, or 0 if there is none.
+ pub fn get_epoch(&self, flush_lsn: Lsn) -> Term {
+ let th = self.term_history.up_to(flush_lsn);
+ match th.0.last() {
+ Some(e) => e.term,
+ None => 0,
+ }
+ }
}
/// Information about Postgres. Safekeeper gets it once and then verifies
@@ -91,7 +168,10 @@ impl SafeKeeperState {
SafeKeeperState {
magic: SK_MAGIC,
format_version: SK_FORMAT_VERSION,
- acceptor_state: AcceptorState { term: 0, epoch: 0 },
+ acceptor_state: AcceptorState {
+ term: 0,
+ term_history: TermHistory::empty(),
+ },
server: ServerInfo {
pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */
system_id: 0, /* Postgres system identifier */
@@ -147,16 +227,28 @@ pub struct VoteRequest {
/// Vote itself, sent from safekeeper to proposer
#[derive(Debug, Serialize)]
pub struct VoteResponse {
- term: Term, // not really needed, just a sanity check
+ term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
vote_given: u64, // fixme u64 due to padding
- /// Safekeeper's log position, to let proposer choose the most advanced one
- epoch: Term,
+ // Safekeeper flush_lsn (end of WAL) + history of term switches allow
+ // proposer to choose the most advanced one.
flush_lsn: Lsn,
truncate_lsn: Lsn,
+ term_history: TermHistory,
+}
+
+/*
+ * Proposer -> Acceptor message announcing proposer is elected and communicating
+ * term history to it.
+ */
+#[derive(Debug)]
+pub struct ProposerElected {
+ pub term: Term,
+ pub start_streaming_at: Lsn,
+ pub term_history: TermHistory,
}
/// Request with WAL message sent from proposer to safekeeper. Along the way it
-/// announces 1) successful election (with epoch_start_lsn); 2) commit_lsn.
+/// communicates commit_lsn.
#[derive(Debug)]
pub struct AppendRequest {
pub h: AppendRequestHeader,
@@ -164,6 +256,7 @@ pub struct AppendRequest {
}
#[derive(Debug, Clone, Deserialize)]
pub struct AppendRequestHeader {
+ // Term sent by the proposer (NOTE(review): presumably checked against the safekeeper's current term; if that is higher, the compute is out of date — confirm).
pub term: Term,
// LSN since the proposer appends WAL; determines epoch switch point.
pub epoch_start_lsn: Lsn,
@@ -185,7 +278,6 @@ pub struct AppendResponse {
// Current term of the safekeeper; if it is higher than proposer's, the
// compute is out of date.
pub term: Term,
- pub epoch: Term,
// NOTE: this is physical end of wal on safekeeper; currently it doesn't
// make much sense without taking epoch into account, as history can be
// diverged.
@@ -198,19 +290,32 @@ pub struct AppendResponse {
pub hs_feedback: HotStandbyFeedback,
}
+impl AppendResponse {
+ fn term_only(term: Term) -> AppendResponse {
+ AppendResponse {
+ term,
+ flush_lsn: Lsn(0),
+ commit_lsn: Lsn(0),
+ disk_consistent_lsn: Lsn(0),
+ hs_feedback: HotStandbyFeedback::empty(),
+ }
+ }
+}
+
/// Proposer -> Acceptor messages
#[derive(Debug)]
pub enum ProposerAcceptorMessage {
Greeting(ProposerGreeting),
VoteRequest(VoteRequest),
+ Elected(ProposerElected),
AppendRequest(AppendRequest),
}
impl ProposerAcceptorMessage {
/// Parse proposer message.
- pub fn parse(msg: Bytes) -> Result {
+ pub fn parse(msg_bytes: Bytes) -> Result {
// xxx using Reader is inefficient but easy to work with bincode
- let mut stream = msg.reader();
+ let mut stream = msg_bytes.reader();
// u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is
let tag = stream.read_u64::()? as u8 as char;
match tag {
@@ -222,6 +327,21 @@ impl ProposerAcceptorMessage {
let msg = VoteRequest::des_from(&mut stream)?;
Ok(ProposerAcceptorMessage::VoteRequest(msg))
}
+ 'e' => {
+ let mut msg_bytes = stream.into_inner();
+ if msg_bytes.remaining() < 16 {
+ bail!("ProposerElected message is not complete");
+ }
+ let term = msg_bytes.get_u64_le();
+ let start_streaming_at = msg_bytes.get_u64_le().into();
+ let term_history = TermHistory::from_bytes(msg_bytes)?;
+ let msg = ProposerElected {
+ term,
+ start_streaming_at,
+ term_history,
+ };
+ Ok(ProposerAcceptorMessage::Elected(msg))
+ }
'a' => {
// read header followed by wal data
let hdr = AppendRequestHeader::des_from(&mut stream)?;
@@ -259,19 +379,33 @@ pub enum AcceptorProposerMessage {
impl AcceptorProposerMessage {
/// Serialize acceptor -> proposer message.
- pub fn serialize(&self, stream: &mut impl io::Write) -> Result<()> {
+ pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> {
match self {
AcceptorProposerMessage::Greeting(msg) => {
- stream.write_u64::('g' as u64)?;
- msg.ser_into(stream)?;
+ buf.put_u64_le('g' as u64);
+ buf.put_u64_le(msg.term);
}
AcceptorProposerMessage::VoteResponse(msg) => {
- stream.write_u64::('v' as u64)?;
- msg.ser_into(stream)?;
+ buf.put_u64_le('v' as u64);
+ buf.put_u64_le(msg.term);
+ buf.put_u64_le(msg.vote_given);
+ buf.put_u64_le(msg.flush_lsn.into());
+ buf.put_u64_le(msg.truncate_lsn.into());
+ buf.put_u32_le(msg.term_history.0.len() as u32);
+ for e in &msg.term_history.0 {
+ buf.put_u64_le(e.term);
+ buf.put_u64_le(e.lsn.into());
+ }
}
AcceptorProposerMessage::AppendResponse(msg) => {
- stream.write_u64::('a' as u64)?;
- msg.ser_into(stream)?;
+ buf.put_u64_le('a' as u64);
+ buf.put_u64_le(msg.term);
+ buf.put_u64_le(msg.flush_lsn.into());
+ buf.put_u64_le(msg.commit_lsn.into());
+ buf.put_u64_le(msg.disk_consistent_lsn.into());
+ buf.put_i64_le(msg.hs_feedback.ts);
+ buf.put_u64_le(msg.hs_feedback.xmin);
+ buf.put_u64_le(msg.hs_feedback.catalog_xmin);
}
}
@@ -284,6 +418,8 @@ pub trait Storage {
fn persist(&mut self, s: &SafeKeeperState, sync: bool) -> Result<()>;
/// Write piece of wal in buf to disk and sync it.
fn write_wal(&mut self, server: &ServerInfo, startpos: Lsn, buf: &[u8]) -> Result<()>;
+ /// Truncate WAL at the specified LSN (endpos).
+ fn truncate_wal(&mut self, s: &ServerInfo, endpos: Lsn) -> Result<()>;
}
lazy_static! {
@@ -357,8 +493,7 @@ pub struct SafeKeeper {
pub commit_lsn: Lsn,
pub truncate_lsn: Lsn,
pub storage: ST,
- pub s: SafeKeeperState, // persistent part
- pub elected_proposer_term: Term, // for monitoring/debugging
+ pub s: SafeKeeperState, // persistent part
decoder: WalStreamDecoder,
}
@@ -375,27 +510,40 @@ where
truncate_lsn: state.truncate_lsn,
storage,
s: state,
- elected_proposer_term: 0,
decoder: WalStreamDecoder::new(Lsn(0)),
}
}
+ /// Get history of term switches for the available WAL
+ fn get_term_history(&self) -> TermHistory {
+ self.s.acceptor_state.term_history.up_to(self.flush_lsn)
+ }
+
+ #[cfg(test)]
+ fn get_epoch(&self) -> Term {
+ self.s.acceptor_state.get_epoch(self.flush_lsn)
+ }
+
/// Process message from proposer and possibly form reply. Concurrent
/// callers must exclude each other.
pub fn process_msg(
&mut self,
msg: &ProposerAcceptorMessage,
- ) -> Result {
+ ) -> Result