mirror of https://github.com/neondatabase/neon.git, synced 2026-01-07 13:32:57 +00:00
Instead of playing games during serialize/deserialize, just treat NodeId::term as an 8-byte array instead of a u64.
1110 lines · 38 KiB · Rust

///
/// WAL service listens for client connections and
/// receives WAL from wal_proposer and sends it to WAL receivers
///
use byteorder::{BigEndian, ByteOrder};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use fs2::FileExt;
use lazy_static::lazy_static;
use log::*;
use postgres::{Client, NoTls};
use regex::Regex;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::cmp::{max, min};
use std::collections::HashMap;
use std::fs::{self, File, OpenOptions};
use std::io::{self, prelude::*, SeekFrom};
use std::mem;
use std::net::{TcpListener, TcpStream};
use std::str;
use std::sync::{Arc, Condvar, Mutex};
use std::thread;
use zenith_utils::bin_ser::LeSer;

use crate::pq_protocol::*;
use crate::WalAcceptorConf;
use pageserver::ZTimelineId;
use postgres_ffi::xlog_utils::*;

type FullTransactionId = u64;

const SK_MAGIC: u32 = 0xCafeCeefu32;
const SK_FORMAT_VERSION: u32 = 1;
const SK_PROTOCOL_VERSION: u32 = 1;
const UNKNOWN_SERVER_VERSION: u32 = 0;
const END_REPLICATION_MARKER: u64 = u64::MAX;
const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
const XLOG_HDR_SIZE: usize = 1 + 8 * 3; /* 'w' + startPos + walEnd + timestamp */
const LIBPQ_HDR_SIZE: usize = 5; /* 1 byte with message type + 4 bytes length */
const LIBPQ_MSG_SIZE_OFFS: usize = 1;
const CONTROL_FILE_NAME: &str = "safekeeper.control";
const END_OF_STREAM: XLogRecPtr = 0;

/// Read some bytes from a type that implements [`Read`] into a [`BytesMut`]
///
/// Will return the number of bytes read, just like `Read::read()` would.
///
fn read_into(r: &mut impl Read, buf: &mut BytesMut) -> io::Result<usize> {
    // This is a workaround, because BytesMut and std::io don't play
    // well together.
    //
    // I think this code needs to go away, and I'm confident that
    // that's possible, if we are willing to refactor this code to
    // use std::io::BufReader instead of doing buffer management
    // ourselves.
    //
    // SAFETY: we already have exclusive access to self.inbuf, so
    // there are no concurrency problems; the only risk would be
    // accidentally exposing uninitialized parts of the buffer.
    //
    // We write into the buffer just past the known-initialized part,
    // then manually increment its length by the exact number of
    // bytes we read. So no uninitialized memory should be exposed.

    let start = buf.len();
    let end = buf.capacity();

    let num_bytes = unsafe {
        let fill_here = buf.get_unchecked_mut(start..end);
        let num_bytes_read = r.read(fill_here)?;
        buf.set_len(start + num_bytes_read);
        num_bytes_read
    };
    Ok(num_bytes)
}

/// Unique node identifier used by Paxos
#[repr(C)]
#[derive(Debug, Clone, Copy, Ord, PartialOrd, PartialEq, Eq, Serialize, Deserialize)]
struct NodeId {
    uuid: u128,
    term: [u8; 8],
}

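// NB: as the commit message above notes, `term` is kept as a raw 8-byte array rather
// than a u64, so NodeId can be serialized and deserialized without special handling of
// that field; where the term has to be shown (e.g. in the rejection log message in
// receive_wal() below) it is rendered with hex::encode().
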
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
struct ServerInfo {
    /// proxy-safekeeper protocol version
    protocol_version: u32,
    /// Postgres server version
    pg_version: u32,
    node_id: NodeId,
    system_id: SystemId,
    /// Zenith timelineid
    timeline_id: ZTimelineId,
    wal_end: XLogRecPtr,
    timeline: TimeLineID,
    wal_seg_size: u32,
}

/// Vote request sent from proxy to safekeepers
#[repr(C)]
#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct RequestVote {
    node_id: NodeId,
    /// volume commit LSN
    vcl: XLogRecPtr,
    /// new epoch when safekeeper reaches vcl
    epoch: u64,
}

/// Information about storage node
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
struct SafeKeeperInfo {
    /// magic for verifying content of the control file
    magic: u32,
    /// safekeeper format version
    format_version: u32,
    /// safekeeper's epoch
    epoch: u64,
    /// information about server
    server: ServerInfo,
    /// part of WAL acknowledged by quorum
    commit_lsn: XLogRecPtr,
    /// locally flushed part of WAL
    flush_lsn: XLogRecPtr,
    /// minimal LSN which may be needed for recovery of some safekeeper: min(commit_lsn) for all safekeepers
    restart_lsn: XLogRecPtr,
}

/// Hot standby feedback received from replica
#[repr(C)]
#[derive(Debug, Copy, Clone, PartialEq, Serialize, Deserialize)]
struct HotStandbyFeedback {
    ts: TimestampTz,
    xmin: FullTransactionId,
    catalog_xmin: FullTransactionId,
}

/// Request with WAL message sent from proxy to safekeeper.
#[repr(C)]
#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct SafeKeeperRequest {
    /// Sender's node identifier (looks like we do not need it for TCP streaming connection)
    sender_id: NodeId,
    /// start position of message in WAL
    begin_lsn: XLogRecPtr,
    /// end position of message in WAL
    end_lsn: XLogRecPtr,
    /// restart LSN position (minimal LSN which may be needed by proxy to perform recovery)
    restart_lsn: XLogRecPtr,
    /// LSN committed by quorum of safekeepers
    commit_lsn: XLogRecPtr,
}

/// Report safekeeper state to proxy
#[repr(C)]
#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct SafeKeeperResponse {
    epoch: u64,
    flush_lsn: XLogRecPtr,
    hs_feedback: HotStandbyFeedback,
}

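// The structs above make up the wal_proposer <-> wal_acceptor protocol, as driven by
// receive_wal() below:
//   1. the proposer sends ServerInfo;
//   2. the safekeeper replies with its SafeKeeperInfo (including the flush_lsn it
//      found in its local WAL);
//   3. the proposer sends RequestVote; the safekeeper either echoes the proposed
//      NodeId to acknowledge it, or sends its own NodeId and drops the connection
//      if the vote is rejected;
//   4. the proposer then streams WAL as SafeKeeperRequest headers followed by the
//      WAL payload, and the safekeeper answers each one with a SafeKeeperResponse.
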
/// Shared state associated with database instance (tenant)
#[derive(Debug)]
struct SharedState {
    /// quorum commit LSN
    commit_lsn: XLogRecPtr,
    /// information about this safekeeper
    info: SafeKeeperInfo,
    /// opened control file handle (needed to hold the exclusive file lock)
    control_file: Option<File>,
    /// combined hot standby feedback from all replicas
    hs_feedback: HotStandbyFeedback,
}

/// Database instance (tenant)
#[derive(Debug)]
pub struct Timeline {
    timelineid: ZTimelineId,
    mutex: Mutex<SharedState>,
    /// condition variable used to notify wal senders
    cond: Condvar,
}

/// Private data
#[derive(Debug)]
struct Connection {
    timeline: Option<Arc<Timeline>>,
    /// Postgres connection
    stream: TcpStream,
    /// input buffer
    inbuf: BytesMut,
    /// output buffer
    outbuf: BytesMut,
    /// startup packet processed
    init_done: bool,
    /// assigned application name
    appname: Option<String>,
    /// wal acceptor configuration
    conf: WalAcceptorConf,
}

/// Serde adapter for BytesMut
///
// It's not clear whether this will be needed in the long term.
// If so, it should probably move to `zenith_utils::bin_ser`
trait NewSerializer: Serialize + DeserializeOwned {
    fn pack(&self, buf: &mut BytesMut) {
        let mut buf_w = buf.writer();
        self.ser_into(&mut buf_w).unwrap();
    }

    fn unpack(buf: &mut BytesMut) -> Self {
        let buf_r = buf.reader();
        Self::des_from(buf_r).unwrap()
    }
}

impl<T> NewSerializer for T where T: Serialize + DeserializeOwned {}

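// Thanks to the blanket impl above, every protocol struct in this file gets
// `pack`/`unpack` for free; the actual encoding is whatever
// `zenith_utils::bin_ser::LeSer` (`ser_into`/`des_from`) produces, i.e. the
// little-endian, field-by-field binary layout. Illustrative use (not from the
// original file):
//
//     let mut buf = BytesMut::new();
//     my_info.pack(&mut buf);                       // serialize into the buffer
//     let copy = SafeKeeperInfo::unpack(&mut buf);  // read it back
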
/// Report and return IO error
macro_rules! io_error {
    ($($arg:tt)*) => (error!($($arg)*); return Err(io::Error::new(io::ErrorKind::Other,format!($($arg)*))))
}

/// Safe hex string parser that returns a proper Result instead of panicking
fn parse_hex_str(s: &str) -> Result<u64> {
    if let Ok(val) = u32::from_str_radix(s, 16) {
        Ok(val as u64)
    } else {
        io_error!("Invalid hex number {}", s);
    }
}

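// A minimal unit-test sketch for parse_hex_str (an illustrative addition, not part of
// the original file), assuming the usual #[cfg(test)] convention:
#[cfg(test)]
mod parse_hex_str_tests {
    use super::*;

    #[test]
    fn parses_lsn_halves() {
        // "16B3748" is a plausible low half of an LSN such as 0/16B3748.
        assert_eq!(parse_hex_str("16B3748").unwrap(), 0x16B3748);
        // Non-hex input becomes an error instead of a panic.
        assert!(parse_hex_str("zzz").is_err());
    }
}
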
impl SafeKeeperInfo {
    fn new() -> SafeKeeperInfo {
        SafeKeeperInfo {
            magic: SK_MAGIC,
            format_version: SK_FORMAT_VERSION,
            epoch: 0,
            server: ServerInfo {
                protocol_version: SK_PROTOCOL_VERSION, /* proxy-safekeeper protocol version */
                pg_version: UNKNOWN_SERVER_VERSION,    /* Postgres server version */
                node_id: NodeId {
                    term: [0; 8],
                    uuid: 0,
                },
                system_id: 0, /* Postgres system identifier */
                timeline_id: ZTimelineId::from([0u8; 16]),
                wal_end: 0,
                timeline: 0,
                wal_seg_size: 0,
            },
            commit_lsn: 0,  /* part of WAL acknowledged by quorum */
            flush_lsn: 0,   /* locally flushed part of WAL */
            restart_lsn: 0, /* minimal LSN which may be needed for recovery of some safekeeper */
        }
    }
}

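// The feedback body is parsed as three big-endian 64-bit values, in the order the
// replica sends them: timestamp, xmin and catalog_xmin (as full transaction ids).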
impl HotStandbyFeedback {
    fn parse(body: &Bytes) -> HotStandbyFeedback {
        HotStandbyFeedback {
            ts: BigEndian::read_u64(&body[0..8]),
            xmin: BigEndian::read_u64(&body[8..16]),
            catalog_xmin: BigEndian::read_u64(&body[16..24]),
        }
    }
}

lazy_static! {
    pub static ref TIMELINES: Mutex<HashMap<ZTimelineId, Arc<Timeline>>> =
        Mutex::new(HashMap::new());
}

pub fn thread_main(conf: WalAcceptorConf) {
    info!("Starting wal acceptor on {}", conf.listen_addr);
    main_loop(&conf).unwrap();
}

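// Accept loop: each incoming connection gets its own thread running Connection::run();
// a failure to accept or serve one connection is logged and does not stop the listener.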
fn main_loop(conf: &WalAcceptorConf) -> Result<()> {
    let listener = TcpListener::bind(conf.listen_addr)?;
    loop {
        match listener.accept() {
            Ok((socket, peer_addr)) => {
                debug!("accepted connection from {}", peer_addr);
                socket.set_nodelay(true)?;
                let mut conn = Connection::new(socket, &conf);
                thread::spawn(move || {
                    if let Err(err) = conn.run() {
                        error!("error: {}", err);
                    }
                });
            }
            Err(e) => error!("Failed to accept connection: {}", e),
        }
    }
}

impl Timeline {
    pub fn new(timelineid: ZTimelineId) -> Timeline {
        let shared_state = SharedState {
            commit_lsn: 0,
            info: SafeKeeperInfo::new(),
            control_file: None,
            hs_feedback: HotStandbyFeedback {
                ts: 0,
                xmin: u64::MAX,
                catalog_xmin: u64::MAX,
            },
        };
        Timeline {
            timelineid,
            mutex: Mutex::new(shared_state),
            cond: Condvar::new(),
        }
    }

    // Notify caught-up WAL senders about new WAL data received
    fn notify_wal_senders(&self, commit_lsn: XLogRecPtr) {
        let mut shared_state = self.mutex.lock().unwrap();
        if shared_state.commit_lsn < commit_lsn {
            shared_state.commit_lsn = commit_lsn;
            self.cond.notify_all();
        }
    }

    fn _stop_wal_senders(&self) {
        self.notify_wal_senders(END_REPLICATION_MARKER);
    }

    fn get_info(&self) -> SafeKeeperInfo {
        return self.mutex.lock().unwrap().info;
    }

    fn set_info(&self, info: &SafeKeeperInfo) {
        self.mutex.lock().unwrap().info = *info;
    }

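    // Feedback from different replicas is combined conservatively: keep the minimum
    // xmin/catalog_xmin (the oldest horizon any replica still needs) and the most
    // recent timestamp.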
    // Accumulate hot standby feedbacks from replicas
    fn add_hs_feedback(&self, feedback: HotStandbyFeedback) {
        let mut shared_state = self.mutex.lock().unwrap();
        shared_state.hs_feedback.xmin = min(shared_state.hs_feedback.xmin, feedback.xmin);
        shared_state.hs_feedback.catalog_xmin =
            min(shared_state.hs_feedback.catalog_xmin, feedback.catalog_xmin);
        shared_state.hs_feedback.ts = max(shared_state.hs_feedback.ts, feedback.ts);
    }

    fn get_hs_feedback(&self) -> HotStandbyFeedback {
        let shared_state = self.mutex.lock().unwrap();
        shared_state.hs_feedback
    }

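    // On disk, the control file is simply a SafeKeeperInfo serialized with LeSer at
    // offset 0 of "safekeeper.control"; the magic and format_version fields are
    // checked when it is read back.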
    // Load and lock control file (prevent running more than one instance of safekeeper)
    fn load_control_file(&self, conf: &WalAcceptorConf) -> Result<()> {
        let mut shared_state = self.mutex.lock().unwrap();

        if shared_state.control_file.is_some() {
            info!(
                "control file for timeline {} is already open",
                self.timelineid
            );
            return Ok(());
        }

        let control_file_path = conf
            .data_dir
            .join(self.timelineid.to_string())
            .join(CONTROL_FILE_NAME);
        info!("loading control file {}", control_file_path.display());
        match OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .open(&control_file_path)
        {
            Ok(file) => {
                // Lock file to prevent two or more active wal_acceptors
                match file.try_lock_exclusive() {
                    Ok(()) => {}
                    Err(e) => {
                        io_error!(
                            "Control file {:?} is locked by some other process: {}",
                            &control_file_path,
                            e
                        );
                    }
                }
                shared_state.control_file = Some(file);

                const SIZE: usize = mem::size_of::<SafeKeeperInfo>();
                let mut buf = [0u8; SIZE];
                if shared_state
                    .control_file
                    .as_mut()
                    .unwrap()
                    .read_exact(&mut buf)
                    .is_ok()
                {
                    let mut input = BytesMut::new();
                    input.extend_from_slice(&buf);
                    let my_info = SafeKeeperInfo::unpack(&mut input);

                    if my_info.magic != SK_MAGIC {
                        io_error!("Invalid control file magic: {}", my_info.magic);
                    }
                    if my_info.format_version != SK_FORMAT_VERSION {
                        io_error!(
                            "Incompatible format version: {} vs. {}",
                            my_info.format_version,
                            SK_FORMAT_VERSION
                        );
                    }
                    shared_state.info = my_info;
                }
            }
            Err(e) => {
                panic!(
                    "Failed to open control file {:?}: {}",
                    &control_file_path, e
                );
            }
        }
        Ok(())
    }

    fn save_control_file(&self, sync: bool) -> Result<()> {
        let mut buf = BytesMut::new();
        let mut shared_state = self.mutex.lock().unwrap();
        shared_state.info.pack(&mut buf);

        let file = shared_state.control_file.as_mut().unwrap();
        file.seek(SeekFrom::Start(0))?;
        file.write_all(&buf[..])?;
        if sync {
            file.sync_all()?;
        }
        Ok(())
    }
}

impl Connection {
    pub fn new(socket: TcpStream, conf: &WalAcceptorConf) -> Connection {
        Connection {
            timeline: None,
            stream: socket,
            inbuf: BytesMut::with_capacity(10 * 1024),
            outbuf: BytesMut::with_capacity(10 * 1024),
            init_done: false,
            appname: None,
            conf: conf.clone(),
        }
    }

    fn timeline(&self) -> Arc<Timeline> {
        self.timeline.as_ref().unwrap().clone()
    }

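    // The first 4 bytes on the socket decide which protocol is spoken: a zero
    // "startup packet length" means the internal wal_proposer protocol (receive_wal),
    // anything else is treated as a libpq startup packet from a replica or the page
    // server (send_wal).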
    fn run(&mut self) -> Result<()> {
        self.inbuf.resize(4, 0u8);
        self.stream.read_exact(&mut self.inbuf[0..4])?;
        let startup_pkg_len = BigEndian::read_u32(&self.inbuf[0..4]);
        if startup_pkg_len == 0 {
            self.receive_wal()?; // internal protocol between wal_proposer and wal_acceptor
        } else {
            self.send_wal()?; // libpq replication protocol between wal_acceptor and replicas/pagers
        }
        Ok(())
    }

    fn read_req<T: NewSerializer>(&mut self) -> Result<T> {
        let size = mem::size_of::<T>();
        self.inbuf.resize(size, 0u8);
        self.stream.read_exact(&mut self.inbuf[0..size])?;
        Ok(T::unpack(&mut self.inbuf))
    }

    fn request_callback(&self) -> std::result::Result<(), postgres::error::Error> {
        if let Some(addr) = self.conf.pageserver_addr {
            let ps_connstr = format!(
                "host={} port={} dbname={} user={}",
                addr.ip(),
                addr.port(),
                "no_db",
                "no_user",
            );
            let callme = format!(
                "callmemaybe {} host={} port={} options='-c ztimelineid={}'",
                self.timeline().timelineid,
                self.conf.listen_addr.ip(),
                self.conf.listen_addr.port(),
                self.timeline().timelineid
            );
            info!(
                "requesting page server to connect to us: start {} {}",
                ps_connstr, callme
            );
            let mut client = Client::connect(&ps_connstr, NoTls)?;
            client.simple_query(&callme)?;
        }
        Ok(())
    }

    fn set_timeline(&mut self, timelineid: ZTimelineId) -> Result<()> {
        let mut timelines = TIMELINES.lock().unwrap();
        if !timelines.contains_key(&timelineid) {
            info!("creating timeline dir {}", timelineid);
            fs::create_dir_all(timelineid.to_string())?;
            timelines.insert(timelineid, Arc::new(Timeline::new(timelineid)));
        }
        self.timeline = Some(timelines.get(&timelineid).unwrap().clone());
        Ok(())
    }

    // Receive WAL from wal_proposer
    fn receive_wal(&mut self) -> Result<()> {
        // Receive information about server
        let server_info = self.read_req::<ServerInfo>()?;
        info!(
            "Start handshake with wal_proposer {} sysid {} timeline {}",
            self.stream.peer_addr()?,
            server_info.system_id,
            server_info.timeline_id,
        );
        // FIXME: also check that the system identifier matches
        self.set_timeline(server_info.timeline_id)?;
        self.timeline().load_control_file(&self.conf)?;

        let mut my_info = self.timeline().get_info();

        /* Check protocol compatibility */
        if server_info.protocol_version != SK_PROTOCOL_VERSION {
            io_error!(
                "Incompatible protocol version {} vs. {}",
                server_info.protocol_version,
                SK_PROTOCOL_VERSION
            );
        }
        /* Postgres upgrade is not treated as fatal error */
        if server_info.pg_version != my_info.server.pg_version
            && my_info.server.pg_version != UNKNOWN_SERVER_VERSION
        {
            info!(
                "Server version doesn't match {} vs. {}",
                server_info.pg_version, my_info.server.pg_version
            );
        }
        /* Update information about server, but preserve locally stored node_id */
        let node_id = my_info.server.node_id;
        my_info.server = server_info;
        my_info.server.node_id = node_id;

        /* Calculate WAL end based on local data */
        let (flush_lsn, timeline) = self.find_end_of_wal(true);
        my_info.flush_lsn = flush_lsn;
        my_info.server.timeline = timeline;

        /* Report my identifier to proxy */
        self.start_sending();
        my_info.pack(&mut self.outbuf);
        self.send()?;

        /* Wait for vote request */
        let prop = self.read_req::<RequestVote>()?;
        /* This is Paxos check which should ensure that only one master can perform commits */
        if prop.node_id < my_info.server.node_id {
            /* Send my node-id to inform proxy that its candidate was rejected */
            self.start_sending();
            my_info.server.node_id.pack(&mut self.outbuf);
            self.send()?;
            io_error!(
                "Reject connection attempt with term {} because my term is {}",
                hex::encode(prop.node_id.term),
                hex::encode(my_info.server.node_id.term)
            );
        }
        my_info.server.node_id = prop.node_id;
        self.timeline().set_info(&my_info);
        /* Need to persist our vote first */
        self.timeline().save_control_file(true)?;

        let mut flushed_restart_lsn: XLogRecPtr = 0;
        let wal_seg_size = server_info.wal_seg_size as usize;

        /* Acknowledge the proposed candidate by returning it to the proxy */
        self.start_sending();
        prop.node_id.pack(&mut self.outbuf);
        self.send()?;

        // Need to establish replication channel with page server.
        // As replication in postgres is initiated by the receiver, we have to use the callme mechanism
        if let Err(e) = self.request_callback() {
            // Do not treat it as a fatal error; continue working
            // FIXME: we should retry after a while...
            error!("Failed to send callme request to pageserver: {}", e);
        }

        info!(
            "Start streaming from timeline {} address {:?}",
            server_info.timeline_id,
            self.stream.peer_addr()?
        );

        // Main loop
        loop {
            let mut sync_control_file = false;

            /* Receive message header */
            let req = self.read_req::<SafeKeeperRequest>()?;
            if req.sender_id != my_info.server.node_id {
                io_error!("Sender NodeId has changed");
            }
            if req.begin_lsn == END_OF_STREAM {
                info!("Server stops streaming");
                break;
            }
            let start_pos = req.begin_lsn;
            let end_pos = req.end_lsn;
            let rec_size = (end_pos - start_pos) as usize;
            assert!(rec_size <= MAX_SEND_SIZE);

            debug!(
                "received {} bytes between {:X}/{:X} and {:X}/{:X}",
                rec_size,
                start_pos >> 32,
                start_pos & 0xffffffff,
                end_pos >> 32,
                end_pos & 0xffffffff
            );

            /* Receive message body */
            self.inbuf.resize(rec_size, 0u8);
            self.stream.read_exact(&mut self.inbuf[0..rec_size])?;

            /* Save message in file */
            self.write_wal_file(start_pos, timeline, wal_seg_size, &self.inbuf[0..rec_size])?;

            my_info.restart_lsn = req.restart_lsn;
            my_info.commit_lsn = req.commit_lsn;

            /*
             * An epoch switch happens when a written WAL record crosses the boundary.
             * The boundary is the maximum of the last WAL position at this node (FlushLSN)
             * and the global maximum (vcl) determined by safekeeper_proxy during handshake.
             * Switching the epoch means that the node has completed recovery and starts
             * writing new data to the WAL.
             */
            if my_info.epoch < prop.epoch && end_pos > max(my_info.flush_lsn, prop.vcl) {
                info!("Switch to new epoch {}", prop.epoch);
                my_info.epoch = prop.epoch; /* bump epoch */
                sync_control_file = true;
            }
            if end_pos > my_info.flush_lsn {
                my_info.flush_lsn = end_pos;
            }
            /*
             * Update restart LSN in control file.
             * To avoid the negative performance impact of an extra fsync, do it only
             * when the restart_lsn delta exceeds the WAL segment size.
             */
            sync_control_file |= flushed_restart_lsn + (wal_seg_size as u64) < my_info.restart_lsn;
            self.timeline().save_control_file(sync_control_file)?;

            if sync_control_file {
                flushed_restart_lsn = my_info.restart_lsn;
            }

            /* Report flush position */
            //info!("Confirm LSN: {:X}/{:>08X}", (end_pos>>32) as u32, end_pos as u32);
            let resp = SafeKeeperResponse {
                epoch: my_info.epoch,
                flush_lsn: end_pos,
                hs_feedback: self.timeline().get_hs_feedback(),
            };
            self.start_sending();
            resp.pack(&mut self.outbuf);
            self.send()?;

            /*
             * Ping wal sender that new data is available.
             * FlushLSN (end_pos) can be smaller than commitLSN in case this is a catching-up safekeeper.
             */
            self.timeline()
                .notify_wal_senders(min(req.commit_lsn, end_pos));
        }
        Ok(())
    }

    //
    // Read full message or return None if connection is closed
    //
    fn read_message(&mut self) -> Result<Option<FeMessage>> {
        loop {
            if let Some(message) = self.parse_message()? {
                return Ok(Some(message));
            }

            if read_into(&mut self.stream, &mut self.inbuf)? == 0 {
                if self.inbuf.is_empty() {
                    return Ok(None);
                } else {
                    io_error!("connection reset by peer");
                }
            }
        }
    }

    //
    // Parse libpq message
    //
    fn parse_message(&mut self) -> Result<Option<FeMessage>> {
        if !self.init_done {
            FeStartupMessage::parse(&mut self.inbuf)
        } else {
            FeMessage::parse(&mut self.inbuf)
        }
    }

    //
    // Reset output buffer to start accumulating data of new message
    //
    fn start_sending(&mut self) {
        self.outbuf.clear();
    }

    //
    // Send buffered messages
    //
    fn send(&mut self) -> Result<()> {
        self.stream.write_all(&self.outbuf)
    }

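    // A replica (or the page server) connects with a normal libpq startup packet and
    // then issues IDENTIFY_SYSTEM and START_REPLICATION queries, which are handled by
    // the two helpers further below; everything else is rejected.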
    //
    // Send WAL to replica or WAL receiver using standard libpq replication protocol
    //
    fn send_wal(&mut self) -> Result<()> {
        info!("WAL sender to {:?} is started", self.stream.peer_addr()?);
        loop {
            self.start_sending();
            match self.read_message()? {
                Some(FeMessage::StartupMessage(m)) => {
                    trace!("got message {:?}", m);

                    match m.kind {
                        StartupRequestCode::NegotiateGss | StartupRequestCode::NegotiateSsl => {
                            BeMessage::write(&mut self.outbuf, &BeMessage::Negotiate);
                            info!("SSL requested");
                            self.send()?;
                        }
                        StartupRequestCode::Normal => {
                            BeMessage::write(&mut self.outbuf, &BeMessage::AuthenticationOk);
                            BeMessage::write(&mut self.outbuf, &BeMessage::ReadyForQuery);
                            self.send()?;
                            self.init_done = true;
                            self.set_timeline(m.timelineid)?;
                            self.appname = m.appname;
                        }
                        StartupRequestCode::Cancel => return Ok(()),
                    }
                }
                Some(FeMessage::Query(m)) => {
                    if !self.process_query(&m)? {
                        break;
                    }
                }
                Some(FeMessage::Terminate) => {
                    break;
                }
                None => {
                    info!("connection closed");
                    break;
                }
                _ => {
                    io_error!("unexpected message");
                }
            }
        }
        info!("WAL sender to {:?} is finished", self.stream.peer_addr()?);
        Ok(())
    }

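    // LSNs cross the wire in PostgreSQL's textual form: the high and the low 32 bits
    // in hex, separated by '/' (for example, a replica may ask to
    // "START_REPLICATION 0/16B3748"). handle_identify_system() reports the current
    // position in this form and handle_start_replication() parses it back.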
    //
    // Handle IDENTIFY_SYSTEM replication command
    //
    fn handle_identify_system(&mut self) -> Result<bool> {
        let (start_pos, timeline) = self.find_end_of_wal(false);
        let lsn = format!("{:X}/{:>08X}", (start_pos >> 32) as u32, start_pos as u32);
        let tli = timeline.to_string();
        let sysid = self.timeline().get_info().server.system_id.to_string();
        let lsn_bytes = lsn.as_bytes();
        let tli_bytes = tli.as_bytes();
        let sysid_bytes = sysid.as_bytes();

        BeMessage::write(
            &mut self.outbuf,
            &BeMessage::RowDescription(&[
                RowDescriptor {
                    name: b"systemid\0",
                    typoid: 25,
                    typlen: -1,
                },
                RowDescriptor {
                    name: b"timeline\0",
                    typoid: 23,
                    typlen: 4,
                },
                RowDescriptor {
                    name: b"xlogpos\0",
                    typoid: 25,
                    typlen: -1,
                },
                RowDescriptor {
                    name: b"dbname\0",
                    typoid: 25,
                    typlen: -1,
                },
            ]),
        );
        BeMessage::write(
            &mut self.outbuf,
            &BeMessage::DataRow(&[Some(sysid_bytes), Some(tli_bytes), Some(lsn_bytes), None]),
        );
        BeMessage::write(
            &mut self.outbuf,
            &BeMessage::CommandComplete(b"IDENTIFY_SYSTEM\0"),
        );
        BeMessage::write(&mut self.outbuf, &BeMessage::ReadyForQuery);
        self.send()?;
        Ok(true)
    }

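    // START_REPLICATION carries one or two LSNs: the position to start streaming from
    // and, optionally, a stop position. A non-zero stop position (explicit, or implied
    // for a "wal_proposer_recovery" connection, which stops at the current WAL end)
    // puts the sender in recovery mode; otherwise it streams indefinitely, waiting on
    // the commit_lsn condition variable for new committed WAL.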
    //
    // Handle START_REPLICATION replication command
    //
    fn handle_start_replication(&mut self, cmd: &Bytes) -> Result<bool> {
        let re = Regex::new(r"([[:xdigit:]]*)/([[:xdigit:]]*)").unwrap();
        let mut caps = re.captures_iter(str::from_utf8(&cmd[..]).unwrap());
        let cap = caps.next().unwrap();
        let mut start_pos: XLogRecPtr = (parse_hex_str(&cap[1])? << 32) | parse_hex_str(&cap[2])?;
        let mut stop_pos: XLogRecPtr = if let Some(cap) = caps.next() {
            (parse_hex_str(&cap[1])? << 32) | parse_hex_str(&cap[2])?
        } else {
            0
        };
        let wal_seg_size = self.timeline().get_info().server.wal_seg_size as usize;
        if wal_seg_size == 0 {
            io_error!("Can not start replication before connecting to wal_proposer");
        }
        let (wal_end, timeline) = self.find_end_of_wal(false);
        if start_pos == 0 {
            start_pos = wal_end;
        }
        if stop_pos == 0 && self.appname == Some("wal_proposer_recovery".to_string()) {
            stop_pos = wal_end;
        }
        info!(
            "Start replication from {:X}/{:>08X} till {:X}/{:>08X}",
            (start_pos >> 32) as u32,
            start_pos as u32,
            (stop_pos >> 32) as u32,
            stop_pos as u32
        );
        BeMessage::write(&mut self.outbuf, &BeMessage::Copy);
        self.send()?;

        let mut end_pos: XLogRecPtr;
        let mut commit_lsn: XLogRecPtr;
        let mut wal_file: Option<File> = None;
        self.outbuf
            .resize(LIBPQ_HDR_SIZE + XLOG_HDR_SIZE + MAX_SEND_SIZE, 0u8);
        loop {
            /* Wait until we have some data to stream */
            if stop_pos != 0 {
                /* recovery mode: stream up to the specified LSN (VCL) */
                if start_pos >= stop_pos {
                    /* recovery finished */
                    break;
                }
                end_pos = stop_pos;
            } else {
                /* normal mode */
                let timeline = self.timeline();
                let mut shared_state = timeline.mutex.lock().unwrap();
                loop {
                    commit_lsn = shared_state.commit_lsn;
                    if start_pos < commit_lsn {
                        end_pos = commit_lsn;
                        break;
                    }
                    shared_state = timeline.cond.wait(shared_state).unwrap();
                }
            }
            if end_pos == END_REPLICATION_MARKER {
                break;
            }
            // Try to fetch replica's feedback

            // Temporarily set this stream into nonblocking mode.
            // FIXME: This seems like a dirty hack.
            // Should this task be done on a background thread?
            self.stream.set_nonblocking(true).unwrap();
            let read_result = self.stream.read(&mut self.inbuf);
            self.stream.set_nonblocking(false).unwrap();

            match read_result {
                Ok(0) => break,
                Ok(_) => {
                    if let Some(FeMessage::CopyData(m)) = self.parse_message()? {
                        self.timeline()
                            .add_hs_feedback(HotStandbyFeedback::parse(&m.body))
                    }
                }
                Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => {}
                Err(e) => {
                    return Err(e);
                }
            }

            /* Open file if not opened yet */
            let curr_file = wal_file.take();
            let mut file: File;
            if let Some(opened_file) = curr_file {
                file = opened_file;
            } else {
                let segno = XLByteToSeg(start_pos, wal_seg_size);
                let wal_file_name = XLogFileName(timeline, segno, wal_seg_size);
                let wal_file_path = self
                    .conf
                    .data_dir
                    .join(self.timeline().timelineid.to_string())
                    .join(wal_file_name.clone() + ".partial");
                if let Ok(opened_file) = File::open(&wal_file_path) {
                    file = opened_file;
                } else {
                    let wal_file_path = self
                        .conf
                        .data_dir
                        .join(self.timeline().timelineid.to_string())
                        .join(wal_file_name);
                    match File::open(&wal_file_path) {
                        Ok(opened_file) => file = opened_file,
                        Err(e) => {
                            error!("Failed to open log file {:?}: {}", &wal_file_path, e);
                            return Err(e);
                        }
                    }
                }
            }
            let xlogoff = XLogSegmentOffset(start_pos, wal_seg_size) as usize;

            // How much to read and send in message? We cannot cross the WAL file
            // boundary, and we don't want to send more than MAX_SEND_SIZE.
            let send_size = (end_pos - start_pos) as usize;
            let send_size = min(send_size, wal_seg_size - xlogoff);
            let send_size = min(send_size, MAX_SEND_SIZE);

            let msg_size = LIBPQ_HDR_SIZE + XLOG_HDR_SIZE + send_size;
            let data_start = LIBPQ_HDR_SIZE + XLOG_HDR_SIZE;
            let data_end = data_start + send_size;

            file.seek(SeekFrom::Start(xlogoff as u64))?;
            file.read_exact(&mut self.outbuf[data_start..data_end])?;
            self.outbuf[0] = b'd';
            BigEndian::write_u32(
                &mut self.outbuf[1..5],
                (msg_size - LIBPQ_MSG_SIZE_OFFS) as u32,
            );
            self.outbuf[5] = b'w';
            BigEndian::write_u64(&mut self.outbuf[6..14], start_pos);
            BigEndian::write_u64(&mut self.outbuf[14..22], end_pos);
            BigEndian::write_u64(&mut self.outbuf[22..30], get_current_timestamp());

            self.stream.write_all(&self.outbuf[0..msg_size])?;
            start_pos += send_size as u64;

            debug!(
                "Sent WAL to page server up to {:X}/{:>08X}",
                (end_pos >> 32) as u32,
                end_pos as u32
            );

            if XLogSegmentOffset(start_pos, wal_seg_size) != 0 {
                wal_file = Some(file);
            }
        }
        Ok(false)
    }

    fn process_query(&mut self, q: &FeQueryMessage) -> Result<bool> {
        trace!("got query {:?}", q.body);

        if q.body.starts_with(b"IDENTIFY_SYSTEM") {
            self.handle_identify_system()
        } else if q.body.starts_with(b"START_REPLICATION") {
            self.handle_start_replication(&q.body)
        } else {
            io_error!("Unexpected command {:?}", q.body);
        }
    }

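    // WAL is written into "<segment name>.partial" files of exactly wal_seg_size bytes
    // (pre-filled with zero blocks); once the last byte of a segment has been written,
    // the file is renamed to its final segment name, which is how complete segments
    // are distinguished from in-progress ones.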
    fn write_wal_file(
        &self,
        startpos: XLogRecPtr,
        timeline: TimeLineID,
        wal_seg_size: usize,
        buf: &[u8],
    ) -> Result<()> {
        let mut bytes_left: usize = buf.len();
        let mut bytes_written: usize = 0;
        let mut partial;
        let mut start_pos = startpos;
        const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];

        /* Extract WAL location for this block */
        let mut xlogoff = XLogSegmentOffset(start_pos, wal_seg_size) as usize;

        while bytes_left != 0 {
            let bytes_to_write;

            /*
             * If crossing a WAL boundary, only write up until we reach wal
             * segment size.
             */
            if xlogoff + bytes_left > wal_seg_size {
                bytes_to_write = wal_seg_size - xlogoff;
            } else {
                bytes_to_write = bytes_left;
            }

            /* Open file */
            let segno = XLByteToSeg(start_pos, wal_seg_size);
            let wal_file_name = XLogFileName(timeline, segno, wal_seg_size);
            let wal_file_path = self
                .conf
                .data_dir
                .join(self.timeline().timelineid.to_string())
                .join(wal_file_name.clone());
            let wal_file_partial_path = self
                .conf
                .data_dir
                .join(self.timeline().timelineid.to_string())
                .join(wal_file_name.clone() + ".partial");

            {
                let mut wal_file: File;
                /* Try to open already completed segment */
                if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) {
                    wal_file = file;
                    partial = false;
                } else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path)
                {
                    /* Try to open existing partial file */
                    wal_file = file;
                    partial = true;
                } else {
                    /* Create and fill new partial file */
                    partial = true;
                    match OpenOptions::new()
                        .create(true)
                        .write(true)
                        .open(&wal_file_partial_path)
                    {
                        Ok(mut file) => {
                            for _ in 0..(wal_seg_size / XLOG_BLCKSZ) {
                                file.write_all(&ZERO_BLOCK)?;
                            }
                            wal_file = file;
                        }
                        Err(e) => {
                            error!("Failed to open log file {:?}: {}", &wal_file_path, e);
                            return Err(e);
                        }
                    }
                }
                wal_file.seek(SeekFrom::Start(xlogoff as u64))?;
                wal_file.write_all(&buf[bytes_written..(bytes_written + bytes_to_write)])?;

                // Flush the file, unless disabled by the no_sync option
                if !self.conf.no_sync {
                    wal_file.sync_all()?;
                }
            }
            /* Write was successful, advance our position */
            bytes_written += bytes_to_write;
            bytes_left -= bytes_to_write;
            start_pos += bytes_to_write as u64;
            xlogoff += bytes_to_write;

            /* Did we reach the end of a WAL segment? */
            if XLogSegmentOffset(start_pos, wal_seg_size) == 0 {
                xlogoff = 0;
                if partial {
                    fs::rename(&wal_file_partial_path, &wal_file_path)?;
                }
            }
        }
        Ok(())
    }

    // Find last WAL record. If "precise" is false then just locate the last partial segment
    fn find_end_of_wal(&self, precise: bool) -> (XLogRecPtr, TimeLineID) {
        find_end_of_wal(
            &self.conf.data_dir,
            self.timeline().get_info().server.wal_seg_size as usize,
            precise,
        )
    }
}