// // This file contains common utilities for dealing with PostgreSQL WAL files and // LSNs. // // Many of these functions have been copied from PostgreSQL, and rewritten in // Rust. That's why they don't follow the usual Rust naming conventions, they // have been named the same as the corresponding PostgreSQL functions instead. // use crate::pg_constants; use crate::CheckPoint; use crate::ControlFileData; use crate::FullTransactionId; use crate::XLogLongPageHeaderData; use crate::XLogPageHeaderData; use crate::XLogRecord; use crate::XLOG_PAGE_MAGIC; use byteorder::{ByteOrder, LittleEndian}; use bytes::{Buf, Bytes}; use bytes::{BufMut, BytesMut}; use crc32c::*; use log::*; use std::cmp::min; use std::fs::{self, File}; use std::io::prelude::*; use std::path::{Path, PathBuf}; use std::time::SystemTime; pub const XLOG_FNAME_LEN: usize = 24; pub const XLOG_BLCKSZ: usize = 8192; pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8; pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2; pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16; pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = std::mem::size_of::(); pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = std::mem::size_of::(); pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::(); pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2; pub type XLogRecPtr = u64; pub type TimeLineID = u32; pub type TimestampTz = i64; pub type XLogSegNo = u64; const XID_CHECKPOINT_INTERVAL: u32 = 1024; #[allow(non_snake_case)] pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo { (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo } #[allow(non_snake_case)] pub fn XLogSegNoOffsetToRecPtr( segno: XLogSegNo, offset: u32, wal_segsz_bytes: usize, ) -> XLogRecPtr { segno * (wal_segsz_bytes as u64) + (offset as u64) } #[allow(non_snake_case)] pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String { return format!( "{:>08X}{:>08X}{:>08X}", tli, logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes), logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes) ); } #[allow(non_snake_case)] pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) { let tli = u32::from_str_radix(&fname[0..8], 16).unwrap(); let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo; let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo; (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli) } #[allow(non_snake_case)] pub fn IsXLogFileName(fname: &str) -> bool { return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit()); } #[allow(non_snake_case)] pub fn IsPartialXLogFileName(fname: &str) -> bool { fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8]) } pub fn get_current_timestamp() -> TimestampTz { const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */ const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */ const SECS_PER_DAY: u64 = 86400; const USECS_PER_SEC: u64 = 1000000; match SystemTime::now().duration_since(SystemTime::UNIX_EPOCH) { Ok(n) => { ((n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY)) * USECS_PER_SEC + n.subsec_micros() as u64) as i64 } Err(_) => panic!("SystemTime before UNIX EPOCH!"), } } fn find_end_of_wal_segment( data_dir: &Path, segno: XLogSegNo, tli: TimeLineID, wal_seg_size: usize, ) -> u32 { let mut offs: usize = 0; let mut contlen: usize = 0; let mut wal_crc: u32 = 0; let mut crc: u32 = 0; let mut rec_offs: usize = 0; let mut buf = [0u8; XLOG_BLCKSZ]; let file_name = XLogFileName(tli, segno, wal_seg_size); let mut last_valid_rec_pos: usize = 0; let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap(); let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS]; while offs < wal_seg_size { // we are at the beginning of the page; read it in if offs % XLOG_BLCKSZ == 0 { if let Ok(bytes_read) = file.read(&mut buf) { if bytes_read != buf.len() { break; } } else { break; } let xlp_magic = LittleEndian::read_u16(&buf[0..2]); let xlp_info = LittleEndian::read_u16(&buf[2..4]); let xlp_rem_len = LittleEndian::read_u32(&buf[XLP_REM_LEN_OFFS..XLP_REM_LEN_OFFS + 4]); if xlp_magic != XLOG_PAGE_MAGIC as u16 { info!("Invalid WAL file {}.partial magic {}", file_name, xlp_magic); break; } if offs == 0 { offs = XLOG_SIZE_OF_XLOG_LONG_PHD; if (xlp_info & XLP_FIRST_IS_CONTRECORD) != 0 { offs += ((xlp_rem_len + 7) & !7) as usize; } } else { offs += XLOG_SIZE_OF_XLOG_SHORT_PHD; } // beginning of the next record } else if contlen == 0 { let page_offs = offs % XLOG_BLCKSZ; let xl_tot_len = LittleEndian::read_u32(&buf[page_offs..page_offs + 4]) as usize; if xl_tot_len == 0 { break; // zeros, reached the end } last_valid_rec_pos = offs; offs += 4; rec_offs = 4; contlen = xl_tot_len - 4; rec_hdr[0..4].copy_from_slice(&buf[page_offs..page_offs + 4]); } else { // we're continuing a record, possibly from previous page. let page_offs = offs % XLOG_BLCKSZ; let pageleft = XLOG_BLCKSZ - page_offs; // read the rest of the record, or as much as fits on this page. let n = min(contlen, pageleft); // fill rec_hdr (header up to (but not including) xl_crc field) if rec_offs < XLOG_RECORD_CRC_OFFS { let len = min(XLOG_RECORD_CRC_OFFS - rec_offs, n); rec_hdr[rec_offs..rec_offs + len].copy_from_slice(&buf[page_offs..page_offs + len]); } if rec_offs <= XLOG_RECORD_CRC_OFFS && rec_offs + n >= XLOG_SIZE_OF_XLOG_RECORD { let crc_offs = page_offs - rec_offs + XLOG_RECORD_CRC_OFFS; wal_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]); crc = crc32c_append(0, &buf[crc_offs + 4..page_offs + n]); crc = !crc; } else { crc ^= 0xFFFFFFFFu32; crc = crc32c_append(crc, &buf[page_offs..page_offs + n]); crc = !crc; } rec_offs += n; offs += n; contlen -= n; if contlen == 0 { crc = !crc; crc = crc32c_append(crc, &rec_hdr); offs = (offs + 7) & !7; // pad on 8 bytes boundary */ if crc == wal_crc { // record is valid, advance the result to its end (with // alignment to the next record taken into account) last_valid_rec_pos = offs; } else { info!( "CRC mismatch {} vs {} at {}", crc, wal_crc, last_valid_rec_pos ); break; } } } } last_valid_rec_pos as u32 } /// /// Scan a directory that contains PostgreSQL WAL files, for the end of WAL. /// If precise, returns end LSN (next insertion point, basically); /// otherwise, start of the last segment. /// Returns (0, 0) if there is no WAL. /// pub fn find_end_of_wal( data_dir: &Path, wal_seg_size: usize, precise: bool, ) -> (XLogRecPtr, TimeLineID) { let mut high_segno: XLogSegNo = 0; let mut high_tli: TimeLineID = 0; let mut high_ispartial = false; for entry in fs::read_dir(data_dir).unwrap().flatten() { let ispartial: bool; let entry_name = entry.file_name(); let fname = entry_name.to_str().unwrap(); /* * Check if the filename looks like an xlog file, or a .partial file. */ if IsXLogFileName(fname) { ispartial = false; } else if IsPartialXLogFileName(fname) { ispartial = true; } else { continue; } let (segno, tli) = XLogFromFileName(fname, wal_seg_size); if !ispartial && entry.metadata().unwrap().len() != wal_seg_size as u64 { continue; } if segno > high_segno || (segno == high_segno && tli > high_tli) || (segno == high_segno && tli == high_tli && high_ispartial && !ispartial) { high_segno = segno; high_tli = tli; high_ispartial = ispartial; } } if high_segno > 0 { let mut high_offs = 0; /* * Move the starting pointer to the start of the next segment, if the * highest one we saw was completed. */ if !high_ispartial { high_segno += 1; } else if precise { /* otherwise locate last record in last partial segment */ high_offs = find_end_of_wal_segment(data_dir, high_segno, high_tli, wal_seg_size); } let high_ptr = XLogSegNoOffsetToRecPtr(high_segno, high_offs, wal_seg_size); return (high_ptr, high_tli); } (0, 0) } pub fn main() { let mut data_dir = PathBuf::new(); data_dir.push("."); let wal_seg_size = 16 * 1024 * 1024; let (wal_end, tli) = find_end_of_wal(&data_dir, wal_seg_size, true); println!( "wal_end={:>08X}{:>08X}, tli={}", (wal_end >> 32) as u32, wal_end as u32, tli ); } impl XLogRecord { pub fn from_bytes(buf: &mut Bytes) -> XLogRecord { use zenith_utils::bin_ser::LeSer; XLogRecord::des_from(&mut buf.reader()).unwrap() } pub fn encode(&self) -> Bytes { use zenith_utils::bin_ser::LeSer; self.ser().unwrap().into() } // Is this record an XLOG_SWITCH record? They need some special processing, pub fn is_xlog_switch_record(&self) -> bool { self.xl_info == pg_constants::XLOG_SWITCH && self.xl_rmid == pg_constants::RM_XLOG_ID } } impl XLogPageHeaderData { pub fn from_bytes(buf: &mut B) -> XLogPageHeaderData { use zenith_utils::bin_ser::LeSer; XLogPageHeaderData::des_from(&mut buf.reader()).unwrap() } } impl XLogLongPageHeaderData { pub fn from_bytes(buf: &mut B) -> XLogLongPageHeaderData { use zenith_utils::bin_ser::LeSer; XLogLongPageHeaderData::des_from(&mut buf.reader()).unwrap() } pub fn encode(&self) -> Bytes { use zenith_utils::bin_ser::LeSer; self.ser().unwrap().into() } } pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::(); impl CheckPoint { pub fn encode(&self) -> Bytes { use zenith_utils::bin_ser::LeSer; self.ser().unwrap().into() } pub fn decode(buf: &[u8]) -> Result { use zenith_utils::bin_ser::LeSer; Ok(CheckPoint::des(buf)?) } // Update next XID based on provided new_xid and stored epoch. // Next XID should be greater than new_xid. // Also take in account 32-bit wrap-around. pub fn update_next_xid(&mut self, xid: u32) { let xid = xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1); let full_xid = self.nextXid.value; let new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID); let old_xid = full_xid as u32; if new_xid.wrapping_sub(old_xid) as i32 > 0 { let mut epoch = full_xid >> 32; if new_xid < old_xid { // wrap-around epoch += 1; } self.nextXid = FullTransactionId { value: (epoch << 32) | new_xid as u64, }; } } } // // Generate new WAL segment with single XLOG_CHECKPOINT_SHUTDOWN record. // We need this segment to start compute node. // In order to minimize changes in Postgres core, we prefer to // provide WAL segment from which is can extract checkpoint record in standard way, // rather then implement some alternative mechanism. // pub fn generate_wal_segment(pg_control: &ControlFileData) -> Bytes { let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize); let hdr = XLogLongPageHeaderData { std: { XLogPageHeaderData { xlp_magic: XLOG_PAGE_MAGIC as u16, xlp_info: pg_constants::XLP_LONG_HEADER, xlp_tli: 1, // FIXME: always use Postgres timeline 1 xlp_pageaddr: pg_control.checkPoint - XLOG_SIZE_OF_XLOG_LONG_PHD as u64, xlp_rem_len: 0, ..Default::default() // Put 0 in padding fields. } }, xlp_sysid: pg_control.system_identifier, xlp_seg_size: pg_constants::WAL_SEGMENT_SIZE as u32, xlp_xlog_blcksz: XLOG_BLCKSZ as u32, }; let hdr_bytes = hdr.encode(); seg_buf.extend_from_slice(&hdr_bytes); let rec_hdr = XLogRecord { xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT + SIZEOF_CHECKPOINT) as u32, xl_xid: 0, //0 is for InvalidTransactionId xl_prev: 0, xl_info: pg_constants::XLOG_CHECKPOINT_SHUTDOWN, xl_rmid: pg_constants::RM_XLOG_ID, xl_crc: 0, ..Default::default() // Put 0 in padding fields. }; let mut rec_shord_hdr_bytes = BytesMut::new(); rec_shord_hdr_bytes.put_u8(pg_constants::XLR_BLOCK_ID_DATA_SHORT); rec_shord_hdr_bytes.put_u8(SIZEOF_CHECKPOINT as u8); let rec_bytes = rec_hdr.encode(); let checkpoint_bytes = pg_control.checkPointCopy.encode(); //calculate record checksum let mut crc = 0; crc = crc32c_append(crc, &rec_shord_hdr_bytes[..]); crc = crc32c_append(crc, &checkpoint_bytes[..]); crc = crc32c_append(crc, &rec_bytes[0..XLOG_RECORD_CRC_OFFS]); seg_buf.extend_from_slice(&rec_bytes[0..XLOG_RECORD_CRC_OFFS]); seg_buf.put_u32_le(crc); seg_buf.extend_from_slice(&rec_shord_hdr_bytes); seg_buf.extend_from_slice(&checkpoint_bytes); //zero out the rest of the file seg_buf.resize(pg_constants::WAL_SEGMENT_SIZE, 0); seg_buf.freeze() } #[cfg(test)] mod tests { use super::*; use regex::Regex; use std::{env, process::Command, str::FromStr}; use zenith_utils::lsn::Lsn; // Run find_end_of_wal against file in test_wal dir // Ensure that it finds last record correctly #[test] pub fn test_find_end_of_wal() { // 1. Run initdb to generate some WAL let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(".."); let data_dir = top_path.join("test_output/test_find_end_of_wal"); let initdb_path = top_path.join("tmp_install/bin/initdb"); let lib_path = top_path.join("tmp_install/lib"); if data_dir.exists() { fs::remove_dir_all(&data_dir).unwrap(); } println!("Using initdb from '{}'", initdb_path.display()); println!("Data directory '{}'", data_dir.display()); let initdb_output = Command::new(initdb_path) .args(&["-D", data_dir.to_str().unwrap()]) .arg("--no-instructions") .arg("--no-sync") .env_clear() .env("LD_LIBRARY_PATH", &lib_path) .env("DYLD_LIBRARY_PATH", &lib_path) .output() .unwrap(); assert!(initdb_output.status.success()); // 2. Pick WAL generated by initdb let wal_dir = data_dir.join("pg_wal"); let wal_seg_size = 16 * 1024 * 1024; // 3. Check end_of_wal on non-partial WAL segment (we treat it as fully populated) let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true); let wal_end = Lsn(wal_end); println!("wal_end={}, tli={}", wal_end, tli); assert_eq!(wal_end, "0/2000000".parse::().unwrap()); // 4. Get the actual end of WAL by pg_waldump let waldump_path = top_path.join("tmp_install/bin/pg_waldump"); let waldump_output = Command::new(waldump_path) .arg(wal_dir.join("000000010000000000000001")) .env_clear() .env("LD_LIBRARY_PATH", &lib_path) .env("DYLD_LIBRARY_PATH", &lib_path) .output() .unwrap(); let waldump_output = std::str::from_utf8(&waldump_output.stderr).unwrap(); println!("waldump_output = '{}'", &waldump_output); let re = Regex::new(r"invalid record length at (.+):").unwrap(); let caps = re.captures(&waldump_output).unwrap(); let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap(); // 5. Rename file to partial to actually find last valid lsn fs::rename( wal_dir.join("000000010000000000000001"), wal_dir.join("000000010000000000000001.partial"), ) .unwrap(); let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true); let wal_end = Lsn(wal_end); println!("wal_end={}, tli={}", wal_end, tli); assert_eq!(wal_end, waldump_wal_end); } }