From f45cf28247cad5fc1d813881d4869efd10f1fef2 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Fri, 21 Jun 2024 13:15:02 +0100 Subject: [PATCH] Add eviction_state to control file (#8125) This is a preparation for #8022, to make the PR both backwards and foward compatible. This commit adds `eviction_state` field to control file. Adds support for reading it, but writes control file in old format where possible, to keep the disk format forward compatible. Note: in `patch_control_file`, new field gets serialized to json like this: - `"eviction_state": "Present"` - `"eviction_state": {"Offloaded": "0/8F"}` --- safekeeper/src/control_file.rs | 19 ++++-- safekeeper/src/control_file_upgrade.rs | 94 +++++++++++++++++++++++++- safekeeper/src/safekeeper.rs | 5 +- safekeeper/src/state.rs | 16 +++++ 4 files changed, 128 insertions(+), 6 deletions(-) diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 9d65187350..8e9031fae4 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -12,15 +12,16 @@ use std::ops::Deref; use std::path::Path; use std::time::Instant; +use crate::control_file_upgrade::downgrade_v9_to_v8; use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; -use crate::state::TimelinePersistentState; +use crate::state::{EvictionState, TimelinePersistentState}; use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir}; use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 8; +pub const SK_FORMAT_VERSION: u32 = 9; // contains persistent metadata for safekeeper pub const CONTROL_FILE_NAME: &str = "safekeeper.control"; @@ -178,8 +179,18 @@ impl Storage for FileStorage { })?; let mut buf: Vec = Vec::new(); WriteBytesExt::write_u32::(&mut buf, SK_MAGIC)?; - WriteBytesExt::write_u32::(&mut buf, SK_FORMAT_VERSION)?; - s.ser_into(&mut buf)?; + + if s.eviction_state == EvictionState::Present { + // temp hack for forward compatibility + const PREV_FORMAT_VERSION: u32 = 8; + let prev = downgrade_v9_to_v8(s); + WriteBytesExt::write_u32::(&mut buf, PREV_FORMAT_VERSION)?; + prev.ser_into(&mut buf)?; + } else { + // otherwise, we write the current format version + WriteBytesExt::write_u32::(&mut buf, SK_FORMAT_VERSION)?; + s.ser_into(&mut buf)?; + } // calculate checksum before resize let checksum = crc32c::crc32c(&buf); diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 8f4dfe9b43..a4b4670e42 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -1,7 +1,7 @@ //! Code to deal with safekeeper control file upgrades use crate::{ safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn}, - state::{PersistedPeers, TimelinePersistentState}, + state::{EvictionState, PersistedPeers, TimelinePersistentState}, wal_backup_partial, }; use anyhow::{bail, Result}; @@ -183,6 +183,55 @@ pub struct SafeKeeperStateV7 { pub peers: PersistedPeers, } +/// Persistent information stored on safekeeper node about timeline. +/// On disk data is prefixed by magic and format version and followed by checksum. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SafeKeeperStateV8 { + #[serde(with = "hex")] + pub tenant_id: TenantId, + #[serde(with = "hex")] + pub timeline_id: TimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealt with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Since which LSN this timeline generally starts. Safekeeper might have + /// joined later. + pub timeline_start_lsn: Lsn, + /// Since which LSN safekeeper has (had) WAL for this timeline. + /// All WAL segments next to one containing local_start_lsn are + /// filled with data from the beginning. + pub local_start_lsn: Lsn, + /// Part of WAL acknowledged by quorum *and available locally*. Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. Updates are currently drived + /// only by walproposer. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + /// Peers and their state as we remember it. Knowing peers themselves is + /// fundamental; but state is saved here only for informational purposes and + /// obviously can be stale. (Currently not saved at all, but let's provision + /// place to have less file version upgrades). + pub peers: PersistedPeers, + /// Holds names of partial segments uploaded to remote storage. Used to + /// clean up old objects without leaving garbage in remote storage. + pub partial_backup: wal_backup_partial::State, +} + pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { // migrate to storing full term history if version == 1 { @@ -213,6 +262,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result Result Result Result Result Result SafeKeeperStateV8 { + assert!(state.eviction_state == EvictionState::Present); + SafeKeeperStateV8 { + tenant_id: state.tenant_id, + timeline_id: state.timeline_id, + acceptor_state: state.acceptor_state.clone(), + server: state.server.clone(), + proposer_uuid: state.proposer_uuid, + timeline_start_lsn: state.timeline_start_lsn, + local_start_lsn: state.local_start_lsn, + commit_lsn: state.commit_lsn, + backup_lsn: state.backup_lsn, + peer_horizon_lsn: state.peer_horizon_lsn, + remote_consistent_lsn: state.remote_consistent_lsn, + peers: state.peers.clone(), + partial_backup: state.partial_backup.clone(), + } +} + #[cfg(test)] mod tests { use std::str::FromStr; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index ae230960ae..666ffdf0ce 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -958,7 +958,7 @@ mod tests { use super::*; use crate::{ - state::{PersistedPeers, TimelinePersistentState}, + state::{EvictionState, PersistedPeers, TimelinePersistentState}, wal_storage::Storage, }; use std::{ops::Deref, str::FromStr, time::Instant}; @@ -1225,6 +1225,7 @@ mod tests { }, )]), partial_backup: crate::wal_backup_partial::State::default(), + eviction_state: EvictionState::Present, }; let ser = state.ser().unwrap(); @@ -1272,6 +1273,8 @@ mod tests { 0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, // partial_backup 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + // eviction_state + 0x00, 0x00, 0x00, 0x00, ]; assert_eq!(Hex(&ser), Hex(&expected)); diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index be5e516296..e0f7b65aef 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -63,11 +63,26 @@ pub struct TimelinePersistentState { /// Holds names of partial segments uploaded to remote storage. Used to /// clean up old objects without leaving garbage in remote storage. pub partial_backup: wal_backup_partial::State, + /// Eviction state of the timeline. If it's Offloaded, we should download + /// WAL files from remote storage to serve the timeline. + pub eviction_state: EvictionState, } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>); +/// State of the local WAL files. Used to track current timeline state, +/// that can be either WAL files are present on disk or last partial segment +/// is offloaded to remote storage. +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)] +pub enum EvictionState { + /// WAL files are present on disk. + Present, + /// Last partial segment is offloaded to remote storage. + /// Contains flush_lsn of the last offloaded segment. + Offloaded(Lsn), +} + impl TimelinePersistentState { pub fn new( ttid: &TenantTimelineId, @@ -98,6 +113,7 @@ impl TimelinePersistentState { .collect(), ), partial_backup: wal_backup_partial::State::default(), + eviction_state: EvictionState::Present, } }