Remove complicated state management

This commit is contained in:
Arthur Petukhovsky
2022-09-09 12:43:54 +00:00
parent 0faf0e92ec
commit f8aecd53cd
13 changed files with 615 additions and 502 deletions

View File

@@ -298,7 +298,9 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
let signals = signals::install_shutdown_handlers()?;
let mut threads = vec![];
let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx);
// Load all timelines from disk to memory.
GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?;
let conf_ = conf.clone();
threads.push(

View File

@@ -235,21 +235,15 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
// Push data concurrently to not suffer from latency, with many timelines it can be slow.
let handles = active_tlis
.iter()
.filter_map(|tli| {
let sk_info = tli.get_public_info(&conf).ok()?;
.map(|tli| {
let sk_info = tli.get_public_info(&conf);
let key = timeline_safekeeper_path(
conf.broker_etcd_prefix.clone(),
tli.zttid,
conf.my_id,
);
let lease = leases.remove(&tli.zttid).unwrap();
Some(tokio::spawn(push_sk_info(
tli.zttid,
client.clone(),
key,
sk_info,
lease,
)))
tokio::spawn(push_sk_info(tli.zttid, client.clone(), key, sk_info, lease))
})
.collect::<Vec<_>>();
for h in handles {
@@ -290,10 +284,10 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
match subscription.value_updates.recv().await {
Some(new_info) => {
// note: there are blocking operations below, but it's considered fine for now
let tli = GlobalTimelines::get(new_info.key.id);
let _ = tli
.record_safekeeper_info(&new_info.value, new_info.key.node_id)
.await;
if let Ok(tli) = GlobalTimelines::get(new_info.key.id) {
tli.record_safekeeper_info(&new_info.value, new_info.key.node_id)
.await?
}
}
None => {
// XXX it means we lost connection with etcd, error is consumed inside sub object

View File

@@ -69,6 +69,7 @@ impl FileStorage {
})
}
/// Create file storage for a new timeline, but don't persist it yet.
pub fn create_new(
zttid: &ZTenantTimelineId,
conf: &SafeKeeperConf,
@@ -78,19 +79,18 @@ impl FileStorage {
let tenant_id = zttid.tenant_id.to_string();
let timeline_id = zttid.timeline_id.to_string();
let mut store = FileStorage {
let store = FileStorage {
timeline_dir,
conf: conf.clone(),
persist_control_file_seconds: PERSIST_CONTROL_FILE_SECONDS
.with_label_values(&[&tenant_id, &timeline_id]),
state: state.clone(),
state,
};
store.persist(&state)?;
Ok(store)
}
// Check the magic/version in the on-disk data and deserialize it, if possible.
/// Check the magic/version in the on-disk data and deserialize it, if possible.
fn deser_sk_state(buf: &mut &[u8]) -> Result<SafeKeeperState> {
// Read the version independent part
let magic = buf.read_u32::<LittleEndian>()?;
@@ -110,7 +110,7 @@ impl FileStorage {
upgrade_control_file(buf, version)
}
// Load control file for given zttid at path specified by conf.
/// Load control file for given zttid at path specified by conf.
pub fn load_control_file_conf(
conf: &SafeKeeperConf,
zttid: &ZTenantTimelineId,
@@ -171,8 +171,8 @@ impl Deref for FileStorage {
}
impl Storage for FileStorage {
// persists state durably to underlying storage
// for description see https://lwn.net/Articles/457667/
/// persists state durably to underlying storage
/// for description see https://lwn.net/Articles/457667/
fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
let _timer = &self.persist_control_file_seconds.start_timer();

View File

@@ -3,15 +3,15 @@
use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
use crate::receive_wal::ReceiveWalConn;
use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage};
use crate::send_wal::ReplicationConn;
use crate::timeline::Timeline;
use crate::{GlobalTimelines, SafeKeeperConf};
use anyhow::{bail, Context, Result};
use postgres_ffi::PG_TLI;
use regex::Regex;
use std::sync::Arc;
use tracing::info;
use utils::{
lsn::Lsn,
@@ -27,7 +27,7 @@ pub struct SafekeeperPostgresHandler {
pub appname: Option<String>,
pub ztenantid: Option<ZTenantId>,
pub ztimelineid: Option<ZTimelineId>,
pub timeline: Option<Arc<Timeline>>,
pub zttid: ZTenantTimelineId,
}
/// Parsed Postgres command.
@@ -100,11 +100,7 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
let tenantid = self.ztenantid.context("tenantid is required")?;
let timelineid = self.ztimelineid.context("timelineid is required")?;
if self.timeline.is_none() {
self.timeline = Some(GlobalTimelines::get(ZTenantTimelineId::new(
tenantid, timelineid,
)));
}
self.zttid = ZTenantTimelineId::new(tenantid, timelineid);
match cmd {
SafekeeperPostgresCommand::StartWalPush => ReceiveWalConn::new(pgb)
@@ -129,38 +125,26 @@ impl SafekeeperPostgresHandler {
appname: None,
ztenantid: None,
ztimelineid: None,
timeline: None,
zttid: ZTenantTimelineId::empty(),
}
}
/// Shortcut for calling `process_msg` in the timeline.
pub fn process_safekeeper_msg(
&self,
msg: &ProposerAcceptorMessage,
) -> Result<Option<AcceptorProposerMessage>> {
self.timeline
.as_ref()
.unwrap()
.process_msg(msg)
.context("failed to process ProposerAcceptorMessage")
}
///
/// Handle IDENTIFY_SYSTEM replication command
///
fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<()> {
let tli = self.timeline.as_ref().unwrap();
let tli = GlobalTimelines::get(self.zttid)?;
let lsn = if self.is_walproposer_recovery() {
// walproposer should get all local WAL until flush_lsn
tli.get_flush_lsn()?
tli.get_flush_lsn()
} else {
// other clients shouldn't get any uncommitted WAL
tli.get_state()?.0.commit_lsn
tli.get_state().0.commit_lsn
}
.to_string();
let sysid = tli.get_state()?.1.server.system_id.to_string();
let sysid = tli.get_state().1.server.system_id.to_string();
let lsn_bytes = lsn.as_bytes();
let tli = PG_TLI.to_string();
let tli_bytes = tli.as_bytes();

View File

@@ -10,8 +10,8 @@ use std::sync::Arc;
use crate::safekeeper::Term;
use crate::safekeeper::TermHistory;
use crate::timeline;
use crate::timeline::TimelineDeleteForceResult;
use crate::timelines_global_map::TimelineDeleteForceResult;
use crate::GlobalTimelines;
use crate::SafeKeeperConf;
use etcd_broker::subscription_value::SkTimelineInfo;
@@ -99,9 +99,9 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
);
check_permission(&request, Some(zttid.tenant_id))?;
let tli = GlobalTimelines::get(zttid);
let (inmem, state) = tli.get_state()?;
let flush_lsn = tli.get_flush_lsn()?;
let tli = GlobalTimelines::get(zttid)?;
let (inmem, state) = tli.get_state();
let flush_lsn = tli.get_flush_lsn();
let acc_state = AcceptorStateStatus {
term: state.acceptor_state.term,
@@ -138,12 +138,6 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
}
/// Deactivates the timeline and removes its data directory.
///
/// It does not try to stop any processing of the timeline; there is no such code at the time of writing.
/// However, it tries to check whether the timeline was active and report it to caller just in case.
/// Note that this information is inaccurate:
/// 1. There is a race condition between checking the timeline for activity and actual directory deletion.
/// 2. At the time of writing Safekeeper rarely marks a timeline inactive. E.g. disconnecting the compute node does nothing.
async fn timeline_delete_force_handler(
mut request: Request<Body>,
) -> Result<Response<Body>, ApiError> {
@@ -153,12 +147,8 @@ async fn timeline_delete_force_handler(
);
check_permission(&request, Some(zttid.tenant_id))?;
ensure_no_body(&mut request).await?;
json_response(
StatusCode::OK,
timeline::delete_force(get_conf(&request), &zttid)
.await
.map_err(ApiError::from_err)?,
)
let resp = GlobalTimelines::delete_force(&zttid).map_err(ApiError::from_err)?;
json_response(StatusCode::OK, resp)
}
/// Deactivates all timelines for the tenant and removes its data directory.
@@ -171,8 +161,7 @@ async fn tenant_delete_force_handler(
ensure_no_body(&mut request).await?;
json_response(
StatusCode::OK,
timeline::delete_force_all_for_tenant(get_conf(&request), &tenant_id)
.await
GlobalTimelines::delete_force_all_for_tenant(&tenant_id)
.map_err(ApiError::from_err)?
.iter()
.map(|(zttid, resp)| (format!("{}", zttid.timeline_id), *resp))
@@ -189,7 +178,7 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
check_permission(&request, Some(zttid.tenant_id))?;
let safekeeper_info: SkTimelineInfo = json_request(&mut request).await?;
let tli = GlobalTimelines::get(zttid);
let tli = GlobalTimelines::get(zttid)?;
tli.record_safekeeper_info(&safekeeper_info, NodeId(1))
.await?;

View File

@@ -6,6 +6,8 @@
//! modifications in tests.
//!
use std::sync::Arc;
use anyhow::Result;
use bytes::Bytes;
use serde::{Deserialize, Serialize};
@@ -17,8 +19,8 @@ use crate::safekeeper::{
AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, ProposerGreeting,
};
use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry};
use crate::timeline::TimelineTools;
use postgres_ffi::v14::pg_constants;
use crate::timeline::Timeline;
use crate::GlobalTimelines;
use postgres_ffi::v14::xlog_utils;
use postgres_ffi::WAL_SEGMENT_SIZE;
use utils::{
@@ -58,23 +60,24 @@ struct AppendResult {
/// content, and then append it with specified term and lsn. This
/// function is used to test safekeepers in different scenarios.
pub fn handle_json_ctrl(
spg: &mut SafekeeperPostgresHandler,
spg: &SafekeeperPostgresHandler,
pgb: &mut PostgresBackend,
append_request: &AppendLogicalMessage,
) -> Result<()> {
info!("JSON_CTRL request: {:?}", append_request);
let tli = GlobalTimelines::get(spg.zttid)?;
// need to init safekeeper state before AppendRequest
prepare_safekeeper(spg)?;
prepare_safekeeper(&tli)?;
// if send_proposer_elected is true, we need to update local history
if append_request.send_proposer_elected {
send_proposer_elected(spg, append_request.term, append_request.epoch_start_lsn)?;
send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn)?;
}
let inserted_wal = append_logical_message(spg, append_request)?;
let inserted_wal = append_logical_message(&tli, append_request)?;
let response = AppendResult {
state: spg.timeline.as_ref().unwrap().get_state()?.1,
state: tli.get_state().1,
inserted_wal,
};
let response_data = serde_json::to_vec(&response)?;
@@ -92,39 +95,28 @@ pub fn handle_json_ctrl(
/// Prepare safekeeper to process append requests without crashes,
/// by sending ProposerGreeting with default server.wal_seg_size.
fn prepare_safekeeper(spg: &mut SafekeeperPostgresHandler) -> Result<()> {
fn prepare_safekeeper(tli: &Arc<Timeline>) -> Result<()> {
let greeting_request = ProposerAcceptorMessage::Greeting(ProposerGreeting {
protocol_version: 2, // current protocol
pg_version: 0, // unknown
proposer_id: [0u8; 16],
system_id: 0,
ztli: spg.ztimelineid.unwrap(),
tenant_id: spg.ztenantid.unwrap(),
ztli: tli.zttid.timeline_id,
tenant_id: tli.zttid.tenant_id,
tli: 0,
wal_seg_size: WAL_SEGMENT_SIZE as u32, // 16MB, default for tests
});
let response = spg
.timeline
.as_ref()
.unwrap()
.process_msg(&greeting_request)?;
let response = tli.process_msg(&greeting_request)?;
match response {
Some(AcceptorProposerMessage::Greeting(_)) => Ok(()),
_ => anyhow::bail!("not GreetingResponse"),
}
}
fn send_proposer_elected(spg: &mut SafekeeperPostgresHandler, term: Term, lsn: Lsn) -> Result<()> {
fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> Result<()> {
// add new term to existing history
let history = spg
.timeline
.as_ref()
.unwrap()
.get_state()?
.1
.acceptor_state
.term_history;
let history = tli.get_state().1.acceptor_state.term_history;
let history = history.up_to(lsn.checked_sub(1u64).unwrap());
let mut history_entries = history.0;
history_entries.push(TermSwitchEntry { term, lsn });
@@ -137,10 +129,7 @@ fn send_proposer_elected(spg: &mut SafekeeperPostgresHandler, term: Term, lsn: L
timeline_start_lsn: lsn,
});
spg.timeline
.as_ref()
.unwrap()
.process_msg(&proposer_elected_request)?;
tli.process_msg(&proposer_elected_request)?;
Ok(())
}
@@ -153,12 +142,9 @@ struct InsertedWAL {
/// Extend local WAL with new LogicalMessage record. To do that,
/// create AppendRequest with new WAL and pass it to safekeeper.
fn append_logical_message(
spg: &mut SafekeeperPostgresHandler,
msg: &AppendLogicalMessage,
) -> Result<InsertedWAL> {
fn append_logical_message(tli: &Arc<Timeline>, msg: &AppendLogicalMessage) -> Result<InsertedWAL> {
let wal_data = xlog_utils::encode_logical_message(&msg.lm_prefix, &msg.lm_message);
let sk_state = spg.timeline.as_ref().unwrap().get_state()?.1;
let sk_state = tli.get_state().1;
let begin_lsn = msg.begin_lsn;
let end_lsn = begin_lsn + wal_data.len() as u64;
@@ -182,11 +168,7 @@ fn append_logical_message(
wal_data: Bytes::from(wal_data),
});
let response = spg
.timeline
.as_ref()
.unwrap()
.process_msg(&append_request)?;
let response = tli.process_msg(&append_request)?;
let append_response = match response {
Some(AcceptorProposerMessage::AppendResponse(resp)) => resp,

View File

@@ -7,7 +7,9 @@ use anyhow::{anyhow, bail, Result};
use bytes::BytesMut;
use tracing::*;
use crate::safekeeper::ServerInfo;
use crate::timeline::Timeline;
use crate::GlobalTimelines;
use std::net::SocketAddr;
use std::sync::mpsc::channel;
@@ -66,15 +68,21 @@ impl<'pg> ReceiveWalConn<'pg> {
// Receive information about server
let next_msg = poll_reader.recv_msg()?;
match next_msg {
let tli = match next_msg {
ProposerAcceptorMessage::Greeting(ref greeting) => {
info!(
"start handshake with wal proposer {} sysid {} timeline {}",
self.peer_addr, greeting.system_id, greeting.tli,
);
let server_info = ServerInfo {
pg_version: greeting.pg_version,
system_id: greeting.system_id,
wal_seg_size: greeting.wal_seg_size,
};
GlobalTimelines::create(spg.zttid, server_info)?
}
_ => bail!("unexpected message {:?} instead of greeting", next_msg),
}
};
let mut next_msg = Some(next_msg);
@@ -87,7 +95,7 @@ impl<'pg> ReceiveWalConn<'pg> {
while let Some(ProposerAcceptorMessage::AppendRequest(append_request)) = next_msg {
let msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
let reply = spg.process_safekeeper_msg(&msg)?;
let reply = tli.process_msg(&msg)?;
if let Some(reply) = reply {
self.write_msg(&reply)?;
}
@@ -96,13 +104,13 @@ impl<'pg> ReceiveWalConn<'pg> {
}
// flush all written WAL to the disk
let reply = spg.process_safekeeper_msg(&ProposerAcceptorMessage::FlushWAL)?;
let reply = tli.process_msg(&ProposerAcceptorMessage::FlushWAL)?;
if let Some(reply) = reply {
self.write_msg(&reply)?;
}
} else if let Some(msg) = next_msg.take() {
// process other message
let reply = spg.process_safekeeper_msg(&msg)?;
let reply = tli.process_msg(&msg)?;
if let Some(reply) = reply {
self.write_msg(&reply)?;
}
@@ -111,9 +119,9 @@ impl<'pg> ReceiveWalConn<'pg> {
// Register the connection and defer unregister. Do that only
// after processing first message, as it sets wal_seg_size,
// wanted by many.
spg.timeline.as_ref().unwrap().on_compute_connect()?;
tli.on_compute_connect()?;
_guard = Some(ComputeConnectionGuard {
timeline: Arc::clone(spg.timeline.as_ref().unwrap()),
timeline: Arc::clone(&tli),
});
first_time_through = false;
}

View File

@@ -530,9 +530,6 @@ where
state.timeline_id
);
}
if state.server.wal_seg_size == 0 {
bail!("Calling SafeKeeper::new with empty wal_seg_size");
}
Ok(SafeKeeper {
global_commit_lsn: state.commit_lsn,
@@ -788,6 +785,11 @@ where
Ok(())
}
/// Persist control file to disk, called only after timeline creation (bootstrap).
pub fn persist(&mut self) -> Result<()> {
self.persist_control_file(self.state.clone())
}
/// Persist in-memory state to the disk, taking other data from state.
fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> {
state.commit_lsn = self.inmem.commit_lsn;
@@ -938,7 +940,7 @@ where
#[cfg(test)]
mod tests {
use postgres_ffi::v14::pg_constants;
use postgres_ffi::WAL_SEGMENT_SIZE;
use super::*;
use crate::wal_storage::Storage;
@@ -966,7 +968,7 @@ mod tests {
fn test_sk_state() -> SafeKeeperState {
let mut state = SafeKeeperState::empty();
state.server.wal_seg_size = pg_constants::WAL_SEGMENT_SIZE as u32;
state.server.wal_seg_size = WAL_SEGMENT_SIZE as u32;
state.tenant_id = ZTenantId::from([1u8; 16]);
state.timeline_id = ZTimelineId::from([1u8; 16]);
state

View File

@@ -4,6 +4,7 @@
use crate::handler::SafekeeperPostgresHandler;
use crate::timeline::{ReplicaState, Timeline};
use crate::wal_storage::WalReader;
use crate::GlobalTimelines;
use anyhow::{bail, Context, Result};
use bytes::Bytes;
@@ -78,13 +79,7 @@ struct ReplicationConnGuard {
impl Drop for ReplicationConnGuard {
fn drop(&mut self) {
let res = self.timeline.remove_replica(self.replica);
if let Err(e) = res {
warn!(
"Failed to remove replica {} from timeline {}: {}",
self.replica, self.timeline.zttid, e
);
}
self.timeline.remove_replica(self.replica);
}
}
@@ -118,7 +113,7 @@ impl ReplicationConn {
// Note: deserializing is on m[1..] because we skip the tag byte.
state.hs_feedback = HotStandbyFeedback::des(&m[1..])
.context("failed to deserialize HotStandbyFeedback")?;
timeline.update_replica_state(replica_id, state)?;
timeline.update_replica_state(replica_id, state);
}
Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => {
let _reply = StandbyReply::des(&m[1..])
@@ -140,7 +135,7 @@ impl ReplicationConn {
// This replica is the source of information to resend to compute.
state.pageserver_feedback = Some(reply);
timeline.update_replica_state(replica_id, state)?;
timeline.update_replica_state(replica_id, state);
}
_ => warn!("unexpected message {:?}", msg),
}
@@ -173,7 +168,7 @@ impl ReplicationConn {
) -> Result<()> {
let _enter = info_span!("WAL sender", timeline = %spg.ztimelineid.unwrap()).entered();
let tli = Arc::clone(spg.timeline.as_ref().unwrap());
let tli = GlobalTimelines::get(spg.zttid)?;
// spawn the background thread which receives HotStandbyFeedback messages.
let bg_timeline = Arc::clone(&tli);
@@ -182,7 +177,7 @@ impl ReplicationConn {
let state = ReplicaState::new();
// This replica_id is used below to check if it's time to stop replication.
let replica_id = bg_timeline.add_replica(state)?;
let replica_id = bg_timeline.add_replica(state);
// Use a guard object to remove our entry from the timeline, when the background
// thread and us have both finished using it.
@@ -209,7 +204,7 @@ impl ReplicationConn {
.build()?;
runtime.block_on(async move {
let (inmem_state, persisted_state) = tli.get_state()?;
let (inmem_state, persisted_state) = tli.get_state();
// add persisted_state.timeline_start_lsn == Lsn(0) check
// Walproposer gets special handling: safekeeper must give proposer all
@@ -222,7 +217,7 @@ impl ReplicationConn {
// on this safekeeper itself. That's ok as (old) proposer will never be
// able to commit such WAL.
let stop_pos: Option<Lsn> = if spg.is_walproposer_recovery() {
let wal_end = tli.get_flush_lsn()?;
let wal_end = tli.get_flush_lsn();
Some(wal_end)
} else {
None
@@ -263,7 +258,7 @@ impl ReplicationConn {
} else {
// TODO: also check once in a while whether we are walsender
// to right pageserver.
if tli.should_walsender_stop(replica_id)? {
if tli.should_walsender_stop(replica_id) {
// Shut down, timeline is suspended.
// TODO create proper error type for this
bail!("end streaming to {:?}", spg.appname);

View File

@@ -1,23 +1,19 @@
//! This module implements Timeline lifecycle management and has all neccessary code
//! to glue together SafeKeeper and all other background services.
use anyhow::{anyhow, bail, Context, Result};
use anyhow::{anyhow, bail, Result};
use etcd_broker::subscription_value::SkTimelineInfo;
use postgres_ffi::v14::xlog_utils::XLogSegNo;
use postgres_ffi::XLogSegNo;
use serde::Serialize;
use tokio::sync::watch;
use std::cmp::{max, min};
use std::collections::HashMap;
use std::fs;
use parking_lot::{MappedMutexGuard, Mutex, MutexGuard};
use parking_lot::{Mutex, MutexGuard};
use std::io;
use std::path::PathBuf;
use std::time::Instant;
use tokio::sync::mpsc::Sender;
use tracing::*;
@@ -25,15 +21,15 @@ use tracing::*;
use utils::{
lsn::Lsn,
pq_proto::ReplicationFeedback,
zid::{NodeId, ZTenantId, ZTenantTimelineId},
zid::{NodeId, ZTenantTimelineId},
};
use crate::control_file;
use crate::safekeeper::{
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
SafekeeperMemState, ServerInfo,
};
use crate::send_wal::HotStandbyFeedback;
use crate::{control_file, GlobalTimelines};
use crate::metrics::FullTimelineInfo;
use crate::wal_storage;
@@ -75,7 +71,7 @@ impl ReplicaState {
}
/// Shared state associated with database instance
struct SharedState {
pub struct SharedState {
/// Safekeeper object
sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
/// State of replicas
@@ -97,8 +93,8 @@ struct SharedState {
}
impl SharedState {
/// Initialize timeline state, create a control file on disk.
fn create(
/// Initialize fresh timeline state without persisting anything to disk.
fn create_new(
conf: &SafeKeeperConf,
zttid: &ZTenantTimelineId,
state: SafeKeeperState,
@@ -107,6 +103,8 @@ impl SharedState {
bail!(TimelineError::UninitializedWalSegSize(*zttid));
}
// We don't want to write anything to disk, because we may have existing timeline there.
// These functions should not change anything on disk.
let control_store = control_file::FileStorage::create_new(zttid, conf, state)?;
let wal_store = wal_storage::PhysicalStorage::new(zttid, conf, &control_store)?;
let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?;
@@ -148,7 +146,6 @@ impl SharedState {
})
}
fn is_active(&self) -> bool {
self.is_wal_backup_required()
// FIXME: add tracking of relevant pageservers and check them here individually,
@@ -269,225 +266,189 @@ impl SharedState {
}
}
enum TimelineState {
/// Timeline on-disk state is unknown. We either haven't tried to restore
/// timeline state from disk or got an error.
Uninitialized(UninitializedState),
/// Timeline exists on disk and loaded in memory.
Loaded(Box<SharedState>),
/// Timeline was deleted and cannot be used anymore.
Deleted,
}
impl TimelineState {
fn inner_mut(&mut self) -> &mut SharedState {
match self {
TimelineState::Loaded(state) => state,
_ => panic!("timeline state is not initialized"),
}
}
}
#[derive(Debug)]
struct UninitializedState {
/// Safekeeper config.
conf: Box<SafeKeeperConf>,
/// Error that occurred on last attempt to restore timeline state from disk.
restore_error: Option<String>,
/// Timestamp of the last restore attempt.
last_restore_attempt: Option<Instant>,
}
#[derive(Debug, thiserror::Error)]
pub enum TimelineError {
#[error("Timeline {0} was deleted and cannot be used anymore")]
Deleted(ZTenantTimelineId),
#[error("Timeline {0} was cancelled and cannot be used anymore")]
Cancelled(ZTenantTimelineId),
#[error("Timeline {0} was not found in global map")]
NotFound(ZTenantTimelineId),
#[error("Timeline {0} exists on disk, but wasn't loaded on startup")]
Invalid(ZTenantTimelineId),
#[error("Timeline {0} is already exists")]
AlreadyExists(ZTenantTimelineId),
#[error("Timeline {0} is not initialized, wal_seg_size is zero")]
UninitializedWalSegSize(ZTenantTimelineId),
#[error("Timeline {0} was not found on disk")]
NotFound(ZTenantTimelineId),
}
/// Timeline struct manages lifecycle (creation, deletion, restore) of a safekeeper timeline.
/// It also holds SharedState and provides mutually exclusive access to it.
pub struct Timeline {
pub zttid: ZTenantTimelineId,
/// Sending here asks for wal backup launcher attention (start/stop
/// offloading). Sending zttid instead of concrete command allows to do
/// sending without timeline lock.
wal_backup_launcher_tx: Sender<ZTenantTimelineId>,
/// Used to broadcast commit_lsn updates to all background jobs.
commit_lsn_watch_tx: watch::Sender<Lsn>,
/// For breeding receivers.
commit_lsn_watch_rx: watch::Receiver<Lsn>,
mutex: Mutex<TimelineState>,
/// Safekeeper and other state, that should remain consistent and synchronized
/// with the disk.
mutex: Mutex<SharedState>,
/// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal.
cancellation_tx: watch::Sender<bool>,
/// Timeline should not be used after cancellation. Background tasks should
/// monitor this channel and stop eventually after receiving `true` from this channel.
cancellation_rx: watch::Receiver<bool>,
/// Directory where timeline state is stored.
timeline_dir: PathBuf,
}
impl Timeline {
/// Create a new uninitialized timeline. Timeline will be tried to restore from disk
/// automatically on most function calls.
pub fn new(
/// Load existing timeline from disk.
pub fn load_timeline(
conf: SafeKeeperConf,
zttid: ZTenantTimelineId,
wal_backup_launcher_tx: Sender<ZTenantTimelineId>,
) -> Timeline {
let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID);
Timeline {
) -> Result<Timeline> {
let shared_state = SharedState::restore(&conf, &zttid)?;
let (commit_lsn_watch_tx, commit_lsn_watch_rx) =
watch::channel(shared_state.sk.state.commit_lsn);
let (cancellation_tx, cancellation_rx) = watch::channel(false);
Ok(Timeline {
zttid,
wal_backup_launcher_tx,
commit_lsn_watch_tx,
commit_lsn_watch_rx,
mutex: Mutex::new(TimelineState::Uninitialized(UninitializedState {
conf: Box::new(conf),
restore_error: None,
last_restore_attempt: None,
})),
}
mutex: Mutex::new(shared_state),
cancellation_rx,
cancellation_tx,
timeline_dir: conf.timeline_dir(&zttid),
})
}
/// Try to restore timeline state from disk.
fn load_from_disk(
zttid: &ZTenantTimelineId,
uninit: &mut UninitializedState,
) -> Result<SharedState> {
info!("Restoring timeline {} from disk", zttid);
/// Create a new timeline, which is not yet persisted to disk.
pub fn create_empty(
conf: SafeKeeperConf,
zttid: ZTenantTimelineId,
wal_backup_launcher_tx: Sender<ZTenantTimelineId>,
server_info: ServerInfo,
) -> Result<Timeline> {
let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID);
let (cancellation_tx, cancellation_rx) = watch::channel(false);
let state = SafeKeeperState::new(&zttid, server_info, vec![]);
let res = SharedState::restore(&uninit.conf, zttid);
if let Err(e) = &res {
uninit.restore_error = Some(format!("{}", e));
uninit.last_restore_attempt = Some(Instant::now());
error!("Failed to restore timeline {} from disk: {}", zttid, e);
}
res
Ok(Timeline {
zttid,
wal_backup_launcher_tx,
commit_lsn_watch_tx,
commit_lsn_watch_rx,
mutex: Mutex::new(SharedState::create_new(&conf, &zttid, state)?),
cancellation_rx,
cancellation_tx,
timeline_dir: conf.timeline_dir(&zttid),
})
}
/// Try to create a new timeline on disk.
fn create_on_disk(
zttid: &ZTenantTimelineId,
uninit: &mut UninitializedState,
state: SafeKeeperState,
) -> Result<SharedState> {
let conf = &uninit.conf;
info!("Creating timeline {} on disk", zttid);
// TODO: check directory existence
let dir = conf.timeline_dir(zttid);
fs::create_dir_all(dir)?;
SharedState::create(conf, zttid, state).context("failed to create shared state")
}
/// Initialize timeline with shared_state.
fn set_shared_state(
&self,
state: SharedState,
state_lock: &mut MutexGuard<TimelineState>,
) -> Result<()> {
assert!(matches!(&**state_lock, TimelineState::Uninitialized(_)));
self.commit_lsn_watch_tx.send(state.sk.inmem.commit_lsn)?;
**state_lock = TimelineState::Loaded(Box::new(state));
Ok(())
}
/// Require timeline state to be loaded. If it's not loaded, try to restore it from disk.
fn require_loaded(&self) -> Result<MappedMutexGuard<SharedState>> {
let mut state = self.mutex.lock();
match &mut *state {
TimelineState::Loaded(_) => {}
TimelineState::Uninitialized(uninit) => {
if let Some(err) = &uninit.restore_error {
// We have an error from last restore attempt, next attempt will not help
// unless something was changed on disk manually. If we will try to restore
// every time, we will have a lot of spam in the logs.
bail!(err.clone())
}
// TODO: we can allow restoring once a minute, to automatically recover from
// restore failure when something was changed on disk.
let shared_state = Self::load_from_disk(&self.zttid, uninit)?;
self.set_shared_state(shared_state, &mut state)?;
}
TimelineState::Deleted => bail!(TimelineError::Deleted(self.zttid)),
}
Ok(MutexGuard::map(state, TimelineState::inner_mut))
}
/// Try to load timeline state from disk. If timeline control file is not found,
/// create and initialize a state for the new timeline.
pub fn init_create_or_load(&self, server_info: ServerInfo) -> Result<()> {
let mut state_lock = self.mutex.lock();
match &mut *state_lock {
TimelineState::Uninitialized(uninit) => {
let shared_state = Self::load_from_disk(&self.zttid, uninit);
match shared_state {
Ok(shared_state) => {
// restored successfully
self.set_shared_state(shared_state, &mut state_lock)?;
Ok(())
}
Err(e) => {
if let Some(e) = e.downcast_ref::<TimelineError>() {
if matches!(e, TimelineError::NotFound(_)) {
// timeline does not exist on disk, create new one
let state = SafeKeeperState::new(&self.zttid, server_info, vec![]);
let shared_state =
Self::create_on_disk(&self.zttid, uninit, state)?;
self.set_shared_state(shared_state, &mut state_lock)?;
return Ok(());
}
}
Err(e)
}
}
}
TimelineState::Loaded(_) => Ok(()),
TimelineState::Deleted => bail!(TimelineError::Deleted(self.zttid)),
}
}
/// Deactivates and marks timeline as deleted. Returns whether the timeline was
/// already active. The timeline can no longer be used after this call, almost
/// all functions will return `TimelineError::Deleted`.
/// Initialize fresh timeline on disk and start background tasks. If bootstrap
/// fails, timeline is cancelled and cannot be used anymore.
///
/// We assume all threads will stop by themselves eventually (possibly with errors,
/// but no panics). There should be no compute threads (as we're deleting the timeline),
/// actually. Some WAL may be left unsent, but we're deleting the timeline anyway.
async fn delete(&self) -> bool {
let was_active = {
let mut state_lock = self.mutex.lock();
let was_active = match &*state_lock {
TimelineState::Loaded(state) => state.active,
TimelineState::Uninitialized(_) => false,
/// Bootstrap is transactional, so if it fails, created files will be deleted,
/// and state on disk should remain unchanged.
pub fn bootstrap(&self, shared_state: &mut MutexGuard<SharedState>) -> Result<()> {
match std::fs::metadata(&self.timeline_dir) {
Ok(_) => {
// Timeline directory exists on disk, we should leave state unchanged
// and return error.
bail!(TimelineError::Invalid(self.zttid));
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
Err(e) => {
return Err(e.into());
}
}
// already deleted
TimelineState::Deleted => return false,
};
// Create timeline directory.
std::fs::create_dir_all(&self.timeline_dir)?;
*state_lock = TimelineState::Deleted;
was_active
};
// Write timeline to disk and TODO: start background tasks.
match || -> Result<()> {
shared_state.sk.persist()?;
// TODO: add more initialization steps here
Ok(())
}() {
Ok(_) => Ok(()),
Err(e) => {
// Bootstrap failed, cancel timeline and remove timeline directory.
self.cancel();
// XXX: we can have a cancellation channel to notify all tasks that they should stop
if let Err(fs_err) = std::fs::remove_dir_all(&self.timeline_dir) {
warn!(
"failed to remove timeline {} directory after bootstrap failure: {}",
self.zttid, fs_err
);
}
let res = self.wal_backup_launcher_tx.send(self.zttid).await;
Err(e)
}
}
}
/// Delete timeline from disk completely, by removing timeline directory. Background
/// timeline activities will stop eventually.
pub fn delete_from_disk(
&self,
shared_state: &mut MutexGuard<SharedState>,
) -> Result<(bool, bool)> {
let was_active = shared_state.active;
self.cancel();
let dir_existed = delete_dir(&self.timeline_dir)?;
Ok((dir_existed, was_active))
}
/// Cancel timeline to prevent further usage. Background tasks will stop
/// eventually after receiving cancellation signal.
fn cancel(&self) {
info!("Timeline {} is cancelled", self.zttid);
let _ = self.cancellation_tx.send(true);
let res = self.wal_backup_launcher_tx.blocking_send(self.zttid);
if let Err(e) = res {
error!("Failed to send stop signal to wal_backup_launcher: {}", e);
}
}
was_active
/// Returns if timeline is cancelled.
pub fn is_cancelled(&self) -> bool {
*self.cancellation_rx.borrow()
}
/// Take a writing mutual exclusive lock on timeline shared_state.
pub fn write_shared_state(&self) -> MutexGuard<SharedState> {
self.mutex.lock()
}
/// Register compute connection, starting timeline-related activity if it is
/// not running yet.
pub fn on_compute_connect(&self) -> Result<()> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.zttid));
}
let is_wal_backup_action_pending: bool;
{
let mut shared_state = self.require_loaded()?;
let mut shared_state = self.write_shared_state();
shared_state.num_computes += 1;
is_wal_backup_action_pending = shared_state.update_status(self.zttid);
}
// Wake up wal backup launcher, if offloading not started yet.
if is_wal_backup_action_pending {
// Can fail only if channel to a static thread got closed, which is not normal at all.
self.wal_backup_launcher_tx.blocking_send(self.zttid)?;
}
Ok(())
@@ -496,14 +457,19 @@ impl Timeline {
/// De-register compute connection, shutting down timeline activity if
/// pageserver doesn't need catchup.
pub fn on_compute_disconnect(&self) -> Result<()> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.zttid));
}
let is_wal_backup_action_pending: bool;
{
let mut shared_state = self.require_loaded()?;
let mut shared_state = self.write_shared_state();
shared_state.num_computes -= 1;
is_wal_backup_action_pending = shared_state.update_status(self.zttid);
}
// Wake up wal backup launcher, if it is time to stop the offloading.
if is_wal_backup_action_pending {
// Can fail only if channel to a static thread got closed, which is not normal at all.
self.wal_backup_launcher_tx.blocking_send(self.zttid)?;
}
Ok(())
@@ -511,8 +477,12 @@ impl Timeline {
/// Returns true if walsender should stop sending WAL to pageserver.
/// TODO: check this pageserver is actually interested in this timeline.
pub fn should_walsender_stop(&self, replica_id: usize) -> Result<bool> {
let mut shared_state = self.require_loaded()?;
pub fn should_walsender_stop(&self, replica_id: usize) -> bool {
if self.is_cancelled() {
return true;
}
let mut shared_state = self.write_shared_state();
if shared_state.num_computes == 0 {
let replica_state = shared_state.replicas[replica_id].unwrap();
let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet
@@ -520,59 +490,62 @@ impl Timeline {
replica_state.remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn);
if stop {
shared_state.update_status(self.zttid);
return Ok(true);
return true;
}
}
Ok(false)
false
}
/// Returns whether s3 offloading is required and sets current status as
/// matching it.
pub fn wal_backup_attend(&self) -> bool {
let shared_state = self.require_loaded();
if shared_state.is_err() {
if self.is_cancelled() {
return false;
}
let mut shared_state = shared_state.unwrap();
shared_state.wal_backup_attend()
self.write_shared_state().wal_backup_attend()
}
/// Can this safekeeper offload to s3? Recently joined safekeepers might not
/// have necessary WAL.
pub fn can_wal_backup(&self) -> bool {
self.require_loaded()
.map(|state| state.can_wal_backup())
.unwrap_or(false)
if self.is_cancelled() {
return false;
}
let shared_state = self.write_shared_state();
shared_state.can_wal_backup()
}
/// Returns full timeline info, required for the metrics. If the timeline is
/// not active, returns None instead.
pub fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
self.require_loaded()
.map(|state| {
if state.active {
Some(FullTimelineInfo {
zttid: self.zttid,
replicas: state
.replicas
.iter()
.filter_map(|r| r.as_ref())
.copied()
.collect(),
wal_backup_active: state.wal_backup_active,
timeline_is_active: state.active,
num_computes: state.num_computes,
last_removed_segno: state.last_removed_segno,
epoch_start_lsn: state.sk.epoch_start_lsn,
mem_state: state.sk.inmem.clone(),
persisted_state: state.sk.state.clone(),
flush_lsn: state.sk.wal_store.flush_lsn(),
})
} else {
None
}
if self.is_cancelled() {
return None;
}
let state = self.write_shared_state();
if state.active {
Some(FullTimelineInfo {
zttid: self.zttid,
replicas: state
.replicas
.iter()
.filter_map(|r| r.as_ref())
.copied()
.collect(),
wal_backup_active: state.wal_backup_active,
timeline_is_active: state.active,
num_computes: state.num_computes,
last_removed_segno: state.last_removed_segno,
epoch_start_lsn: state.sk.epoch_start_lsn,
mem_state: state.sk.inmem.clone(),
persisted_state: state.sk.state.clone(),
flush_lsn: state.sk.wal_store.flush_lsn(),
})
.unwrap_or(None)
} else {
None
}
}
/// Returns commit_lsn watch channel. Channel should be obtained only after
@@ -586,10 +559,14 @@ impl Timeline {
&self,
msg: &ProposerAcceptorMessage,
) -> Result<Option<AcceptorProposerMessage>> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.zttid));
}
let mut rmsg: Option<AcceptorProposerMessage>;
let commit_lsn: Lsn;
{
let mut shared_state = self.require_loaded()?;
let mut shared_state = self.write_shared_state();
rmsg = shared_state.sk.process_msg(msg)?;
// if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn
@@ -607,42 +584,47 @@ impl Timeline {
Ok(rmsg)
}
/// Returns wal_seg_size or None if timeline is not loaded.
pub fn get_wal_seg_size(&self) -> Result<usize> {
self.require_loaded().map(|state| state.get_wal_seg_size())
/// Returns wal_seg_size.
pub fn get_wal_seg_size(&self) -> usize {
self.write_shared_state().get_wal_seg_size()
}
/// Returns true only if the timeline is loaded and active.
pub fn is_active(&self) -> bool {
self.require_loaded()
.map(|state| state.active)
.unwrap_or(false)
if self.is_cancelled() {
return false;
}
self.write_shared_state().active
}
/// Returns state of the timeline or None if timeline is not loaded.
pub fn get_state(&self) -> Result<(SafekeeperMemState, SafeKeeperState)> {
self.require_loaded()
.map(|state| (state.sk.inmem.clone(), state.sk.state.clone()))
/// Returns state of the timeline.
pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) {
let state = self.write_shared_state();
(state.sk.inmem.clone(), state.sk.state.clone())
}
/// Returns backup_lsn or None if timeline is not loaded.
pub fn get_wal_backup_lsn(&self) -> Result<Lsn> {
self.require_loaded().map(|state| state.sk.inmem.backup_lsn)
/// Returns latest backup_lsn.
pub fn get_wal_backup_lsn(&self) -> Lsn {
self.write_shared_state().sk.inmem.backup_lsn
}
/// Sets backup_lsn to the given value.
pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> {
let mut shared_state = self.require_loaded()?;
shared_state.sk.inmem.backup_lsn = backup_lsn;
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.zttid));
}
self.write_shared_state().sk.inmem.backup_lsn = backup_lsn;
// we should check whether to shut down offloader, but this will be done
// soon by peer communication anyway.
Ok(())
}
/// Return public safekeeper info for broadcasting to broker and other peers.
pub fn get_public_info(&self, conf: &SafeKeeperConf) -> Result<SkTimelineInfo> {
let shared_state = self.require_loaded()?;
Ok(SkTimelineInfo {
pub fn get_public_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo {
let shared_state = self.write_shared_state();
SkTimelineInfo {
last_log_term: Some(shared_state.sk.get_epoch()),
flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()),
// note: this value is not flushed to control file yet and can be lost
@@ -655,7 +637,7 @@ impl Timeline {
peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn),
safekeeper_connstr: Some(conf.listen_pg_addr.clone()),
backup_lsn: Some(shared_state.sk.inmem.backup_lsn),
})
}
}
/// Update timeline state with peer safekeeper data.
@@ -667,7 +649,7 @@ impl Timeline {
let is_wal_backup_action_pending: bool;
let commit_lsn: Lsn;
{
let mut shared_state = self.require_loaded()?;
let mut shared_state = self.write_shared_state();
shared_state.sk.record_safekeeper_info(sk_info)?;
is_wal_backup_action_pending = shared_state.update_status(self.zttid);
commit_lsn = shared_state.sk.inmem.commit_lsn;
@@ -681,39 +663,39 @@ impl Timeline {
}
/// Add send_wal replica to the in-memory vector of replicas.
pub fn add_replica(&self, state: ReplicaState) -> Result<usize> {
let mut shared_state = self.require_loaded()?;
Ok(shared_state.add_replica(state))
pub fn add_replica(&self, state: ReplicaState) -> usize {
self.write_shared_state().add_replica(state)
}
/// Update replication replica state.
pub fn update_replica_state(&self, id: usize, state: ReplicaState) -> Result<()> {
let mut shared_state = self.require_loaded()?;
pub fn update_replica_state(&self, id: usize, state: ReplicaState) {
let mut shared_state = self.write_shared_state();
shared_state.replicas[id] = Some(state);
Ok(())
}
/// Remove send_wal replica from the in-memory vector of replicas.
pub fn remove_replica(&self, id: usize) -> Result<()> {
let mut shared_state = self.require_loaded()?;
pub fn remove_replica(&self, id: usize) {
let mut shared_state = self.write_shared_state();
assert!(shared_state.replicas[id].is_some());
shared_state.replicas[id] = None;
Ok(())
}
/// Returns flush_lsn.
pub fn get_flush_lsn(&self) -> Result<Lsn> {
let shared_state = self.require_loaded()?;
Ok(shared_state.sk.wal_store.flush_lsn())
pub fn get_flush_lsn(&self) -> Lsn {
self.write_shared_state().sk.wal_store.flush_lsn()
}
/// Delete WAL segments from disk that are no longer needed. This is determined
/// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn.
pub fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.zttid));
}
let horizon_segno: XLogSegNo;
let remover: Box<dyn Fn(u64) -> Result<(), anyhow::Error>>;
{
let shared_state = self.require_loaded()?;
let shared_state = self.write_shared_state();
horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled);
remover = shared_state.sk.wal_store.remove_up_to();
if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
@@ -726,100 +708,17 @@ impl Timeline {
remover(horizon_segno - 1)?;
// update last_removed_segno
let mut shared_state = self.require_loaded()?;
let mut shared_state = self.write_shared_state();
shared_state.last_removed_segno = horizon_segno;
Ok(())
}
}
#[derive(Clone, Copy, Serialize)]
pub struct TimelineDeleteForceResult {
pub dir_existed: bool,
pub was_active: bool,
}
/// Deletes directory and it's contents. Returns false if directory does not exist.
fn delete_dir(path: PathBuf) -> Result<bool> {
fn delete_dir(path: &PathBuf) -> Result<bool> {
match std::fs::remove_dir_all(path) {
Ok(_) => Ok(true),
Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false),
Err(e) => Err(e.into()),
}
}
/// Deactivates and sets TimelineState to Deleted, see `Timeline::delete()`, then deletes the
/// corresponding data directory. We assume all timeline threads will eventually check that
/// timeline is Deleted and terminate without panics.
///
/// Timeline cannot be recreated because we keep all deleted timelines in memory. It can be
/// accidentally created again if safekeeper will restart and compute will connect to it, but this
/// is very unlikely.
pub async fn delete_force(
conf: &SafeKeeperConf,
zttid: &ZTenantTimelineId,
) -> Result<TimelineDeleteForceResult> {
info!("deleting timeline {}", zttid);
let timeline = GlobalTimelines::get(*zttid);
let was_active = timeline.delete().await;
Ok(TimelineDeleteForceResult {
dir_existed: delete_dir(conf.timeline_dir(zttid))?,
was_active,
})
}
/// Deactivates and deletes all timelines for the tenant. Returns map of all timelines which
/// the tenant had, `true` if a timeline was active. There may be a race if new timelines are
/// created simultaneously. In that case the function will return error and the caller should
/// try to delete tenant again later.
pub async fn delete_force_all_for_tenant(
conf: &SafeKeeperConf,
tenant_id: &ZTenantId,
) -> Result<HashMap<ZTenantTimelineId, TimelineDeleteForceResult>> {
info!("deleting all timelines for tenant {}", tenant_id);
let to_delete = GlobalTimelines::get_all_for_tenant(*tenant_id);
let mut err = None;
let mut deleted = HashMap::new();
for tli in &to_delete {
let was_active = tli.delete().await;
let res = delete_dir(conf.timeline_dir(&tli.zttid));
match res {
Ok(dir_existed) => {
deleted.insert(
tli.zttid,
TimelineDeleteForceResult {
dir_existed,
was_active,
},
);
}
Err(e) => {
error!("failed to delete timeline {}: {}", tli.zttid, e);
// Save error to return later.
err = Some(e);
}
}
}
// There may be inactive timelines, so delete the whole tenant dir as well.
delete_dir(conf.tenant_dir(tenant_id))?;
let tlis_after_delete = GlobalTimelines::get_all_for_tenant(*tenant_id);
if tlis_after_delete.len() != to_delete.len() {
// Some timelines were created while we were deleting them, returning error
// to the caller, so it can retry later.
bail!(
"failed to delete all timelines for tenant {}: some timelines were created while we were deleting them",
tenant_id
);
}
if let Some(e) = err {
Err(e)
} else {
Ok(deleted)
}
}

View File

@@ -1,12 +1,20 @@
//! This module contains global (tenant_id, timeline_id) -> Arc<Timeline> mapping.
//! All timelines should always be present in this map, this is done by loading them
//! all from the disk on startup and keeping them in memory.
use crate::timeline::Timeline;
use crate::safekeeper::ServerInfo;
use crate::timeline::{Timeline, TimelineError};
use crate::SafeKeeperConf;
use anyhow::{anyhow, bail, Context, Result};
use once_cell::sync::Lazy;
use serde::Serialize;
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::{Arc, Mutex, MutexGuard};
use tokio::sync::mpsc::Sender;
use utils::zid::{ZTenantId, ZTenantTimelineId};
use tracing::*;
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
struct GlobalTimelinesState {
timelines: HashMap<ZTenantTimelineId, Arc<Timeline>>,
@@ -14,6 +22,34 @@ struct GlobalTimelinesState {
conf: SafeKeeperConf,
}
impl GlobalTimelinesState {
/// Get dependencies for a timeline constructor.
fn get_dependencies(&self) -> (SafeKeeperConf, Sender<ZTenantTimelineId>) {
(
self.conf.clone(),
self.wal_backup_launcher_tx.as_ref().unwrap().clone(),
)
}
/// Insert timeline into the map. Returns error if timeline with the same id already exists.
fn try_insert(&mut self, timeline: Arc<Timeline>) -> Result<()> {
let zttid = timeline.zttid;
if self.timelines.contains_key(&zttid) {
bail!(TimelineError::AlreadyExists(zttid));
}
self.timelines.insert(zttid, timeline);
Ok(())
}
/// Get timeline from the map. Returns error if timeline doesn't exist.
fn get(&self, zttid: &ZTenantTimelineId) -> Result<Arc<Timeline>> {
self.timelines
.get(zttid)
.cloned()
.ok_or_else(|| anyhow!(TimelineError::NotFound(*zttid)))
}
}
static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
Mutex::new(GlobalTimelinesState {
timelines: HashMap::new(),
@@ -26,41 +62,184 @@ static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
pub struct GlobalTimelines;
impl GlobalTimelines {
// Inject dependencies needed for the timeline constructors.
pub fn init(conf: SafeKeeperConf, wal_backup_launcher_tx: Sender<ZTenantTimelineId>) {
/// Inject dependencies needed for the timeline constructors and load all timelines to memory.
pub fn init(
conf: SafeKeeperConf,
wal_backup_launcher_tx: Sender<ZTenantTimelineId>,
) -> Result<()> {
let mut state = TIMELINES_STATE.lock().unwrap();
assert!(state.wal_backup_launcher_tx.is_none());
state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx);
state.conf = conf;
// Iterate through all directories and load tenants for all directories
// named as a valid tenant_id.
let mut tenant_count = 0;
let tenants_dir = state.conf.workdir.clone();
for tenants_dir_entry in std::fs::read_dir(&tenants_dir)
.with_context(|| format!("failed to list tenants dir {}", tenants_dir.display()))?
{
match &tenants_dir_entry {
Ok(tenants_dir_entry) => {
if let Ok(tenant_id) =
ZTenantId::from_str(tenants_dir_entry.file_name().to_str().unwrap_or(""))
{
tenant_count += 1;
GlobalTimelines::load_tenant_timelines(&mut state, tenant_id)?;
}
}
Err(e) => error!(
"failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
tenants_dir_entry,
tenants_dir.display(),
e
),
}
}
info!(
"found {} tenants directories, successfully loaded {} timelines",
tenant_count,
state.timelines.len()
);
Ok(())
}
/// Get a timeline from the global map. If it doesn't exist in the map, new uninitialised timeline
/// will be added to the map.
pub fn get(zttid: ZTenantTimelineId) -> Arc<Timeline> {
let mut global_lock = TIMELINES_STATE.lock().unwrap();
/// Loads all timelines for the given tenant to memory. Returns fs::read_dir errors if any.
fn load_tenant_timelines(
state: &mut MutexGuard<GlobalTimelinesState>,
tenant_id: ZTenantId,
) -> Result<()> {
let timelines_dir = state.conf.tenant_dir(&tenant_id);
for timelines_dir_entry in std::fs::read_dir(&timelines_dir)
.with_context(|| format!("failed to list timelines dir {}", timelines_dir.display()))?
{
match &timelines_dir_entry {
Ok(timeline_dir_entry) => {
if let Ok(timeline_id) =
ZTimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
{
let zttid = ZTenantTimelineId::new(tenant_id, timeline_id);
match Timeline::load_timeline(
state.conf.clone(),
zttid,
state.wal_backup_launcher_tx.as_ref().unwrap().clone(),
) {
Ok(timeline) => {
state.timelines.insert(zttid, Arc::new(timeline));
}
// If we can't load a timeline, it's most likely because of a corrupted
// directory. We will log an error and won't allow to delete/recreate
// this timeline. The only way to fix this timeline is to repair manually
// and restart the safekeeper.
Err(e) => error!(
"failed to load timeline {} for tenant {}, reason: {:?}",
timeline_id, tenant_id, e
),
}
}
}
Err(e) => error!(
"failed to list timelines dir entry {:?} in directory {}, reason: {:?}",
timelines_dir_entry,
timelines_dir.display(),
e
),
}
}
Ok(())
}
/// Create a new timeline with the given id. If the timeline already exists, returns
/// an existing timeline.
pub fn create(zttid: ZTenantTimelineId, server_info: ServerInfo) -> Result<Arc<Timeline>> {
let (conf, wal_backup_launcher_tx) = {
let state = TIMELINES_STATE.lock().unwrap();
if let Ok(timeline) = state.get(&zttid) {
// Timeline already exists, return it.
return Ok(timeline);
}
state.get_dependencies()
};
info!("creating new timeline {}", zttid);
let timeline = Arc::new(Timeline::create_empty(
conf,
zttid,
wal_backup_launcher_tx,
server_info,
)?);
// Take a lock and finish the initialization holding this mutex. No other threads
// can interfere with creation after we will insert timeline into the map.
let mut shared_state = timeline.write_shared_state();
// We can get a race condition here in case of concurrent create calls, but only
// in theory. create() will return valid timeline on the next try.
TIMELINES_STATE
.lock()
.unwrap()
.try_insert(timeline.clone())?;
// Write the new timeline to the disk and start background workers.
// Bootstrap is transactional, so if it fails, the timeline will be deleted,
// and the state on disk should remain unchanged.
match timeline.bootstrap(&mut shared_state) {
Ok(_) => {
// We are done with bootstrap, release the lock, return the timeline.
drop(shared_state);
Ok(timeline)
}
Err(e) => {
// Note: the most likely reason for bootstrap failure is that the timeline
// directory already exists on disk. This happens when timeline is corrupted
// and wasn't loaded from disk on startup because of that. We want to preserve
// the timeline directory in this case, for further inspection.
// TODO: this is an unusual error, perhaps we should send it to sentry
// TODO: compute will try to create timeline every second, we should add backoff
error!("failed to bootstrap timeline {}: {}", zttid, e);
// Timeline failed to bootstrap, it cannot be used. Remove it from the map.
TIMELINES_STATE.lock().unwrap().timelines.remove(&zttid);
Err(e)
}
}
}
/// Get a timeline from the global map. If it's not present, it doesn't exist on disk,
/// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid,
/// i.e. loaded in memory and not cancelled.
pub fn get(zttid: ZTenantTimelineId) -> Result<Arc<Timeline>> {
let global_lock = TIMELINES_STATE.lock().unwrap();
match global_lock.timelines.get(&zttid) {
Some(result) => Arc::clone(result),
None => {
let tli = Arc::new(Timeline::new(
global_lock.conf.clone(),
zttid,
global_lock.wal_backup_launcher_tx.as_ref().unwrap().clone(),
));
global_lock.timelines.insert(zttid, tli.clone());
tli
Some(result) => {
if result.is_cancelled() {
anyhow::bail!(TimelineError::Cancelled(zttid));
}
Ok(Arc::clone(result))
}
None => anyhow::bail!(TimelineError::NotFound(zttid)),
}
}
/// Returns all timelines. This is used for background timeline proccesses.
pub fn get_all() -> Vec<Arc<Timeline>> {
let global_lock = TIMELINES_STATE.lock().unwrap();
global_lock.timelines.values().cloned().collect()
global_lock
.timelines
.values()
.cloned()
.filter(|t| !t.is_cancelled())
.collect()
}
/// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant.
pub fn get_all_for_tenant(tenant_id: ZTenantId) -> Vec<Arc<Timeline>> {
/// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant,
/// and that's why it can return cancelled timelines, to retry deleting them.
fn get_all_for_tenant(tenant_id: ZTenantId) -> Vec<Arc<Timeline>> {
let global_lock = TIMELINES_STATE.lock().unwrap();
global_lock
.timelines
@@ -69,4 +248,88 @@ impl GlobalTimelines {
.cloned()
.collect()
}
/// Cancels timeline, then deletes the corresponding data directory.
pub fn delete_force(zttid: &ZTenantTimelineId) -> Result<TimelineDeleteForceResult> {
let timeline = TIMELINES_STATE.lock().unwrap().get(zttid)?;
// Take a lock and finish the deletion holding this mutex.
let mut shared_state = timeline.write_shared_state();
info!("deleting timeline {}", zttid);
let (dir_existed, was_active) = timeline.delete_from_disk(&mut shared_state)?;
// Remove timeline from the map.
TIMELINES_STATE.lock().unwrap().timelines.remove(zttid);
Ok(TimelineDeleteForceResult {
dir_existed,
was_active,
})
}
/// Deactivates and deletes all timelines for the tenant. Returns map of all timelines which
/// the tenant had, `true` if a timeline was active. There may be a race if new timelines are
/// created simultaneously. In that case the function will return error and the caller should
/// retry tenant deletion again later.
pub fn delete_force_all_for_tenant(
tenant_id: &ZTenantId,
) -> Result<HashMap<ZTenantTimelineId, TimelineDeleteForceResult>> {
info!("deleting all timelines for tenant {}", tenant_id);
let to_delete = Self::get_all_for_tenant(*tenant_id);
let mut err = None;
let mut deleted = HashMap::new();
for tli in &to_delete {
match Self::delete_force(&tli.zttid) {
Ok(result) => {
deleted.insert(tli.zttid, result);
}
Err(e) => {
error!("failed to delete timeline {}: {}", tli.zttid, e);
// Save error to return later.
err = Some(e);
}
}
}
// If there was an error, return it.
if let Some(e) = err {
return Err(e);
}
// There may be broken timelines on disk, so delete the whole tenant dir as well.
// Note that we could concurrently create new timelines while we were deleting them,
// so the directory may be not empty. In this case timelines will have bad state
// and timeline background jobs can panic.
delete_dir(TIMELINES_STATE.lock().unwrap().conf.tenant_dir(tenant_id))?;
let tlis_after_delete = Self::get_all_for_tenant(*tenant_id);
if !tlis_after_delete.is_empty() {
// Some timelines were created while we were deleting them, returning error
// to the caller, so it can retry later.
bail!(
"failed to delete all timelines for tenant {}: some timelines were created while we were deleting them",
tenant_id
);
}
Ok(deleted)
}
}
#[derive(Clone, Copy, Serialize)]
pub struct TimelineDeleteForceResult {
pub dir_existed: bool,
pub was_active: bool,
}
/// Deletes directory and it's contents. Returns false if directory does not exist.
fn delete_dir(path: PathBuf) -> Result<bool> {
match std::fs::remove_dir_all(path) {
Ok(_) => Ok(true),
Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false),
Err(e) => Err(e.into()),
}
}

View File

@@ -54,7 +54,9 @@ pub fn wal_backup_launcher_thread_main(
/// Check whether wal backup is required for timeline. If yes, mark that launcher is
/// aware of current status and return the timeline.
fn is_wal_backup_required(zttid: ZTenantTimelineId) -> Option<Arc<Timeline>> {
Some(GlobalTimelines::get(zttid)).filter(|tli| tli.wal_backup_attend())
GlobalTimelines::get(zttid)
.ok()
.filter(|tli| tli.wal_backup_attend())
}
struct WalBackupTaskHandle {
@@ -200,15 +202,15 @@ async fn backup_task_main(
election: Election,
) {
info!("started");
let tli = GlobalTimelines::get(zttid);
let res = tli.get_wal_seg_size();
let res = GlobalTimelines::get(zttid);
if let Err(e) = res {
info!("backup error for timeline {}: {}", zttid, e);
return;
}
let tli = res.unwrap();
let mut wb = WalBackupTask {
wal_seg_size: res.unwrap(),
wal_seg_size: tli.get_wal_seg_size(),
commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(),
timeline: tli,
timeline_dir,
@@ -279,12 +281,7 @@ impl WalBackupTask {
continue; /* nothing to do, common case as we wake up on every commit_lsn bump */
}
// Perhaps peers advanced the position, check shmem value.
let res = self.timeline.get_wal_backup_lsn();
if let Err(e) = res {
error!("backup error: {}", e);
return;
}
backup_lsn = res.unwrap();
backup_lsn = self.timeline.get_wal_backup_lsn();
if backup_lsn.segment_number(self.wal_seg_size)
>= commit_lsn.segment_number(self.wal_seg_size)
{

View File

@@ -156,16 +156,14 @@ pub struct PhysicalStorage {
}
impl PhysicalStorage {
/// Create new storage. If commit_lsn is not zero, flush_lsn is tried to be restored from
/// the disk. Otherwise, all LSNs are set to zero.
pub fn new(
zttid: &ZTenantTimelineId,
conf: &SafeKeeperConf,
state: &SafeKeeperState,
) -> Result<PhysicalStorage> {
let timeline_dir = conf.timeline_dir(zttid);
if state.server.wal_seg_size == 0 {
bail!("wal_seg_size must be initialized before creating PhysicalStorage");
}
let wal_seg_size = state.server.wal_seg_size as usize;
// Find out where stored WAL ends, starting at commit_lsn which is a
@@ -399,7 +397,7 @@ impl Storage for PhysicalStorage {
/// end_pos must point to the end of the WAL record.
fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
// Streaming must not create a hole, so truncate cannot be called on non-written lsn
if self.write_lsn != Lsn(0) && end_pos >= self.write_lsn {
if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
bail!(
"truncate_wal called on non-written WAL, write_lsn={}, end_pos={}",
self.write_lsn,