Files
neon/pageserver/src/timelines.rs
2022-03-10 19:38:58 +02:00

403 lines
14 KiB
Rust

//!
//! Timeline management code
//
use anyhow::{anyhow, bail, Context, Result};
use postgres_ffi::ControlFileData;
use std::{
fs,
path::Path,
process::{Command, Stdio},
sync::Arc,
};
use tracing::*;
use zenith_utils::lsn::Lsn;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use zenith_utils::{crashsafe_dir, logging};
use crate::{config::PageServerConf, repository::Repository};
use crate::{import_datadir, LOG_FILE_NAME};
use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager};
use crate::{repository::RepositoryTimeline, tenant_mgr};
use crate::{repository::Timeline, CheckpointConfig};
#[derive(Clone)]
pub enum TimelineInfo {
Local {
timeline_id: ZTimelineId,
tenant_id: ZTenantId,
last_record_lsn: Lsn,
prev_record_lsn: Lsn,
ancestor_timeline_id: Option<ZTimelineId>,
ancestor_lsn: Option<Lsn>,
disk_consistent_lsn: Lsn,
current_logical_size: usize,
current_logical_size_non_incremental: Option<usize>,
},
Remote {
timeline_id: ZTimelineId,
tenant_id: ZTenantId,
disk_consistent_lsn: Lsn,
},
}
impl TimelineInfo {
pub fn from_repo_timeline(
tenant_id: ZTenantId,
repo_timeline: RepositoryTimeline,
include_non_incremental_logical_size: bool,
) -> Self {
match repo_timeline {
RepositoryTimeline::Local { id, timeline } => {
let ancestor_timeline_id = timeline.get_ancestor_timeline_id();
let ancestor_lsn = if ancestor_timeline_id.is_some() {
Some(timeline.get_ancestor_lsn())
} else {
None
};
Self::Local {
timeline_id: id,
tenant_id,
last_record_lsn: timeline.get_last_record_lsn(),
prev_record_lsn: timeline.get_prev_record_lsn(),
ancestor_timeline_id,
ancestor_lsn,
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
current_logical_size: timeline.get_current_logical_size(),
current_logical_size_non_incremental: get_current_logical_size_non_incremental(
include_non_incremental_logical_size,
timeline.as_ref(),
),
}
}
RepositoryTimeline::Remote {
id,
disk_consistent_lsn,
} => Self::Remote {
timeline_id: id,
tenant_id,
disk_consistent_lsn,
},
}
}
pub fn from_dyn_timeline(
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
timeline: &dyn Timeline,
include_non_incremental_logical_size: bool,
) -> Self {
let ancestor_timeline_id = timeline.get_ancestor_timeline_id();
let ancestor_lsn = if ancestor_timeline_id.is_some() {
Some(timeline.get_ancestor_lsn())
} else {
None
};
Self::Local {
timeline_id,
tenant_id,
last_record_lsn: timeline.get_last_record_lsn(),
prev_record_lsn: timeline.get_prev_record_lsn(),
ancestor_timeline_id,
ancestor_lsn,
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
current_logical_size: timeline.get_current_logical_size(),
current_logical_size_non_incremental: get_current_logical_size_non_incremental(
include_non_incremental_logical_size,
timeline,
),
}
}
pub fn timeline_id(&self) -> ZTimelineId {
match *self {
TimelineInfo::Local { timeline_id, .. } => timeline_id,
TimelineInfo::Remote { timeline_id, .. } => timeline_id,
}
}
pub fn tenant_id(&self) -> ZTenantId {
match *self {
TimelineInfo::Local { tenant_id, .. } => tenant_id,
TimelineInfo::Remote { tenant_id, .. } => tenant_id,
}
}
}
fn get_current_logical_size_non_incremental(
include_non_incremental_logical_size: bool,
timeline: &dyn Timeline,
) -> Option<usize> {
if !include_non_incremental_logical_size {
return None;
}
match timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn()) {
Ok(size) => Some(size),
Err(e) => {
error!("Failed to get non-incremental logical size: {:?}", e);
None
}
}
}
#[derive(Debug, Clone, Copy)]
pub struct PointInTime {
pub timeline_id: ZTimelineId,
pub lsn: Lsn,
}
pub fn init_pageserver(
conf: &'static PageServerConf,
create_tenant: Option<ZTenantId>,
initial_timeline_id: Option<ZTimelineId>,
) -> anyhow::Result<()> {
// Initialize logger
// use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages
let _log_file = logging::init(LOG_FILE_NAME, true)?;
// We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
// process during repository initialization.
//
// FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched
// initdb in the background, and it kept running even after the "zenith init" had exited.
// In tests, we started the page server immediately after that, so that initdb was still
// running in the background, and we failed to run initdb again in the same directory. This
// has been solved for the rapid init+start case now, but the general race condition remains
// if you restart the server quickly. The WAL redo manager doesn't use a separate thread
// anymore, but I think that could still happen.
let dummy_redo_mgr = Arc::new(crate::walredo::DummyRedoManager {});
crashsafe_dir::create_dir_all(conf.tenants_path())?;
if let Some(tenant_id) = create_tenant {
println!("initializing tenantid {}", tenant_id);
let repo = create_repo(conf, tenant_id, dummy_redo_mgr)
.context("failed to create repo")?
.ok_or_else(|| anyhow!("For newely created pageserver, found already existing repository for tenant {}", tenant_id))?;
let new_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate);
bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())
.context("failed to create initial timeline")?;
println!("initial timeline {} created", new_timeline_id)
} else if initial_timeline_id.is_some() {
println!("Ignoring initial timeline parameter, due to no tenant id to create given");
}
println!("pageserver init succeeded");
Ok(())
}
pub fn create_repo(
conf: &'static PageServerConf,
tenant_id: ZTenantId,
wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
) -> Result<Option<Arc<dyn Repository>>> {
let repo_dir = conf.tenant_path(&tenant_id);
if repo_dir.exists() {
debug!("repo for {} already exists", tenant_id);
return Ok(None);
}
// top-level dir may exist if we are creating it through CLI
crashsafe_dir::create_dir_all(&repo_dir)
.with_context(|| format!("could not create directory {}", repo_dir.display()))?;
crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?;
info!("created directory structure in {}", repo_dir.display());
Ok(Some(Arc::new(LayeredRepository::new(
conf,
wal_redo_manager,
tenant_id,
conf.remote_storage_config.is_some(),
))))
}
// Returns checkpoint LSN from controlfile
fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
// Read control file to extract the LSN
let controlfile_path = path.join("global").join("pg_control");
let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?;
let lsn = controlfile.checkPoint;
Ok(Lsn(lsn))
}
// Create the cluster temporarily in 'initdbpath' directory inside the repository
// to get bootstrap data for timeline initialization.
//
fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
info!("running initdb in {}... ", initdbpath.display());
let initdb_path = conf.pg_bin_dir().join("initdb");
let initdb_output = Command::new(initdb_path)
.args(&["-D", initdbpath.to_str().unwrap()])
.args(&["-U", &conf.superuser])
.args(&["-E", "utf8"])
.arg("--no-instructions")
// This is only used for a temporary installation that is deleted shortly after,
// so no need to fsync it
.arg("--no-sync")
.env_clear()
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
.stdout(Stdio::null())
.output()
.context("failed to execute initdb")?;
if !initdb_output.status.success() {
bail!(
"initdb failed: '{}'",
String::from_utf8_lossy(&initdb_output.stderr)
);
}
Ok(())
}
//
// - run initdb to init temporary instance and get bootstrap data
// - after initialization complete, remove the temp dir.
//
fn bootstrap_timeline(
conf: &'static PageServerConf,
tenantid: ZTenantId,
tli: ZTimelineId,
repo: &dyn Repository,
) -> Result<Arc<dyn Timeline>> {
let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
let initdb_path = conf.tenant_path(&tenantid).join("tmp");
// Init temporarily repo to get bootstrap data
run_initdb(conf, &initdb_path)?;
let pgdata_path = initdb_path;
let lsn = get_lsn_from_controlfile(&pgdata_path)?.align();
// Import the contents of the data directory at the initial checkpoint
// LSN, and any WAL after that.
// Initdb lsn will be equal to last_record_lsn which will be set after import.
// Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline.
let timeline = repo.create_empty_timeline(tli, lsn)?;
import_datadir::import_timeline_from_postgres_datadir(
&pgdata_path,
timeline.writer().as_ref(),
lsn,
)?;
timeline.checkpoint(CheckpointConfig::Forced)?;
println!(
"created initial timeline {} timeline.lsn {}",
tli,
timeline.get_last_record_lsn()
);
// Remove temp dir. We don't need it anymore
fs::remove_dir_all(pgdata_path)?;
Ok(timeline)
}
pub(crate) fn get_timelines(
tenant_id: ZTenantId,
include_non_incremental_logical_size: bool,
) -> Result<Vec<TimelineInfo>> {
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)
.with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?;
Ok(repo
.list_timelines()
.with_context(|| format!("Failed to list timelines for tenant {}", tenant_id))?
.into_iter()
.filter_map(|timeline| match timeline {
RepositoryTimeline::Local { timeline, id } => Some((id, timeline)),
RepositoryTimeline::Remote { .. } => None,
})
.map(|(timeline_id, timeline)| {
TimelineInfo::from_dyn_timeline(
tenant_id,
timeline_id,
timeline.as_ref(),
include_non_incremental_logical_size,
)
})
.collect())
}
pub(crate) fn create_timeline(
conf: &'static PageServerConf,
tenant_id: ZTenantId,
new_timeline_id: Option<ZTimelineId>,
ancestor_timeline_id: Option<ZTimelineId>,
ancestor_start_lsn: Option<Lsn>,
) -> Result<TimelineInfo> {
let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate);
if conf.timeline_path(&new_timeline_id, &tenant_id).exists() {
bail!("timeline {} already exists", new_timeline_id);
}
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
let mut start_lsn = ancestor_start_lsn.unwrap_or(Lsn(0));
match ancestor_timeline_id {
Some(ancestor_timeline_id) => {
let ancestor_timeline = repo
.get_timeline(ancestor_timeline_id)
.with_context(|| format!("Cannot get ancestor timeline {}", ancestor_timeline_id))?
.local_timeline()
.with_context(|| {
format!(
"Cannot branch off the timeline {} that's not present locally",
ancestor_timeline_id
)
})?;
if start_lsn == Lsn(0) {
// Find end of WAL on the old timeline
let end_of_wal = ancestor_timeline.get_last_record_lsn();
info!("branching at end of WAL: {}", end_of_wal);
start_lsn = end_of_wal;
} else {
// Wait for the WAL to arrive and be processed on the parent branch up
// to the requested branch point. The repository code itself doesn't
// require it, but if we start to receive WAL on the new timeline,
// decoding the new WAL might need to look up previous pages, relation
// sizes etc. and that would get confused if the previous page versions
// are not in the repository yet.
ancestor_timeline.wait_lsn(start_lsn)?;
}
start_lsn = start_lsn.align();
let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
if ancestor_ancestor_lsn > start_lsn {
// can we safely just branch from the ancestor instead?
anyhow::bail!(
"invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
start_lsn,
ancestor_timeline_id,
ancestor_ancestor_lsn,
);
}
repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?;
// load the timeline into memory
let loaded_timeline = repo.get_timeline(new_timeline_id)?;
Ok(TimelineInfo::from_repo_timeline(
tenant_id,
loaded_timeline,
false,
))
}
None => {
let new_timeline = bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?;
Ok(TimelineInfo::from_dyn_timeline(
tenant_id,
new_timeline_id,
new_timeline.as_ref(),
false,
))
}
}
}