Merge branch 'main' into immutable_bst_layer_map

This commit is contained in:
Bojan Serafimov
2023-01-03 18:02:20 -05:00
267 changed files with 14686 additions and 8618 deletions

View File

@@ -22,7 +22,8 @@ use std::time::SystemTime;
use tar::{Builder, EntryType, Header};
use tracing::*;
use crate::tenant::Timeline;
use crate::task_mgr;
use crate::tenant::{with_ondemand_download, PageReconstructResult, Timeline};
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
@@ -130,7 +131,7 @@ where
// Create pgdata subdirs structure
for dir in PGDATA_SUBDIRS.iter() {
let header = new_tar_header_dir(*dir)?;
let header = new_tar_header_dir(dir)?;
self.ar.append(&header, &mut io::empty())?;
}
@@ -152,23 +153,29 @@ where
SlruKind::MultiXactOffsets,
SlruKind::MultiXactMembers,
] {
for segno in self.timeline.list_slru_segments(kind, self.lsn)? {
for segno in
with_ondemand_download_sync(|| self.timeline.list_slru_segments(kind, self.lsn))?
{
self.add_slru_segment(kind, segno)?;
}
}
// Create tablespace directories
for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? {
for ((spcnode, dbnode), has_relmap_file) in
with_ondemand_download_sync(|| self.timeline.list_dbdirs(self.lsn))?
{
self.add_dbdir(spcnode, dbnode, has_relmap_file)?;
// Gather and send relational files in each database if full backup is requested.
if self.full_backup {
for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn)? {
for rel in with_ondemand_download_sync(|| {
self.timeline.list_rels(spcnode, dbnode, self.lsn)
})? {
self.add_rel(rel)?;
}
}
}
for xid in self.timeline.list_twophase_files(self.lsn)? {
for xid in with_ondemand_download_sync(|| self.timeline.list_twophase_files(self.lsn))? {
self.add_twophase_file(xid)?;
}
@@ -185,7 +192,8 @@ where
}
fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
let nblocks = self.timeline.get_rel_size(tag, self.lsn, false)?;
let nblocks =
with_ondemand_download_sync(|| self.timeline.get_rel_size(tag, self.lsn, false))?;
// Function that adds relation segment data to archive
let mut add_file = |segment_index, data: &Vec<u8>| -> anyhow::Result<()> {
@@ -208,7 +216,8 @@ where
for blknum in blocks {
let img = self
.timeline
.get_rel_page_at_lsn(tag, blknum, self.lsn, false)?;
.get_rel_page_at_lsn(tag, blknum, self.lsn, false)
.no_ondemand_download()?;
segment_data.extend_from_slice(&img[..]);
}
@@ -222,13 +231,16 @@ where
// Generate SLRU segment files from repository.
//
fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?;
let nblocks = with_ondemand_download_sync(|| {
self.timeline.get_slru_segment_size(slru, segno, self.lsn)
})?;
let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
for blknum in 0..nblocks {
let img = self
.timeline
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?;
let img = with_ondemand_download_sync(|| {
self.timeline
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn)
})?;
if slru == SlruKind::Clog {
ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
@@ -260,7 +272,9 @@ where
has_relmap_file: bool,
) -> anyhow::Result<()> {
let relmap_img = if has_relmap_file {
let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?;
let img = with_ondemand_download_sync(|| {
self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)
})?;
ensure!(img.len() == 512);
Some(img)
} else {
@@ -295,7 +309,8 @@ where
if !has_relmap_file
&& self
.timeline
.list_rels(spcnode, dbnode, self.lsn)?
.list_rels(spcnode, dbnode, self.lsn)
.no_ondemand_download()?
.is_empty()
{
return Ok(());
@@ -327,7 +342,7 @@ where
// Extract twophase state files
//
fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
let img = self.timeline.get_twophase_file(xid, self.lsn)?;
let img = with_ondemand_download_sync(|| self.timeline.get_twophase_file(xid, self.lsn))?;
let mut buf = BytesMut::new();
buf.extend_from_slice(&img[..]);
@@ -361,14 +376,12 @@ where
zenith_signal.as_bytes(),
)?;
let checkpoint_bytes = self
.timeline
.get_checkpoint(self.lsn)
.context("failed to get checkpoint bytes")?;
let pg_control_bytes = self
.timeline
.get_control_file(self.lsn)
.context("failed get control bytes")?;
let checkpoint_bytes =
with_ondemand_download_sync(|| self.timeline.get_checkpoint(self.lsn))
.context("failed to get checkpoint bytes")?;
let pg_control_bytes =
with_ondemand_download_sync(|| self.timeline.get_control_file(self.lsn))
.context("failed get control bytes")?;
let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control(
&pg_control_bytes,
@@ -490,3 +503,11 @@ where
}
}
}
fn with_ondemand_download_sync<F, T>(f: F) -> anyhow::Result<T>
where
F: Send + Fn() -> PageReconstructResult<T>,
T: Send,
{
task_mgr::COMPUTE_REQUEST_RUNTIME.block_on(with_ondemand_download(f))
}

View File

@@ -11,8 +11,8 @@
//!
//! Example use:
//! ```
//! $ cd test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE
//! $ ls | grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
//! $ grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
//! $ firefox out.svg
//! ```
//!
@@ -25,6 +25,8 @@ use anyhow::Result;
use pageserver::repository::Key;
use std::cmp::Ordering;
use std::io::{self, BufRead};
use std::path::PathBuf;
use std::str::FromStr;
use std::{
collections::{BTreeMap, BTreeSet},
ops::Range,
@@ -65,7 +67,11 @@ fn main() -> Result<()> {
let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
let stdin = io::stdin();
for line in stdin.lock().lines() {
let range = parse_filename(&line.unwrap());
let line = line.unwrap();
let line = PathBuf::from_str(&line).unwrap();
let filename = line.file_name().unwrap();
let filename = filename.to_str().unwrap();
let range = parse_filename(filename);
ranges.push(range);
}

View File

@@ -7,7 +7,7 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr};
use anyhow::{anyhow, Context};
use clap::{Arg, ArgAction, Command};
use fail::FailScenario;
use nix::unistd::Pid;
use remote_storage::GenericRemoteStorage;
use tracing::*;
use metrics::set_build_info_metric;
@@ -18,14 +18,15 @@ use pageserver::{
task_mgr::{
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
},
tenant_mgr, virtual_file,
tenant::mgr,
virtual_file,
};
use remote_storage::GenericRemoteStorage;
use utils::{
auth::JwtAuth,
lock_file, logging,
logging,
postgres_backend::AuthType,
project_git_version,
sentry_init::{init_sentry, release_name},
signals::{self, Signal},
tcp_listener,
};
@@ -85,6 +86,9 @@ fn main() -> anyhow::Result<()> {
}
};
// initialize sentry if SENTRY_DSN is provided
let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.id.to_string())]);
let tenants_path = conf.tenants_path();
if !tenants_path.exists() {
utils::crashsafe::create_dir_all(conf.tenants_path()).with_context(|| {
@@ -124,7 +128,7 @@ fn initialize_config(
);
}
// Supplement the CLI arguments with the config file
let cfg_file_contents = std::fs::read_to_string(&cfg_file_path).with_context(|| {
let cfg_file_contents = std::fs::read_to_string(cfg_file_path).with_context(|| {
format!(
"Failed to read pageserver config at '{}'",
cfg_file_path.display()
@@ -178,7 +182,7 @@ fn initialize_config(
if update_config {
info!("Writing pageserver config to '{}'", cfg_file_path.display());
std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| {
std::fs::write(cfg_file_path, toml.to_string()).with_context(|| {
format!(
"Failed to write pageserver config to '{}'",
cfg_file_path.display()
@@ -198,8 +202,12 @@ fn initialize_config(
}
fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
// Initialize logging
logging::init(conf.log_format)?;
// Print version to the log, and expose it as a prometheus metric too.
info!("version: {}", version());
set_build_info_metric(GIT_VERSION);
// If any failpoints were set from FAILPOINTS environment variable,
// print them to the log for debugging purposes
@@ -215,53 +223,37 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
)
}
// Create and lock PID file. This ensures that there cannot be more than one
// pageserver process running at the same time.
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
lock_file::LockCreationResult::Created {
new_lock_contents,
file,
} => {
info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}");
file
}
lock_file::LockCreationResult::AlreadyLocked {
existing_lock_contents,
} => anyhow::bail!(
"Could not lock pid file; pageserver is already running in {:?} with PID {}",
conf.workdir,
existing_lock_contents
),
lock_file::LockCreationResult::CreationFailed(e) => {
return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
}
};
// ensure that the lock file is held even if the main thread of the process is panics
// we need to release the lock file only when the current process is gone
let _ = Box::leak(Box::new(lock_file));
let lock_file =
utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
info!("Claimed pid file at {lock_file_path:?}");
// TODO: Check that it looks like a valid repository before going further
// Ensure that the lock file is held even if the main thread of the process panics.
// We need to release the lock file only when the process exits.
std::mem::forget(lock_file);
// bind sockets before daemonizing so we report errors early and do not return until we are listening
info!(
"Starting pageserver http handler on {}",
conf.listen_http_addr
);
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone())?;
// Bind the HTTP and libpq ports early, so that if they are in use by some other
// process, we error out early.
let http_addr = &conf.listen_http_addr;
info!("Starting pageserver http handler on {http_addr}");
let http_listener = tcp_listener::bind(http_addr)?;
info!(
"Starting pageserver pg protocol handler on {}",
conf.listen_pg_addr
);
let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
let pg_addr = &conf.listen_pg_addr;
info!("Starting pageserver pg protocol handler on {pg_addr}");
let pageserver_listener = tcp_listener::bind(pg_addr)?;
// Install signal handlers
let signals = signals::install_shutdown_handlers()?;
// start profiler (if enabled)
// Start profiler (if enabled)
let profiler_guard = profiling::init_profiler(conf);
WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_etcd_client(conf))?;
// Launch broker client
WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?;
// initialize authentication for incoming connections
// Initialize authentication for incoming connections
let auth = match &conf.auth_type {
AuthType::Trust | AuthType::MD5 => None,
AuthType::NeonJWT => {
@@ -272,46 +264,54 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
};
info!("Using auth: {:#?}", conf.auth_type);
match var("ZENITH_AUTH_TOKEN") {
Ok(v) => {
// TODO: remove ZENITH_AUTH_TOKEN once it's not used anywhere in development/staging/prod configuration.
match (var("ZENITH_AUTH_TOKEN"), var("NEON_AUTH_TOKEN")) {
(old, Ok(v)) => {
info!("Loaded JWT token for authentication with Safekeeper");
if let Ok(v_old) = old {
warn!(
"JWT token for Safekeeper is specified twice, ZENITH_AUTH_TOKEN is deprecated"
);
if v_old != v {
warn!("JWT token for Safekeeper has two different values, choosing NEON_AUTH_TOKEN");
}
}
pageserver::config::SAFEKEEPER_AUTH_TOKEN
.set(Arc::new(v))
.map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
}
Err(VarError::NotPresent) => {
(Ok(v), _) => {
info!("Loaded JWT token for authentication with Safekeeper");
warn!("Please update pageserver configuration: the JWT token should be NEON_AUTH_TOKEN, not ZENITH_AUTH_TOKEN");
pageserver::config::SAFEKEEPER_AUTH_TOKEN
.set(Arc::new(v))
.map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
}
(_, Err(VarError::NotPresent)) => {
info!("No JWT token for authentication with Safekeeper detected");
}
Err(e) => {
(_, Err(e)) => {
return Err(e).with_context(|| {
"Failed to either load to detect non-present ZENITH_AUTH_TOKEN environment variable"
"Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable"
})
}
};
let remote_storage = conf
.remote_storage_config
.as_ref()
.map(|storage_config| {
GenericRemoteStorage::from_config(conf.workdir.clone(), storage_config)
})
.transpose()
.context("Failed to init generic remote storage")?;
{
let _rt_guard = BACKGROUND_RUNTIME.enter();
tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())?
};
// Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?;
// Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME.
// bind before launching separate thread so the error reported before startup exits
// Scan the local 'tenants/' directory and start loading the tenants
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?;
// Create a Service from the router above to handle incoming requests.
// Start up the service to handle HTTP mgmt API request. We created the
// listener earlier already.
{
let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
let router = http::make_router(conf, auth.clone(), remote_storage)?;
let service =
utils::http::RouterService::new(router.build().map_err(|err| anyhow!(err))?).unwrap();
let router = http::make_router(conf, auth.clone(), remote_storage)?
.build()
.map_err(|err| anyhow!(err))?;
let service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?
.serve(service)
.with_graceful_shutdown(task_mgr::shutdown_watcher());
@@ -328,10 +328,31 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
Ok(())
},
);
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
task_mgr::spawn(
MGMT_REQUEST_RUNTIME.handle(),
TaskKind::MetricsCollection,
None,
None,
"consumption metrics collection",
true,
async move {
pageserver::consumption_metrics::collect_metrics(
metric_collection_endpoint,
conf.metric_collection_interval,
conf.id,
)
.instrument(info_span!("metrics_collection"))
.await?;
Ok(())
},
);
}
}
// Spawn a task to listen for libpq connections. It will spawn further tasks
// for each connection.
// for each connection. We created the listener earlier already.
task_mgr::spawn(
COMPUTE_REQUEST_RUNTIME.handle(),
TaskKind::LibpqEndpointListener,
@@ -344,8 +365,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
},
);
set_build_info_metric(GIT_VERSION);
// All started up! Now just sit and wait for shutdown signal.
signals.handle(|signal| match signal {
Signal::Quit => {
@@ -369,6 +388,36 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
})
}
fn create_remote_storage_client(
conf: &'static PageServerConf,
) -> anyhow::Result<Option<GenericRemoteStorage>> {
let config = if let Some(config) = &conf.remote_storage_config {
config
} else {
// No remote storage configured.
return Ok(None);
};
// Create the client
let mut remote_storage = GenericRemoteStorage::from_config(config)?;
// If `test_remote_failures` is non-zero, wrap the client with a
// wrapper that simulates failures.
if conf.test_remote_failures > 0 {
if !cfg!(feature = "testing") {
anyhow::bail!("test_remote_failures option is not available because pageserver was compiled without the 'testing' feature");
}
info!(
"Simulating remote failures for first {} attempts of each op",
conf.test_remote_failures
);
remote_storage =
GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
}
Ok(Some(remote_storage))
}
fn cli() -> Command {
Command::new("Neon page server")
.about("Materializes WAL stream to pages and serves them to the postgres")

View File

@@ -60,7 +60,7 @@ fn main() -> anyhow::Result<()> {
}
fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
let control_file = ControlFileData::decode(&std::fs::read(&control_file_path)?)?;
let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
println!("{control_file:?}");
let control_file_initdb = Lsn(control_file.checkPoint);
println!(
@@ -79,7 +79,7 @@ fn print_layerfile(path: &Path) -> anyhow::Result<()> {
}
fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> {
let metadata_bytes = std::fs::read(&path)?;
let metadata_bytes = std::fs::read(path)?;
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
println!("Current metadata:\n{meta:?}");
let mut update_meta = false;
@@ -110,7 +110,7 @@ fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), an
if update_meta {
let metadata_bytes = meta.to_bytes()?;
std::fs::write(&path, &metadata_bytes)?;
std::fs::write(path, metadata_bytes)?;
}
Ok(())

View File

@@ -5,12 +5,14 @@
//! See also `settings.md` for better description on every parameter.
use anyhow::{anyhow, bail, ensure, Context, Result};
use remote_storage::RemoteStorageConfig;
use remote_storage::{RemotePath, RemoteStorageConfig};
use std::env;
use storage_broker::Uri;
use utils::crashsafe::path_with_suffix_extension;
use utils::id::ConnectionId;
use once_cell::sync::OnceCell;
use reqwest::Url;
use std::num::NonZeroUsize;
use std::path::{Path, PathBuf};
use std::str::FromStr;
@@ -18,25 +20,29 @@ use std::sync::Arc;
use std::time::Duration;
use toml_edit;
use toml_edit::{Document, Item};
use url::Url;
use utils::{
id::{NodeId, TenantId, TimelineId},
logging::LogFormat,
postgres_backend::AuthType,
};
use crate::tenant::config::TenantConf;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
use crate::tenant_config::{TenantConf, TenantConfOpt};
use crate::{METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX};
use crate::{
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
};
pub mod defaults {
use crate::tenant_config::defaults::*;
use crate::tenant::config::defaults::*;
use const_format::formatcp;
pub use pageserver_api::{
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
DEFAULT_PG_LISTEN_PORT,
};
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
@@ -51,13 +57,14 @@ pub mod defaults {
pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
///
/// Default built-in configuration file.
///
pub const DEFAULT_CONFIG_FILE: &str = formatcp!(
r###"
# Initial configuration file created by 'pageserver --init'
#listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}'
#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}'
@@ -69,10 +76,14 @@ pub mod defaults {
# initial superuser role name to use when creating a new tenant
#initial_superuser_name = '{DEFAULT_SUPERUSER}'
#broker_endpoint = '{BROKER_DEFAULT_ENDPOINT}'
#log_format = '{DEFAULT_LOG_FORMAT}'
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
# [tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -130,24 +141,26 @@ pub struct PageServerConf {
pub profiling: ProfilingConfig,
pub default_tenant_conf: TenantConf,
/// A prefix to add in etcd brokers before every key.
/// Can be used for isolating different pageserver groups within the same etcd cluster.
pub broker_etcd_prefix: String,
/// Etcd broker endpoints to connect to.
pub broker_endpoints: Vec<Url>,
/// Storage broker endpoints to connect to.
pub broker_endpoint: Uri,
pub broker_keepalive_interval: Duration,
pub log_format: LogFormat,
/// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
// How often to collect metrics and send them to the metrics endpoint.
pub metric_collection_interval: Duration,
pub metric_collection_endpoint: Option<Url>,
pub test_remote_failures: u64,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
/// and/or serialized at a whim, while the token is secret. Currently this token is the
/// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in
/// the future, more tokens and auth may arrive for etcd and/or its rewrite (see
/// https://github.com/neondatabase/neon/issues/2394), completely changing the logic.
/// the future, more tokens and auth may arrive for storage broker, completely changing the logic.
/// Hence, we resort to a global variable for now instead of passing the token from the
/// startup code to the connection code through a dozen layers.
pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();
@@ -214,12 +227,17 @@ struct PageServerConfigBuilder {
id: BuilderValue<NodeId>,
profiling: BuilderValue<ProfilingConfig>,
broker_etcd_prefix: BuilderValue<String>,
broker_endpoints: BuilderValue<Vec<Url>>,
broker_endpoint: BuilderValue<Uri>,
broker_keepalive_interval: BuilderValue<Duration>,
log_format: BuilderValue<LogFormat>,
concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
metric_collection_interval: BuilderValue<Duration>,
metric_collection_endpoint: BuilderValue<Option<Url>>,
test_remote_failures: BuilderValue<u64>,
}
impl Default for PageServerConfigBuilder {
@@ -245,11 +263,23 @@ impl Default for PageServerConfigBuilder {
remote_storage_config: Set(None),
id: NotSet,
profiling: Set(ProfilingConfig::Disabled),
broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()),
broker_endpoints: Set(Vec::new()),
broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
.parse()
.expect("failed to parse default broker endpoint")),
broker_keepalive_interval: Set(humantime::parse_duration(
storage_broker::DEFAULT_KEEPALIVE_INTERVAL,
)
.expect("cannot parse default keepalive interval")),
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
metric_collection_interval: Set(humantime::parse_duration(
DEFAULT_METRIC_COLLECTION_INTERVAL,
)
.expect("cannot parse default metric collection interval")),
metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
test_remote_failures: Set(0),
}
}
}
@@ -306,12 +336,12 @@ impl PageServerConfigBuilder {
self.remote_storage_config = BuilderValue::Set(remote_storage_config)
}
pub fn broker_endpoints(&mut self, broker_endpoints: Vec<Url>) {
self.broker_endpoints = BuilderValue::Set(broker_endpoints)
pub fn broker_endpoint(&mut self, broker_endpoint: Uri) {
self.broker_endpoint = BuilderValue::Set(broker_endpoint)
}
pub fn broker_etcd_prefix(&mut self, broker_etcd_prefix: String) {
self.broker_etcd_prefix = BuilderValue::Set(broker_etcd_prefix)
pub fn broker_keepalive_interval(&mut self, broker_keepalive_interval: Duration) {
self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval)
}
pub fn id(&mut self, node_id: NodeId) {
@@ -330,11 +360,19 @@ impl PageServerConfigBuilder {
self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let broker_endpoints = self
.broker_endpoints
.ok_or(anyhow!("No broker endpoints provided"))?;
pub fn metric_collection_interval(&mut self, metric_collection_interval: Duration) {
self.metric_collection_interval = BuilderValue::Set(metric_collection_interval)
}
pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option<Url>) {
self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
}
pub fn test_remote_failures(&mut self, fail_first: u64) {
self.test_remote_failures = BuilderValue::Set(fail_first);
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
Ok(PageServerConf {
listen_pg_addr: self
.listen_pg_addr
@@ -370,16 +408,27 @@ impl PageServerConfigBuilder {
profiling: self.profiling.ok_or(anyhow!("missing profiling"))?,
// TenantConf is handled separately
default_tenant_conf: TenantConf::default(),
broker_endpoints,
broker_etcd_prefix: self
.broker_etcd_prefix
.ok_or(anyhow!("missing broker_etcd_prefix"))?,
broker_endpoint: self
.broker_endpoint
.ok_or(anyhow!("No broker endpoints provided"))?,
broker_keepalive_interval: self
.broker_keepalive_interval
.ok_or(anyhow!("No broker keepalive interval provided"))?,
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
concurrent_tenant_size_logical_size_queries: self
.concurrent_tenant_size_logical_size_queries
.ok_or(anyhow!(
"missing concurrent_tenant_size_logical_size_queries"
))?,
metric_collection_interval: self
.metric_collection_interval
.ok_or(anyhow!("missing metric_collection_interval"))?,
metric_collection_endpoint: self
.metric_collection_endpoint
.ok_or(anyhow!("missing metric_collection_endpoint"))?,
test_remote_failures: self
.test_remote_failures
.ok_or(anyhow!("missing test_remote_failuers"))?,
})
}
}
@@ -402,6 +451,10 @@ impl PageServerConf {
.join(TENANT_ATTACHING_MARKER_FILENAME)
}
pub fn tenant_ignore_mark_file_path(&self, tenant_id: TenantId) -> PathBuf {
self.tenant_path(&tenant_id).join(IGNORED_TENANT_FILE_NAME)
}
/// Points to a place in pageserver's local directory,
/// where certain tenant's tenantconf file should be located.
pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf {
@@ -450,6 +503,28 @@ impl PageServerConf {
.join(METADATA_FILE_NAME)
}
/// Files on the remote storage are stored with paths, relative to the workdir.
/// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path.
///
/// Errors if the path provided does not start from pageserver's workdir.
pub fn remote_path(&self, local_path: &Path) -> anyhow::Result<RemotePath> {
local_path
.strip_prefix(&self.workdir)
.context("Failed to strip workdir prefix")
.and_then(RemotePath::new)
.with_context(|| {
format!(
"Failed to resolve remote part of path {:?} for base {:?}",
local_path, self.workdir
)
})
}
/// Turns storage remote path of a file into its local path.
pub fn local_path(&self, remote_path: &RemotePath) -> PathBuf {
remote_path.with_base(&self.workdir)
}
//
// Postgres distribution paths
//
@@ -486,7 +561,7 @@ impl PageServerConf {
let mut builder = PageServerConfigBuilder::default();
builder.workdir(workdir.to_owned());
let mut t_conf: TenantConfOpt = Default::default();
let mut t_conf = TenantConfOpt::default();
for (key, item) in toml.iter() {
match key {
@@ -507,24 +582,15 @@ impl PageServerConf {
)),
"auth_type" => builder.auth_type(parse_toml_from_str(key, item)?),
"remote_storage" => {
builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item)?))
builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
}
"tenant_config" => {
t_conf = Self::parse_toml_tenant_conf(item)?;
}
"id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
"profiling" => builder.profiling(parse_toml_from_str(key, item)?),
"broker_etcd_prefix" => builder.broker_etcd_prefix(parse_toml_string(key, item)?),
"broker_endpoints" => builder.broker_endpoints(
parse_toml_array(key, item)?
.into_iter()
.map(|endpoint_str| {
endpoint_str.parse::<Url>().with_context(|| {
format!("Array item {endpoint_str} for key {key} is not a valid url endpoint")
})
})
.collect::<anyhow::Result<_>>()?,
),
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
"broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
"log_format" => builder.log_format(
LogFormat::from_config(&parse_toml_string(key, item)?)?
),
@@ -534,6 +600,13 @@ impl PageServerConf {
let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
ConfigurableSemaphore::new(permits)
}),
"metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
"metric_collection_endpoint" => {
let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
builder.metric_collection_endpoint(Some(endpoint));
},
"test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -617,6 +690,12 @@ impl PageServerConf {
if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") {
t_conf.max_lsn_wal_lag = Some(parse_toml_from_str("max_lsn_wal_lag", max_lsn_wal_lag)?);
}
if let Some(trace_read_requests) = item.get("trace_read_requests") {
t_conf.trace_read_requests =
Some(trace_read_requests.as_bool().with_context(|| {
"configure option trace_read_requests is not a bool".to_string()
})?);
}
Ok(t_conf)
}
@@ -644,11 +723,14 @@ impl PageServerConf {
auth_validation_public_key_path: None,
remote_storage_config: None,
profiling: ProfilingConfig::Disabled,
default_tenant_conf: TenantConf::dummy_conf(),
broker_endpoints: Vec::new(),
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
default_tenant_conf: TenantConf::default(),
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
broker_keepalive_interval: Duration::from_secs(5000),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
metric_collection_interval: Duration::from_secs(60),
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
test_remote_failures: 0,
}
}
}
@@ -698,22 +780,6 @@ where
})
}
fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result<Vec<String>> {
let array = item
.as_array()
.with_context(|| format!("configure option {name} is not an array"))?;
array
.iter()
.map(|value| {
value
.as_str()
.map(str::to_string)
.with_context(|| format!("Array item {value:?} for key {name} is not a string"))
})
.collect()
}
/// Configurable semaphore permits setting.
///
/// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty
@@ -795,6 +861,8 @@ max_file_descriptors = 333
initial_superuser_name = 'zzzz'
id = 10
metric_collection_interval = '222 s'
metric_collection_endpoint = 'http://localhost:80/metrics'
log_format = 'json'
"#;
@@ -803,10 +871,10 @@ log_format = 'json'
fn parse_defaults() -> anyhow::Result<()> {
let tempdir = tempdir()?;
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
let broker_endpoint = "http://127.0.0.1:7777";
let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
// we have to create dummy values to overcome the validation errors
let config_string = format!(
"pg_distrib_dir='{}'\nid=10\nbroker_endpoints = ['{broker_endpoint}']",
"pg_distrib_dir='{}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
pg_distrib_dir.display()
);
let toml = config_string.parse()?;
@@ -832,12 +900,17 @@ log_format = 'json'
remote_storage_config: None,
profiling: ProfilingConfig::Disabled,
default_tenant_conf: TenantConf::default(),
broker_endpoints: vec![broker_endpoint
.parse()
.expect("Failed to parse a valid broker endpoint URL")],
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
broker_keepalive_interval: humantime::parse_duration(
storage_broker::DEFAULT_KEEPALIVE_INTERVAL
)?,
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
metric_collection_interval: humantime::parse_duration(
defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
)?,
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
test_remote_failures: 0,
},
"Correct defaults should be used when no config values are provided"
);
@@ -849,10 +922,10 @@ log_format = 'json'
fn parse_basic_config() -> anyhow::Result<()> {
let tempdir = tempdir()?;
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
let broker_endpoint = "http://127.0.0.1:7777";
let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
let config_string = format!(
"{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoints = ['{broker_endpoint}']",
"{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoint = '{broker_endpoint}'",
pg_distrib_dir.display()
);
let toml = config_string.parse()?;
@@ -878,12 +951,13 @@ log_format = 'json'
remote_storage_config: None,
profiling: ProfilingConfig::Disabled,
default_tenant_conf: TenantConf::default(),
broker_endpoints: vec![broker_endpoint
.parse()
.expect("Failed to parse a valid broker endpoint URL")],
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
broker_keepalive_interval: Duration::from_secs(5),
log_format: LogFormat::Json,
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
metric_collection_interval: Duration::from_secs(222),
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
test_remote_failures: 0,
},
"Should be able to parse all basic config values correctly"
);
@@ -915,7 +989,7 @@ local_path = '{}'"#,
let config_string = format!(
r#"{ALL_BASE_VALUES_TOML}
pg_distrib_dir='{}'
broker_endpoints = ['{broker_endpoint}']
broker_endpoint = '{broker_endpoint}'
{remote_storage_config_str}"#,
pg_distrib_dir.display(),
@@ -982,7 +1056,7 @@ concurrency_limit = {s3_concurrency_limit}"#
let config_string = format!(
r#"{ALL_BASE_VALUES_TOML}
pg_distrib_dir='{}'
broker_endpoints = ['{broker_endpoint}']
broker_endpoint = '{broker_endpoint}'
{remote_storage_config_str}"#,
pg_distrib_dir.display(),
@@ -1016,6 +1090,35 @@ broker_endpoints = ['{broker_endpoint}']
Ok(())
}
#[test]
fn parse_tenant_config() -> anyhow::Result<()> {
let tempdir = tempdir()?;
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
let broker_endpoint = "http://127.0.0.1:7777";
let trace_read_requests = true;
let config_string = format!(
r#"{ALL_BASE_VALUES_TOML}
pg_distrib_dir='{}'
broker_endpoint = '{broker_endpoint}'
[tenant_config]
trace_read_requests = {trace_read_requests}"#,
pg_distrib_dir.display(),
);
let toml = config_string.parse()?;
let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
assert_eq!(
conf.default_tenant_conf.trace_read_requests, trace_read_requests,
"Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants",
);
Ok(())
}
fn prepare_fs(tempdir: &TempDir) -> anyhow::Result<(PathBuf, PathBuf)> {
let tempdir_path = tempdir.path();

View File

@@ -0,0 +1,324 @@
//!
//! Periodically collect consumption metrics for all active tenants
//! and push them to a HTTP endpoint.
//! Cache metrics to send only the updated ones.
//!
use anyhow;
use tracing::*;
use utils::id::NodeId;
use utils::id::TimelineId;
use crate::task_mgr;
use crate::tenant::mgr;
use pageserver_api::models::TenantState;
use utils::id::TenantId;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::collections::HashMap;
use std::fmt;
use std::str::FromStr;
use std::time::Duration;
use chrono::{DateTime, Utc};
use rand::Rng;
use reqwest::Url;
/// ConsumptionMetric struct that defines the format for one metric entry
/// i.e.
///
/// ```json
/// {
/// "metric": "remote_storage_size",
/// "type": "absolute",
/// "tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
/// "timeline_id": "a03ebb4f5922a1c56ff7485cc8854143",
/// "time": "2022-12-28T11:07:19.317310284Z",
/// "idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
/// "value": 12345454,
/// }
/// ```
#[serde_as]
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub struct ConsumptionMetric {
pub metric: ConsumptionMetricKind,
#[serde(rename = "type")]
pub metric_type: &'static str,
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
#[serde_as(as = "Option<DisplayFromStr>")]
#[serde(skip_serializing_if = "Option::is_none")]
pub timeline_id: Option<TimelineId>,
pub time: DateTime<Utc>,
pub idempotency_key: String,
pub value: u64,
}
impl ConsumptionMetric {
pub fn new_absolute<R: Rng + ?Sized>(
metric: ConsumptionMetricKind,
tenant_id: TenantId,
timeline_id: Option<TimelineId>,
value: u64,
node_id: NodeId,
rng: &mut R,
) -> Self {
Self {
metric,
metric_type: "absolute",
tenant_id,
timeline_id,
time: Utc::now(),
// key that allows metric collector to distinguish unique events
idempotency_key: format!("{}-{}-{:04}", Utc::now(), node_id, rng.gen_range(0..=9999)),
value,
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ConsumptionMetricKind {
/// Amount of WAL produced , by a timeline, i.e. last_record_lsn
/// This is an absolute, per-timeline metric.
WrittenSize,
/// Size of all tenant branches including WAL
/// This is an absolute, per-tenant metric.
/// This is the same metric that tenant/tenant_id/size endpoint returns.
SyntheticStorageSize,
/// Size of all the layer files in the tenant's directory on disk on the pageserver.
/// This is an absolute, per-tenant metric.
/// See also prometheus metric RESIDENT_PHYSICAL_SIZE.
ResidentSize,
/// Size of the remote storage (S3) directory.
/// This is an absolute, per-tenant metric.
RemoteStorageSize,
/// Logical size of the data in the timeline
/// This is an absolute, per-timeline metric
TimelineLogicalSize,
}
impl FromStr for ConsumptionMetricKind {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"written_size" => Ok(Self::WrittenSize),
"synthetic_storage_size" => Ok(Self::SyntheticStorageSize),
"resident_size" => Ok(Self::ResidentSize),
"remote_storage_size" => Ok(Self::RemoteStorageSize),
"timeline_logical_size" => Ok(Self::TimelineLogicalSize),
_ => anyhow::bail!("invalid value \"{s}\" for metric type"),
}
}
}
impl fmt::Display for ConsumptionMetricKind {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str(match self {
ConsumptionMetricKind::WrittenSize => "written_size",
ConsumptionMetricKind::SyntheticStorageSize => "synthetic_storage_size",
ConsumptionMetricKind::ResidentSize => "resident_size",
ConsumptionMetricKind::RemoteStorageSize => "remote_storage_size",
ConsumptionMetricKind::TimelineLogicalSize => "timeline_logical_size",
})
}
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ConsumptionMetricsKey {
tenant_id: TenantId,
timeline_id: Option<TimelineId>,
metric: ConsumptionMetricKind,
}
#[derive(serde::Serialize)]
struct EventChunk<'a> {
events: &'a [ConsumptionMetric],
}
/// Main thread that serves metrics collection
pub async fn collect_metrics(
metric_collection_endpoint: &Url,
metric_collection_interval: Duration,
node_id: NodeId,
) -> anyhow::Result<()> {
let mut ticker = tokio::time::interval(metric_collection_interval);
info!("starting collect_metrics");
// define client here to reuse it for all requests
let client = reqwest::Client::new();
let mut cached_metrics: HashMap<ConsumptionMetricsKey, u64> = HashMap::new();
loop {
tokio::select! {
_ = task_mgr::shutdown_watcher() => {
info!("collect_metrics received cancellation request");
return Ok(());
},
_ = ticker.tick() => {
collect_metrics_task(&client, &mut cached_metrics, metric_collection_endpoint, node_id).await?;
}
}
}
}
/// One iteration of metrics collection
///
/// Gather per-tenant and per-timeline metrics and send them to the `metric_collection_endpoint`.
/// Cache metrics to avoid sending the same metrics multiple times.
pub async fn collect_metrics_task(
client: &reqwest::Client,
cached_metrics: &mut HashMap<ConsumptionMetricsKey, u64>,
metric_collection_endpoint: &reqwest::Url,
node_id: NodeId,
) -> anyhow::Result<()> {
let mut current_metrics: Vec<(ConsumptionMetricsKey, u64)> = Vec::new();
trace!(
"starting collect_metrics_task. metric_collection_endpoint: {}",
metric_collection_endpoint
);
// get list of tenants
let tenants = mgr::list_tenants().await;
// iterate through list of Active tenants and collect metrics
for (tenant_id, tenant_state) in tenants {
if tenant_state != TenantState::Active {
continue;
}
let tenant = mgr::get_tenant(tenant_id, true).await?;
let mut tenant_resident_size = 0;
// iterate through list of timelines in tenant
for timeline in tenant.list_timelines().iter() {
// collect per-timeline metrics only for active timelines
if timeline.is_active() {
let timeline_written_size = u64::from(timeline.get_last_record_lsn());
current_metrics.push((
ConsumptionMetricsKey {
tenant_id,
timeline_id: Some(timeline.timeline_id),
metric: ConsumptionMetricKind::WrittenSize,
},
timeline_written_size,
));
let (timeline_logical_size, is_exact) = timeline.get_current_logical_size()?;
// Only send timeline logical size when it is fully calculated.
if is_exact {
current_metrics.push((
ConsumptionMetricsKey {
tenant_id,
timeline_id: Some(timeline.timeline_id),
metric: ConsumptionMetricKind::TimelineLogicalSize,
},
timeline_logical_size,
));
}
}
let timeline_resident_size = timeline.get_resident_physical_size();
tenant_resident_size += timeline_resident_size;
}
let tenant_remote_size = tenant.get_remote_size().await?;
debug!(
"collected current metrics for tenant: {}: state={:?} resident_size={} remote_size={}",
tenant_id, tenant_state, tenant_resident_size, tenant_remote_size
);
current_metrics.push((
ConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: ConsumptionMetricKind::ResidentSize,
},
tenant_resident_size,
));
current_metrics.push((
ConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: ConsumptionMetricKind::RemoteStorageSize,
},
tenant_remote_size,
));
// TODO add SyntheticStorageSize metric
}
// Filter metrics
current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
Some(val) => val != curr_val,
None => true,
});
if current_metrics.is_empty() {
trace!("no new metrics to send");
return Ok(());
}
// Send metrics.
// Split into chunks of 1000 metrics to avoid exceeding the max request size
const CHUNK_SIZE: usize = 1000;
let chunks = current_metrics.chunks(CHUNK_SIZE);
let mut chunk_to_send: Vec<ConsumptionMetric> = Vec::with_capacity(1000);
for chunk in chunks {
chunk_to_send.clear();
// this code block is needed to convince compiler
// that rng is not reused aroung await point
{
// enrich metrics with timestamp and metric_kind before sending
let mut rng = rand::thread_rng();
chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| {
ConsumptionMetric::new_absolute(
curr_key.metric,
curr_key.tenant_id,
curr_key.timeline_id,
*curr_val,
node_id,
&mut rng,
)
}));
}
let chunk_json = serde_json::value::to_raw_value(&EventChunk {
events: &chunk_to_send,
})
.expect("ConsumptionMetric should not fail serialization");
let res = client
.post(metric_collection_endpoint.clone())
.json(&chunk_json)
.send()
.await;
match res {
Ok(res) => {
if res.status().is_success() {
// update cached metrics after they were sent successfully
for (curr_key, curr_val) in chunk.iter() {
cached_metrics.insert(curr_key.clone(), *curr_val);
}
} else {
error!("metrics endpoint refused the sent metrics: {:?}", res);
}
}
Err(err) => {
error!("failed to send metrics: {:?}", err);
}
}
}
Ok(())
}

View File

@@ -77,16 +77,6 @@ paths:
schema:
type: string
format: hex
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
- name: include-non-incremental-physical-size
in: query
schema:
type: string
description: Controls calculation of current_physical_size_non_incremental
get:
description: Get timelines for tenant
responses:
@@ -139,17 +129,6 @@ paths:
format: hex
get:
description: Get info about the timeline
parameters:
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
- name: include-non-incremental-physical-size
in: query
schema:
type: string
description: Controls calculation of current_physical_size_non_incremental
responses:
"200":
description: TimelineInfo
@@ -274,6 +253,7 @@ paths:
schema:
type: string
format: hex
post:
description: Schedules attach operation to happen in the background for given tenant
responses:
@@ -325,7 +305,9 @@ paths:
type: string
format: hex
post:
description: Detach local tenant
description: |
Remove tenant data (including all corresponding timelines) from pageserver's memory and file system.
Files on the remote storage are not affected.
responses:
"200":
description: Tenant detached
@@ -354,6 +336,92 @@ paths:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/ignore:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
format: hex
post:
description: |
Remove tenant data (including all corresponding timelines) from pageserver's memory.
Files on local disk and remote storage are not affected.
Future pageserver restarts won't load the data back until `load` is called on such tenant.
responses:
"200":
description: Tenant ignored
"400":
description: Error when no tenant id found in path parameters
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/load:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
format: hex
post:
description: |
Schedules an operation that attempts to load a tenant from the local disk and
synchronise it with the remote storage (if enabled), repeating pageserver's restart logic for tenant load.
If the tenant was ignored before, removes the ignore mark and continues with load scheduling.
Errors if the tenant is absent on disk, already present in memory or fails to schedule its load.
Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness.
responses:
"202":
description: Tenant scheduled to load successfully
"400":
description: Error when no tenant id found in path parameters
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/size:
parameters:
- name: tenant_id
@@ -659,7 +727,6 @@ components:
- tenant_id
- last_record_lsn
- disk_consistent_lsn
- awaits_download
- state
- latest_gc_cutoff_lsn
properties:
@@ -691,10 +758,6 @@ components:
type: integer
current_physical_size:
type: integer
current_logical_size_non_incremental:
type: integer
current_physical_size_non_incremental:
type: integer
wal_source_connstr:
type: string
last_received_msg_lsn:
@@ -702,44 +765,11 @@ components:
format: hex
last_received_msg_ts:
type: integer
awaits_download:
type: boolean
state:
type: string
latest_gc_cutoff_lsn:
type: string
format: hex
# These 'local' and 'remote' fields just duplicate some of the fields
# above. They are kept for backwards-compatibility. They can be removed,
# when the control plane has been updated to look at the above fields
# directly.
local:
$ref: "#/components/schemas/LocalTimelineInfo"
remote:
$ref: "#/components/schemas/RemoteTimelineInfo"
LocalTimelineInfo:
type: object
properties:
ancestor_timeline_id:
type: string
format: hex
ancestor_lsn:
type: string
format: hex
current_logical_size:
type: integer
current_physical_size:
type: integer
RemoteTimelineInfo:
type: object
required:
- remote_consistent_lsn
properties:
remote_consistent_lsn:
type: string
format: hex
Error:
type: object
required:

View File

@@ -3,19 +3,18 @@ use std::sync::Arc;
use anyhow::{anyhow, Context, Result};
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use pageserver_api::models::TenantState;
use remote_storage::GenericRemoteStorage;
use tokio::task::JoinError;
use tokio_util::sync::CancellationToken;
use tracing::*;
use super::models::{
LocalTimelineInfo, RemoteTimelineInfo, StatusResponse, TenantConfigRequest,
TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo,
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
TimelineCreateRequest, TimelineInfo,
};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::tenant::Timeline;
use crate::tenant_config::TenantConfOpt;
use crate::{config::PageServerConf, tenant_mgr};
use crate::tenant::config::TenantConfOpt;
use crate::tenant::{with_ondemand_download, Timeline};
use crate::{config::PageServerConf, tenant::mgr};
use utils::{
auth::JwtAuth,
http::{
@@ -32,8 +31,6 @@ use utils::{
// Imports only used for testing APIs
#[cfg(feature = "testing")]
use super::models::{ConfigureFailpointsRequest, TimelineGcRequest};
#[cfg(feature = "testing")]
use crate::CheckpointConfig;
struct State {
conf: &'static PageServerConf,
@@ -81,28 +78,28 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
}
// Helper function to construct a TimelineInfo struct for a timeline
fn build_timeline_info(
tenant_state: TenantState,
async fn build_timeline_info(
timeline: &Arc<Timeline>,
include_non_incremental_logical_size: bool,
include_non_incremental_physical_size: bool,
) -> anyhow::Result<TimelineInfo> {
let mut info = build_timeline_info_common(tenant_state, timeline)?;
let mut info = build_timeline_info_common(timeline)?;
if include_non_incremental_logical_size {
info.current_logical_size_non_incremental =
Some(timeline.get_current_logical_size_non_incremental(info.last_record_lsn)?);
}
if include_non_incremental_physical_size {
info.current_physical_size_non_incremental =
Some(timeline.get_physical_size_non_incremental()?)
// XXX we should be using spawn_ondemand_logical_size_calculation here.
// Otherwise, if someone deletes the timeline / detaches the tenant while
// we're executing this function, we will outlive the timeline on-disk state.
info.current_logical_size_non_incremental = Some(
timeline
.get_current_logical_size_non_incremental(
info.last_record_lsn,
CancellationToken::new(),
)
.await?,
);
}
Ok(info)
}
fn build_timeline_info_common(
tenant_state: TenantState,
timeline: &Arc<Timeline>,
) -> anyhow::Result<TimelineInfo> {
fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<TimelineInfo> {
let last_record_lsn = timeline.get_last_record_lsn();
let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
let guard = timeline.last_received_wal.lock().unwrap();
@@ -123,13 +120,13 @@ fn build_timeline_info_common(
lsn @ Lsn(_) => Some(lsn),
};
let current_logical_size = match timeline.get_current_logical_size() {
Ok(size) => Some(size),
Ok((size, _)) => Some(size),
Err(err) => {
error!("Timeline info creation failed to get current logical size: {err:?}");
None
}
};
let current_physical_size = Some(timeline.get_physical_size());
let current_physical_size = Some(timeline.layer_size_sum().approximate_is_ok());
let state = timeline.current_state();
let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
@@ -146,29 +143,13 @@ fn build_timeline_info_common(
current_logical_size,
current_physical_size,
current_logical_size_non_incremental: None,
current_physical_size_non_incremental: None,
timeline_dir_layer_file_size_sum: None,
wal_source_connstr,
last_received_msg_lsn,
last_received_msg_ts,
pg_version: timeline.pg_version,
state,
// XXX bring back tracking of downloads per timeline, or, introduce
// an 'Attaching' state for the timeline and get rid of this field.
awaits_download: tenant_state == TenantState::Attaching,
// Duplicate some fields in 'local' and 'remote' fields, for backwards-compatility
// with the control plane.
local: LocalTimelineInfo {
ancestor_timeline_id,
ancestor_lsn,
current_logical_size,
current_physical_size,
},
remote: RemoteTimelineInfo {
remote_consistent_lsn: Some(remote_consistent_lsn),
},
};
Ok(info)
}
@@ -189,7 +170,9 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
.new_timeline_id
.unwrap_or_else(TimelineId::generate);
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
match tenant.create_timeline(
new_timeline_id,
request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -200,7 +183,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
.await {
Ok(Some(new_timeline)) => {
// Created. Construct a TimelineInfo for it.
let timeline_info = build_timeline_info_common(tenant.current_state(), &new_timeline)
let timeline_info = build_timeline_info_common(&new_timeline)
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::CREATED, timeline_info)
}
@@ -213,30 +196,30 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let include_non_incremental_logical_size =
query_param_present(&request, "include-non-incremental-logical-size");
let include_non_incremental_physical_size =
query_param_present(&request, "include-non-incremental-physical-size");
check_permission(&request, Some(tenant_id))?;
let _entered = info_span!("timeline_list", tenant = %tenant_id).entered();
let response_data = async {
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let timelines = tenant.list_timelines();
let (tenant_state, timelines) = {
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
(tenant.current_state(), tenant.list_timelines())
};
let mut response_data = Vec::with_capacity(timelines.len());
for timeline in timelines {
let timeline_info =
build_timeline_info(&timeline, include_non_incremental_logical_size)
.await
.context(
"Failed to convert tenant timeline {timeline_id} into the local one: {e:?}",
)
.map_err(ApiError::InternalServerError)?;
let mut response_data = Vec::with_capacity(timelines.len());
for timeline in timelines {
let timeline_info = build_timeline_info(
tenant_state,
&timeline,
include_non_incremental_logical_size,
include_non_incremental_physical_size,
)
.context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
.map_err(ApiError::InternalServerError)?;
response_data.push(timeline_info);
response_data.push(timeline_info);
}
Ok(response_data)
}
.instrument(info_span!("timeline_list", tenant = %tenant_id))
.await?;
json_response(StatusCode::OK, response_data)
}
@@ -276,31 +259,21 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let include_non_incremental_logical_size =
query_param_present(&request, "include-non-incremental-logical-size");
let include_non_incremental_physical_size =
query_param_present(&request, "include-non-incremental-physical-size");
check_permission(&request, Some(tenant_id))?;
let timeline_info = async {
let (tenant_state, timeline) = tokio::task::spawn_blocking(move || {
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
Ok((
tenant.current_state(),
tenant.get_timeline(timeline_id, false),
))
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let timeline = timeline.map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, false)
.map_err(ApiError::NotFound)?;
let timeline_info = build_timeline_info(
tenant_state,
&timeline,
include_non_incremental_logical_size,
include_non_incremental_physical_size,
)
.context("Failed to get local timeline info: {e:#}")
.map_err(ApiError::InternalServerError)?;
let timeline_info = build_timeline_info(&timeline, include_non_incremental_logical_size)
.await
.context("Failed to get local timeline info: {e:#}")
.map_err(ApiError::InternalServerError)?;
Ok::<_, ApiError>(timeline_info)
}
@@ -321,13 +294,15 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
.map_err(ApiError::BadRequest)?;
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
let timeline = tenant_mgr::get_tenant(tenant_id, true)
let timeline = mgr::get_tenant(tenant_id, true)
.await
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
.map_err(ApiError::NotFound)?;
let result = match timeline
.find_lsn_for_timestamp(timestamp_pg)
.map_err(ApiError::InternalServerError)?
{
let result = with_ondemand_download(|| timeline.find_lsn_for_timestamp(timestamp_pg))
.await
.map_err(ApiError::InternalServerError)?;
let result = match result {
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
LsnForTimestamp::Future(_lsn) => "future".into(),
LsnForTimestamp::Past(_lsn) => "past".into(),
@@ -347,13 +322,13 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
if let Some(remote_storage) = &state.remote_storage {
// FIXME: distinguish between "Tenant already exists" and other errors
tenant_mgr::attach_tenant(state.conf, tenant_id, remote_storage)
mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone())
.instrument(info_span!("tenant_attach", tenant = %tenant_id))
.await
.map_err(ApiError::InternalServerError)?;
} else {
return Err(ApiError::BadRequest(anyhow!(
"attach_tenant is possible because pageserver was configured without remote storage"
"attach_tenant is not possible because pageserver was configured without remote storage"
)));
}
@@ -365,7 +340,7 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
tenant_mgr::delete_timeline(tenant_id, timeline_id)
mgr::delete_timeline(tenant_id, timeline_id)
.instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
.await
// FIXME: Errors from `delete_timeline` can occur for a number of reasons, incuding both
@@ -382,7 +357,7 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
let state = get_state(&request);
let conf = state.conf;
tenant_mgr::detach_tenant(conf, tenant_id)
mgr::detach_tenant(conf, tenant_id)
.instrument(info_span!("tenant_detach", tenant = %tenant_id))
.await
// FIXME: Errors from `detach_tenant` can be caused by both both user and internal errors.
@@ -392,23 +367,49 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
json_response(StatusCode::OK, ())
}
async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let state = get_state(&request);
mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone())
.instrument(info_span!("load", tenant = %tenant_id))
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::ACCEPTED, ())
}
async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let state = get_state(&request);
let conf = state.conf;
mgr::ignore_tenant(conf, tenant_id)
.instrument(info_span!("ignore_tenant", tenant = %tenant_id))
.await
// FIXME: Errors from `ignore_tenant` can be caused by both both user and internal errors.
// Replace this with better handling once the error type permits it.
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_list").entered();
tenant_mgr::list_tenants()
.iter()
.map(|(id, state)| TenantInfo {
id: *id,
state: *state,
current_physical_size: None,
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
})
.collect::<Vec<TenantInfo>>()
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
let response_data = mgr::list_tenants()
.instrument(info_span!("tenant_list"))
.await
.iter()
.map(|(id, state)| TenantInfo {
id: *id,
state: *state,
current_physical_size: None,
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
})
.collect::<Vec<TenantInfo>>();
json_response(StatusCode::OK, response_data)
}
@@ -417,28 +418,25 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant_info = tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_status_handler", tenant = %tenant_id).entered();
let tenant = tenant_mgr::get_tenant(tenant_id, false)?;
let tenant_info = async {
let tenant = mgr::get_tenant(tenant_id, false).await?;
// Calculate total physical size of all timelines
let mut current_physical_size = 0;
for timeline in tenant.list_timelines().iter() {
current_physical_size += timeline.get_physical_size();
current_physical_size += timeline.layer_size_sum().approximate_is_ok();
}
let state = tenant.current_state();
let tenant_info = TenantInfo {
Ok(TenantInfo {
id: tenant_id,
state,
current_physical_size: Some(current_physical_size),
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
};
Ok::<_, anyhow::Error>(tenant_info)
})
})
}
.instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, tenant_info)
@@ -448,7 +446,9 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::InternalServerError)?;
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::InternalServerError)?;
// this can be long operation, it currently is not backed by any request coalescing or similar
let inputs = tenant
@@ -565,22 +565,19 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
.map(TenantId::from)
.unwrap_or_else(TenantId::generate);
let new_tenant = tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_create", tenant = ?target_tenant_id).entered();
let state = get_state(&request);
let state = get_state(&request);
tenant_mgr::create_tenant(
state.conf,
tenant_conf,
target_tenant_id,
state.remote_storage.clone(),
)
// FIXME: `create_tenant` can fail from both user and internal errors. Replace this
// with better error handling once the type permits it
.map_err(ApiError::InternalServerError)
})
let new_tenant = mgr::create_tenant(
state.conf,
tenant_conf,
target_tenant_id,
state.remote_storage.clone(),
)
.instrument(info_span!("tenant_create", tenant = ?target_tenant_id))
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
// FIXME: `create_tenant` can fail from both user and internal errors. Replace this
// with better error handling once the type permits it
.map_err(ApiError::InternalServerError)?;
Ok(match new_tenant {
Some(tenant) => {
@@ -671,17 +668,13 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
);
}
tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_config", tenant = ?tenant_id).entered();
let state = get_state(&request);
tenant_mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
// FIXME: `update_tenant_config` can fail because of both user and internal errors.
// Replace this `map_err` with better error handling once the type permits it
.map_err(ApiError::InternalServerError)
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
let state = get_state(&request);
mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
.await
// FIXME: `update_tenant_config` can fail because of both user and internal errors.
// Replace this `map_err` with better error handling once the type permits it
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
@@ -728,7 +721,7 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
let wait_task_done = tenant_mgr::immediate_gc(tenant_id, timeline_id, gc_req)?;
let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req).await?;
let gc_result = wait_task_done
.await
.context("wait for gc task")
@@ -745,7 +738,9 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)?;
@@ -764,18 +759,63 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)?;
timeline
.checkpoint(CheckpointConfig::Forced)
.freeze_and_flush()
.await
.map_err(ApiError::InternalServerError)?;
timeline
.compact()
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
async fn timeline_download_remote_layers_handler_post(
request: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)?;
match timeline.spawn_download_all_remote_layers().await {
Ok(st) => json_response(StatusCode::ACCEPTED, st),
Err(st) => json_response(StatusCode::CONFLICT, st),
}
}
async fn timeline_download_remote_layers_handler_get(
request: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)?;
let info = timeline
.get_download_all_remote_layers_task_info()
.context("task never started since last pageserver process start")
.map_err(ApiError::NotFound)?;
json_response(StatusCode::OK, info)
}
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(
StatusCode::NOT_FOUND,
@@ -838,6 +878,8 @@ pub fn make_router(
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
.post("/v1/tenant/:tenant_id/attach", tenant_attach_handler)
.post("/v1/tenant/:tenant_id/detach", tenant_detach_handler)
.post("/v1/tenant/:tenant_id/load", tenant_load_handler)
.post("/v1/tenant/:tenant_id/ignore", tenant_ignore_handler)
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id",
timeline_detail_handler,
@@ -858,6 +900,14 @@ pub fn make_router(
"/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
testing_api!("run timeline checkpoint", timeline_checkpoint_handler),
)
.post(
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
timeline_download_remote_layers_handler_post,
)
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
timeline_download_remote_layers_handler_get,
)
.delete(
"/v1/tenant/:tenant_id/timeline/:timeline_id",
timeline_delete_handler,

View File

@@ -187,13 +187,13 @@ fn import_slru<Reader: Read>(
path: &Path,
mut reader: Reader,
len: usize,
) -> Result<()> {
trace!("importing slru file {}", path.display());
) -> anyhow::Result<()> {
info!("importing slru file {path:?}");
let mut buf: [u8; 8192] = [0u8; 8192];
let filename = &path
.file_name()
.expect("missing slru filename")
.with_context(|| format!("missing slru filename for path {path:?}"))?
.to_string_lossy();
let segno = u32::from_str_radix(filename, 16)?;
@@ -237,14 +237,19 @@ fn import_slru<Reader: Read>(
/// Scan PostgreSQL WAL files in given directory and load all records between
/// 'startpoint' and 'endpoint' into the repository.
fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> {
fn import_wal(
walpath: &Path,
tline: &Timeline,
startpoint: Lsn,
endpoint: Lsn,
) -> anyhow::Result<()> {
let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version);
let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE);
let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE);
let mut last_lsn = startpoint;
let mut walingest = WalIngest::new(tline, startpoint)?;
let mut walingest = WalIngest::new(tline, startpoint).no_ondemand_download()?;
while last_lsn <= endpoint {
// FIXME: assume postgresql tli 1 for now
@@ -267,7 +272,7 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn)
}
let nread = file.read_to_end(&mut buf)?;
if nread != WAL_SEGMENT_SIZE - offset as usize {
if nread != WAL_SEGMENT_SIZE - offset {
// Maybe allow this for .partial files?
error!("read only {} bytes from WAL file", nread);
}
@@ -279,7 +284,9 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn)
let mut decoded = DecodedWALRecord::default();
while last_lsn <= endpoint {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?;
walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded)
.no_ondemand_download()?;
last_lsn = lsn;
nrecords += 1;
@@ -360,7 +367,7 @@ pub fn import_wal_from_tar<Reader: Read>(
let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE);
let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE);
let mut last_lsn = start_lsn;
let mut walingest = WalIngest::new(tline, start_lsn)?;
let mut walingest = WalIngest::new(tline, start_lsn).no_ondemand_download()?;
// Ingest wal until end_lsn
info!("importing wal until {}", end_lsn);
@@ -405,7 +412,9 @@ pub fn import_wal_from_tar<Reader: Read>(
let mut decoded = DecodedWALRecord::default();
while last_lsn <= end_lsn {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?;
walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded)
.no_ondemand_download()?;
last_lsn = lsn;
debug!("imported record at {} (end {})", lsn, end_lsn);
@@ -440,16 +449,22 @@ fn import_file<Reader: Read>(
reader: Reader,
len: usize,
) -> Result<Option<ControlFileData>> {
let file_name = match file_path.file_name() {
Some(name) => name.to_string_lossy(),
None => return Ok(None),
};
if file_name.starts_with('.') {
// tar archives on macOs, created without COPYFILE_DISABLE=1 env var
// will contain "fork files", skip them.
return Ok(None);
}
if file_path.starts_with("global") {
let spcnode = postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
let dbnode = 0;
match file_path
.file_name()
.expect("missing filename")
.to_string_lossy()
.as_ref()
{
match file_name.as_ref() {
"pg_control" => {
let bytes = read_all_bytes(reader)?;
@@ -485,12 +500,7 @@ fn import_file<Reader: Read>(
.to_string_lossy()
.parse()?;
match file_path
.file_name()
.expect("missing base filename")
.to_string_lossy()
.as_ref()
{
match file_name.as_ref() {
"pg_filenode.map" => {
let bytes = read_all_bytes(reader)?;
modification.put_relmap_file(spcnode, dbnode, bytes)?;
@@ -520,11 +530,7 @@ fn import_file<Reader: Read>(
import_slru(modification, slru, file_path, reader, len)?;
debug!("imported multixact members slru");
} else if file_path.starts_with("pg_twophase") {
let file_name = &file_path
.file_name()
.expect("missing twophase filename")
.to_string_lossy();
let xid = u32::from_str_radix(file_name, 16)?;
let xid = u32::from_str_radix(file_name.as_ref(), 16)?;
let bytes = read_all_bytes(reader)?;
modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?;

View File

@@ -1,6 +1,7 @@
mod auth;
pub mod basebackup;
pub mod config;
pub mod consumption_metrics;
pub mod http;
pub mod import_datadir;
pub mod keyspace;
@@ -10,13 +11,8 @@ pub mod page_service;
pub mod pgdatadir_mapping;
pub mod profiling;
pub mod repository;
pub mod storage_sync2;
pub use storage_sync2 as storage_sync;
pub mod task_mgr;
pub mod tenant;
pub mod tenant_config;
pub mod tenant_mgr;
pub mod tenant_tasks;
pub mod trace;
pub mod virtual_file;
pub mod walingest;
@@ -26,9 +22,8 @@ pub mod walredo;
use std::path::Path;
use tracing::info;
use crate::task_mgr::TaskKind;
use tracing::info;
/// Current storage format version
///
@@ -47,15 +42,6 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
/// Config for the Repository checkpointer
#[derive(Debug, Clone, Copy)]
pub enum CheckpointConfig {
// Flush all in-memory data
Flush,
// Flush all in-memory data and reconstruct all page images
Forced,
}
pub async fn shutdown_pageserver(exit_code: i32) {
// Shut down the libpq endpoint task. This prevents new connections from
// being accepted.
@@ -66,7 +52,7 @@ pub async fn shutdown_pageserver(exit_code: i32) {
// Shut down all the tenants. This flushes everything to disk and kills
// the checkpoint and GC tasks.
tenant_mgr::shutdown_all_tenants().await;
tenant::mgr::shutdown_all_tenants().await;
// Stop syncing with remote storage.
//
@@ -99,7 +85,7 @@ async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
}
}
fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
if n == 0 {
0.0
} else {
@@ -125,6 +111,13 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp";
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
/// A marker file to prevent pageserver from loading a certain tenant on restart.
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
/// `ignore` management API command, that expects the ignored tenant to be properly loaded
/// into pageserver's memory before being ignored.
/// Full path: `tenants/<tenant_id>/___ignored_tenant`.
pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant";
pub fn is_temporary(path: &Path) -> bool {
match path.file_name() {
Some(name) => name.to_string_lossy().ends_with(TEMP_FILE_SUFFIX),

View File

@@ -84,13 +84,20 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
// Metrics for determining timeline's physical size.
// A layered timeline's physical is defined as the total size of
// (delta/image) layer files on disk.
static CURRENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_current_physical_size",
"Current physical size grouped by timeline",
"pageserver_resident_physical_size",
"The size of the layer files present in the pageserver's filesystem.",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_remote_physical_size",
"The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
// Corollary: If any files are missing from the index part, they won't be included here.
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
@@ -136,8 +143,9 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
1.0, // 1 sec
];
const STORAGE_IO_TIME_OPERATIONS: &[&str] =
&["open", "close", "read", "write", "seek", "fsync", "gc"];
const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
"open", "close", "read", "write", "seek", "fsync", "gc", "metadata",
];
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
@@ -201,7 +209,7 @@ pub static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
// remote storage metrics
pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|| {
static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_remote_upload_queue_unfinished_tasks",
"Number of tasks in the upload queue that are not finished yet.",
@@ -210,14 +218,14 @@ pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|
.expect("failed to define a metric")
});
#[derive(Debug, Clone, Copy)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RemoteOpKind {
Upload,
Download,
Delete,
}
impl RemoteOpKind {
pub fn as_str(&self) -> &str {
pub fn as_str(&self) -> &'static str {
match self {
Self::Upload => "upload",
Self::Download => "download",
@@ -226,13 +234,13 @@ impl RemoteOpKind {
}
}
#[derive(Debug, Clone, Copy)]
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub enum RemoteOpFileKind {
Layer,
Index,
}
impl RemoteOpFileKind {
pub fn as_str(&self) -> &str {
pub fn as_str(&self) -> &'static str {
match self {
Self::Layer => "layer",
Self::Index => "index",
@@ -365,7 +373,7 @@ pub struct TimelineMetrics {
pub load_layer_map_histo: Histogram,
pub last_record_gauge: IntGauge,
pub wait_lsn_time_histo: Histogram,
pub current_physical_size_gauge: UIntGauge,
pub resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: UIntGauge,
pub num_persistent_files_created: IntCounter,
@@ -406,7 +414,7 @@ impl TimelineMetrics {
let wait_lsn_time_histo = WAIT_LSN_TIME
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
@@ -432,7 +440,7 @@ impl TimelineMetrics {
load_layer_map_histo,
last_record_gauge,
wait_lsn_time_histo,
current_physical_size_gauge,
resident_physical_size_gauge,
current_logical_size_gauge,
num_persistent_files_created,
persistent_bytes_written,
@@ -448,7 +456,7 @@ impl Drop for TimelineMetrics {
let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]);
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
@@ -491,10 +499,114 @@ pub fn remove_tenant_metrics(tenant_id: &TenantId) {
use futures::Future;
use pin_project_lite::pin_project;
use std::collections::HashMap;
use std::pin::Pin;
use std::sync::{Arc, Mutex};
use std::task::{Context, Poll};
use std::time::Instant;
pub struct RemoteTimelineClientMetrics {
tenant_id: String,
timeline_id: String,
remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
unfinished_tasks: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
}
impl RemoteTimelineClientMetrics {
pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
RemoteTimelineClientMetrics {
tenant_id: tenant_id.to_string(),
timeline_id: timeline_id.to_string(),
remote_operation_time: Mutex::new(HashMap::default()),
unfinished_tasks: Mutex::new(HashMap::default()),
remote_physical_size_gauge: Mutex::new(None),
}
}
pub fn remote_physical_size_gauge(&self) -> UIntGauge {
let mut guard = self.remote_physical_size_gauge.lock().unwrap();
guard
.get_or_insert_with(|| {
REMOTE_PHYSICAL_SIZE
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
])
.unwrap()
})
.clone()
}
pub fn remote_operation_time(
&self,
file_kind: &RemoteOpFileKind,
op_kind: &RemoteOpKind,
status: &'static str,
) -> Histogram {
// XXX would be nice to have an upgradable RwLock
let mut guard = self.remote_operation_time.lock().unwrap();
let key = (file_kind.as_str(), op_kind.as_str(), status);
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_OPERATION_TIME
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
key.0,
key.1,
key.2,
])
.unwrap()
});
metric.clone()
}
pub fn unfinished_tasks(
&self,
file_kind: &RemoteOpFileKind,
op_kind: &RemoteOpKind,
) -> IntGauge {
// XXX would be nice to have an upgradable RwLock
let mut guard = self.unfinished_tasks.lock().unwrap();
let key = (file_kind.as_str(), op_kind.as_str());
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
key.0,
key.1,
])
.unwrap()
});
metric.clone()
}
}
impl Drop for RemoteTimelineClientMetrics {
fn drop(&mut self) {
let RemoteTimelineClientMetrics {
tenant_id,
timeline_id,
remote_physical_size_gauge,
remote_operation_time,
unfinished_tasks,
} = self;
for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
}
for ((a, b), _) in unfinished_tasks.get_mut().unwrap().drain() {
let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[
tenant_id,
timeline_id,
a,
b,
]);
}
{
let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
}
}
}
/// Wrapper future that measures the time spent by a remote storage operation,
/// and records the time and success/failure as a prometheus metric.
pub trait MeasureRemoteOp: Sized {
@@ -504,6 +616,7 @@ pub trait MeasureRemoteOp: Sized {
timeline_id: TimelineId,
file_kind: RemoteOpFileKind,
op: RemoteOpKind,
metrics: Arc<RemoteTimelineClientMetrics>,
) -> MeasuredRemoteOp<Self> {
let start = Instant::now();
MeasuredRemoteOp {
@@ -513,6 +626,7 @@ pub trait MeasureRemoteOp: Sized {
file_kind,
op,
start,
metrics,
}
}
}
@@ -529,6 +643,7 @@ pin_project! {
file_kind: RemoteOpFileKind,
op: RemoteOpKind,
start: Instant,
metrics: Arc<RemoteTimelineClientMetrics>,
}
}
@@ -541,15 +656,8 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
if let Poll::Ready(ref res) = poll_result {
let duration = this.start.elapsed();
let status = if res.is_ok() { &"success" } else { &"failure" };
REMOTE_OPERATION_TIME
.get_metric_with_label_values(&[
&this.tenant_id.to_string(),
&this.timeline_id.to_string(),
this.file_kind.as_str(),
this.op.as_str(),
status,
])
.unwrap()
this.metrics
.remote_operation_time(this.file_kind, this.op, status)
.observe(duration.as_secs_f64());
}
poll_result

View File

@@ -48,10 +48,9 @@ use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
use crate::profiling::profpoint_start;
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::mgr;
use crate::tenant::{Tenant, Timeline};
use crate::tenant_mgr;
use crate::trace::Tracer;
use crate::CheckpointConfig;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
@@ -445,9 +444,7 @@ impl PageServerHandler {
pgb.flush().await?;
let mut copyin_stream = Box::pin(copyin_stream(pgb));
let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
tokio::task::block_in_place(|| {
import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn)
})?;
tokio::task::block_in_place(|| import_wal_from_tar(&timeline, reader, start_lsn, end_lsn))?;
info!("wal import complete");
// Drain the rest of the Copy data
@@ -466,7 +463,7 @@ impl PageServerHandler {
// We only want to persist the data, and it doesn't matter if it's in the
// shape of deltas or images.
info!("flushing layers");
timeline.checkpoint(CheckpointConfig::Flush).await?;
timeline.freeze_and_flush().await?;
info!("done");
Ok(())
@@ -542,7 +539,10 @@ impl PageServerHandler {
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
.await?;
let exists = timeline.get_rel_exists(req.rel, lsn, req.latest)?;
let exists = crate::tenant::with_ondemand_download(|| {
timeline.get_rel_exists(req.rel, lsn, req.latest)
})
.await?;
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
exists,
@@ -559,7 +559,10 @@ impl PageServerHandler {
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
.await?;
let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest)?;
let n_blocks = crate::tenant::with_ondemand_download(|| {
timeline.get_rel_size(req.rel, lsn, req.latest)
})
.await?;
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
n_blocks,
@@ -576,9 +579,10 @@ impl PageServerHandler {
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
.await?;
let total_blocks =
timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)?;
let total_blocks = crate::tenant::with_ondemand_download(|| {
timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)
})
.await?;
let db_size = total_blocks as i64 * BLCKSZ as i64;
Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
@@ -604,11 +608,14 @@ impl PageServerHandler {
}
*/
// FIXME: this profiling now happens at different place than it used to. The
// current profiling is based on a thread-local variable, so it doesn't work
// across awaits
let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?;
let page = crate::tenant::with_ondemand_download(|| {
// FIXME: this profiling now happens at different place than it used to. The
// current profiling is based on a thread-local variable, so it doesn't work
// across awaits
let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)
})
.await?;
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
page,
@@ -649,7 +656,7 @@ impl PageServerHandler {
tokio::task::block_in_place(|| {
let basebackup =
basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?;
tracing::Span::current().record("lsn", &basebackup.lsn.to_string().as_str());
tracing::Span::current().record("lsn", basebackup.lsn.to_string().as_str());
basebackup.send_tarball()
})?;
pgb.write_message(&BeMessage::CopyDone)?;
@@ -941,7 +948,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
/// ensures that queries don't fail immediately after pageserver startup, because
/// all tenants are still loading.
async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenant>> {
let tenant = tenant_mgr::get_tenant(tenant_id, false)?;
let tenant = mgr::get_tenant(tenant_id, false).await?;
match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await {
Ok(wait_result) => wait_result
// no .context(), the error message is good enough and some tests depend on it

View File

@@ -6,11 +6,12 @@
//! walingest.rs handles a few things like implicit relation creation and extension.
//! Clarify that)
//!
use super::tenant::PageReconstructResult;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::repository::*;
use crate::tenant::Timeline;
use crate::tenant::{with_ondemand_download, Timeline};
use crate::walrecord::NeonWalRecord;
use anyhow::{bail, ensure, Result};
use crate::{repository::*, try_no_ondemand_download};
use anyhow::Context;
use bytes::{Buf, Bytes};
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
@@ -19,6 +20,7 @@ use postgres_ffi::{Oid, TimestampTz, TransactionId};
use serde::{Deserialize, Serialize};
use std::collections::{hash_map, HashMap, HashSet};
use std::ops::Range;
use tokio_util::sync::CancellationToken;
use tracing::{debug, trace, warn};
use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -33,6 +35,14 @@ pub enum LsnForTimestamp {
NoData(Lsn),
}
#[derive(Debug, thiserror::Error)]
pub enum CalculateLogicalSizeError {
#[error("cancelled")]
Cancelled,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
///
/// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
/// and other special kinds of files, in a versioned key-value store. The
@@ -88,16 +98,18 @@ impl Timeline {
blknum: BlockNumber,
lsn: Lsn,
latest: bool,
) -> Result<Bytes> {
ensure!(tag.relnode != 0, "invalid relnode");
) -> PageReconstructResult<Bytes> {
if tag.relnode == 0 {
return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
}
let nblocks = self.get_rel_size(tag, lsn, latest)?;
let nblocks = try_no_ondemand_download!(self.get_rel_size(tag, lsn, latest));
if blknum >= nblocks {
debug!(
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
tag, blknum, lsn, nblocks
);
return Ok(ZERO_PAGE.clone());
return PageReconstructResult::Success(ZERO_PAGE.clone());
}
let key = rel_block_to_key(tag, blknum);
@@ -105,38 +117,51 @@ impl Timeline {
}
// Get size of a database in blocks
pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn, latest: bool) -> Result<usize> {
pub fn get_db_size(
&self,
spcnode: Oid,
dbnode: Oid,
lsn: Lsn,
latest: bool,
) -> PageReconstructResult<usize> {
let mut total_blocks = 0;
let rels = self.list_rels(spcnode, dbnode, lsn)?;
let rels = try_no_ondemand_download!(self.list_rels(spcnode, dbnode, lsn));
for rel in rels {
let n_blocks = self.get_rel_size(rel, lsn, latest)?;
let n_blocks = try_no_ondemand_download!(self.get_rel_size(rel, lsn, latest));
total_blocks += n_blocks as usize;
}
Ok(total_blocks)
PageReconstructResult::Success(total_blocks)
}
/// Get size of a relation file
pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn, latest: bool) -> Result<BlockNumber> {
ensure!(tag.relnode != 0, "invalid relnode");
pub fn get_rel_size(
&self,
tag: RelTag,
lsn: Lsn,
latest: bool,
) -> PageReconstructResult<BlockNumber> {
if tag.relnode == 0 {
return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
}
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
return Ok(nblocks);
return PageReconstructResult::Success(nblocks);
}
if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
&& !self.get_rel_exists(tag, lsn, latest)?
&& !try_no_ondemand_download!(self.get_rel_exists(tag, lsn, latest))
{
// FIXME: Postgres sometimes calls smgrcreate() to create
// FSM, and smgrnblocks() on it immediately afterwards,
// without extending it. Tolerate that by claiming that
// any non-existent FSM fork has size 0.
return Ok(0);
return PageReconstructResult::Success(0);
}
let key = rel_size_to_key(tag);
let mut buf = self.get(key, lsn)?;
let mut buf = try_no_ondemand_download!(self.get(key, lsn));
let nblocks = buf.get_u32_le();
if latest {
@@ -149,43 +174,62 @@ impl Timeline {
// associated with most recent value of LSN.
self.update_cached_rel_size(tag, lsn, nblocks);
}
Ok(nblocks)
PageReconstructResult::Success(nblocks)
}
/// Does relation exist?
pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn, _latest: bool) -> Result<bool> {
ensure!(tag.relnode != 0, "invalid relnode");
pub fn get_rel_exists(
&self,
tag: RelTag,
lsn: Lsn,
_latest: bool,
) -> PageReconstructResult<bool> {
if tag.relnode == 0 {
return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
}
// first try to lookup relation in cache
if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
return Ok(true);
return PageReconstructResult::Success(true);
}
// fetch directory listing
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
let buf = self.get(key, lsn)?;
let dir = RelDirectory::des(&buf)?;
let buf = try_no_ondemand_download!(self.get(key, lsn));
let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some();
Ok(exists)
match RelDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some();
PageReconstructResult::Success(exists)
}
Err(e) => PageReconstructResult::from(e),
}
}
/// Get a list of all existing relations in given tablespace and database.
pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<HashSet<RelTag>> {
pub fn list_rels(
&self,
spcnode: Oid,
dbnode: Oid,
lsn: Lsn,
) -> PageReconstructResult<HashSet<RelTag>> {
// fetch directory listing
let key = rel_dir_to_key(spcnode, dbnode);
let buf = self.get(key, lsn)?;
let dir = RelDirectory::des(&buf)?;
let buf = try_no_ondemand_download!(self.get(key, lsn));
let rels: HashSet<RelTag> =
HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
spcnode,
dbnode,
relnode: *relnode,
forknum: *forknum,
}));
match RelDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
let rels: HashSet<RelTag> =
HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
spcnode,
dbnode,
relnode: *relnode,
forknum: *forknum,
}));
Ok(rels)
PageReconstructResult::Success(rels)
}
Err(e) => PageReconstructResult::from(e),
}
}
/// Look up given SLRU page version.
@@ -195,7 +239,7 @@ impl Timeline {
segno: u32,
blknum: BlockNumber,
lsn: Lsn,
) -> Result<Bytes> {
) -> PageReconstructResult<Bytes> {
let key = slru_block_to_key(kind, segno, blknum);
self.get(key, lsn)
}
@@ -206,21 +250,30 @@ impl Timeline {
kind: SlruKind,
segno: u32,
lsn: Lsn,
) -> Result<BlockNumber> {
) -> PageReconstructResult<BlockNumber> {
let key = slru_segment_size_to_key(kind, segno);
let mut buf = self.get(key, lsn)?;
Ok(buf.get_u32_le())
let mut buf = try_no_ondemand_download!(self.get(key, lsn));
PageReconstructResult::Success(buf.get_u32_le())
}
/// Get size of an SLRU segment
pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result<bool> {
pub fn get_slru_segment_exists(
&self,
kind: SlruKind,
segno: u32,
lsn: Lsn,
) -> PageReconstructResult<bool> {
// fetch directory listing
let key = slru_dir_to_key(kind);
let buf = self.get(key, lsn)?;
let dir = SlruSegmentDirectory::des(&buf)?;
let buf = try_no_ondemand_download!(self.get(key, lsn));
let exists = dir.segments.get(&segno).is_some();
Ok(exists)
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
let exists = dir.segments.get(&segno).is_some();
PageReconstructResult::Success(exists)
}
Err(e) => PageReconstructResult::from(e),
}
}
/// Locate LSN, such that all transactions that committed before
@@ -230,7 +283,10 @@ impl Timeline {
/// so it's not well defined which LSN you get if there were multiple commits
/// "in flight" at that point in time.
///
pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result<LsnForTimestamp> {
pub fn find_lsn_for_timestamp(
&self,
search_timestamp: TimestampTz,
) -> PageReconstructResult<LsnForTimestamp> {
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
let min_lsn = *gc_cutoff_lsn_guard;
let max_lsn = self.get_last_record_lsn();
@@ -246,12 +302,12 @@ impl Timeline {
// cannot overflow, high and low are both smaller than u64::MAX / 2
let mid = (high + low) / 2;
let cmp = self.is_latest_commit_timestamp_ge_than(
let cmp = try_no_ondemand_download!(self.is_latest_commit_timestamp_ge_than(
search_timestamp,
Lsn(mid * 8),
&mut found_smaller,
&mut found_larger,
)?;
));
if cmp {
high = mid;
@@ -263,15 +319,15 @@ impl Timeline {
(false, false) => {
// This can happen if no commit records have been processed yet, e.g.
// just after importing a cluster.
Ok(LsnForTimestamp::NoData(max_lsn))
PageReconstructResult::Success(LsnForTimestamp::NoData(max_lsn))
}
(true, false) => {
// Didn't find any commit timestamps larger than the request
Ok(LsnForTimestamp::Future(max_lsn))
PageReconstructResult::Success(LsnForTimestamp::Future(max_lsn))
}
(false, true) => {
// Didn't find any commit timestamps smaller than the request
Ok(LsnForTimestamp::Past(max_lsn))
PageReconstructResult::Success(LsnForTimestamp::Past(max_lsn))
}
(true, true) => {
// low is the LSN of the first commit record *after* the search_timestamp,
@@ -281,7 +337,7 @@ impl Timeline {
// Otherwise, if you restore to the returned LSN, the database will
// include physical changes from later commits that will be marked
// as aborted, and will need to be vacuumed away.
Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
PageReconstructResult::Success(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
}
}
}
@@ -299,12 +355,20 @@ impl Timeline {
probe_lsn: Lsn,
found_smaller: &mut bool,
found_larger: &mut bool,
) -> Result<bool> {
for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn)? {
let nblocks = self.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)?;
) -> PageReconstructResult<bool> {
for segno in try_no_ondemand_download!(self.list_slru_segments(SlruKind::Clog, probe_lsn)) {
let nblocks = try_no_ondemand_download!(self.get_slru_segment_size(
SlruKind::Clog,
segno,
probe_lsn
));
for blknum in (0..nblocks).rev() {
let clog_page =
self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?;
let clog_page = try_no_ondemand_download!(self.get_slru_page_at_lsn(
SlruKind::Clog,
segno,
blknum,
probe_lsn
));
if clog_page.len() == BLCKSZ as usize + 8 {
let mut timestamp_bytes = [0u8; 8];
@@ -313,61 +377,75 @@ impl Timeline {
if timestamp >= search_timestamp {
*found_larger = true;
return Ok(true);
return PageReconstructResult::Success(true);
} else {
*found_smaller = true;
}
}
}
}
Ok(false)
PageReconstructResult::Success(false)
}
/// Get a list of SLRU segments
pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result<HashSet<u32>> {
pub fn list_slru_segments(
&self,
kind: SlruKind,
lsn: Lsn,
) -> PageReconstructResult<HashSet<u32>> {
// fetch directory entry
let key = slru_dir_to_key(kind);
let buf = self.get(key, lsn)?;
let dir = SlruSegmentDirectory::des(&buf)?;
Ok(dir.segments)
let buf = try_no_ondemand_download!(self.get(key, lsn));
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => PageReconstructResult::Success(dir.segments),
Err(e) => PageReconstructResult::from(e),
}
}
pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<Bytes> {
pub fn get_relmap_file(
&self,
spcnode: Oid,
dbnode: Oid,
lsn: Lsn,
) -> PageReconstructResult<Bytes> {
let key = relmap_file_key(spcnode, dbnode);
let buf = self.get(key, lsn)?;
Ok(buf)
let buf = try_no_ondemand_download!(self.get(key, lsn));
PageReconstructResult::Success(buf)
}
pub fn list_dbdirs(&self, lsn: Lsn) -> Result<HashMap<(Oid, Oid), bool>> {
pub fn list_dbdirs(&self, lsn: Lsn) -> PageReconstructResult<HashMap<(Oid, Oid), bool>> {
// fetch directory entry
let buf = self.get(DBDIR_KEY, lsn)?;
let dir = DbDirectory::des(&buf)?;
let buf = try_no_ondemand_download!(self.get(DBDIR_KEY, lsn));
Ok(dir.dbdirs)
match DbDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => PageReconstructResult::Success(dir.dbdirs),
Err(e) => PageReconstructResult::from(e),
}
}
pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result<Bytes> {
pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> PageReconstructResult<Bytes> {
let key = twophase_file_key(xid);
let buf = self.get(key, lsn)?;
Ok(buf)
let buf = try_no_ondemand_download!(self.get(key, lsn));
PageReconstructResult::Success(buf)
}
pub fn list_twophase_files(&self, lsn: Lsn) -> Result<HashSet<TransactionId>> {
pub fn list_twophase_files(&self, lsn: Lsn) -> PageReconstructResult<HashSet<TransactionId>> {
// fetch directory entry
let buf = self.get(TWOPHASEDIR_KEY, lsn)?;
let dir = TwoPhaseDirectory::des(&buf)?;
let buf = try_no_ondemand_download!(self.get(TWOPHASEDIR_KEY, lsn));
Ok(dir.xids)
match TwoPhaseDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => PageReconstructResult::Success(dir.xids),
Err(e) => PageReconstructResult::from(e),
}
}
pub fn get_control_file(&self, lsn: Lsn) -> Result<Bytes> {
pub fn get_control_file(&self, lsn: Lsn) -> PageReconstructResult<Bytes> {
self.get(CONTROLFILE_KEY, lsn)
}
pub fn get_checkpoint(&self, lsn: Lsn) -> Result<Bytes> {
pub fn get_checkpoint(&self, lsn: Lsn) -> PageReconstructResult<Bytes> {
self.get(CHECKPOINT_KEY, lsn)
}
@@ -376,16 +454,26 @@ impl Timeline {
///
/// Only relation blocks are counted currently. That excludes metadata,
/// SLRUs, twophase files etc.
pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<u64> {
pub async fn get_current_logical_size_non_incremental(
&self,
lsn: Lsn,
cancel: CancellationToken,
) -> Result<u64, CalculateLogicalSizeError> {
// Fetch list of database dirs and iterate them
let buf = self.get(DBDIR_KEY, lsn)?;
let dbdir = DbDirectory::des(&buf)?;
let buf = self.get_download(DBDIR_KEY, lsn).await?;
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
let mut total_size: u64 = 0;
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
for rel in self.list_rels(*spcnode, *dbnode, lsn)? {
for rel in
crate::tenant::with_ondemand_download(|| self.list_rels(*spcnode, *dbnode, lsn))
.await?
{
if cancel.is_cancelled() {
return Err(CalculateLogicalSizeError::Cancelled);
}
let relsize_key = rel_size_to_key(rel);
let mut buf = self.get(relsize_key, lsn)?;
let mut buf = self.get_download(relsize_key, lsn).await?;
let relsize = buf.get_u32_le();
total_size += relsize as u64;
@@ -398,7 +486,7 @@ impl Timeline {
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
/// Anything that's not listed maybe removed from the underlying storage (from
/// that LSN forwards).
pub fn collect_keyspace(&self, lsn: Lsn) -> Result<KeySpace> {
pub async fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result<KeySpace> {
// Iterate through key ranges, greedily packing them into partitions
let mut result = KeySpaceAccum::new();
@@ -406,8 +494,8 @@ impl Timeline {
result.add_key(DBDIR_KEY);
// Fetch list of database dirs and iterate them
let buf = self.get(DBDIR_KEY, lsn)?;
let dbdir = DbDirectory::des(&buf)?;
let buf = self.get_download(DBDIR_KEY, lsn).await?;
let dbdir = DbDirectory::des(&buf).context("deserialization failure")?;
let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
dbs.sort_unstable();
@@ -415,15 +503,15 @@ impl Timeline {
result.add_key(relmap_file_key(spcnode, dbnode));
result.add_key(rel_dir_to_key(spcnode, dbnode));
let mut rels: Vec<RelTag> = self
.list_rels(spcnode, dbnode, lsn)?
.iter()
.cloned()
.collect();
let mut rels: Vec<RelTag> =
with_ondemand_download(|| self.list_rels(spcnode, dbnode, lsn))
.await?
.into_iter()
.collect();
rels.sort_unstable();
for rel in rels {
let relsize_key = rel_size_to_key(rel);
let mut buf = self.get(relsize_key, lsn)?;
let mut buf = self.get_download(relsize_key, lsn).await?;
let relsize = buf.get_u32_le();
result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize));
@@ -439,13 +527,13 @@ impl Timeline {
] {
let slrudir_key = slru_dir_to_key(kind);
result.add_key(slrudir_key);
let buf = self.get(slrudir_key, lsn)?;
let dir = SlruSegmentDirectory::des(&buf)?;
let buf = self.get_download(slrudir_key, lsn).await?;
let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?;
let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
segments.sort_unstable();
for segno in segments {
let segsize_key = slru_segment_size_to_key(kind, segno);
let mut buf = self.get(segsize_key, lsn)?;
let mut buf = self.get_download(segsize_key, lsn).await?;
let segsize = buf.get_u32_le();
result.add_range(
@@ -457,8 +545,8 @@ impl Timeline {
// Then pg_twophase
result.add_key(TWOPHASEDIR_KEY);
let buf = self.get(TWOPHASEDIR_KEY, lsn)?;
let twophase_dir = TwoPhaseDirectory::des(&buf)?;
let buf = self.get_download(TWOPHASEDIR_KEY, lsn).await?;
let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?;
let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
xids.sort_unstable();
for xid in xids {
@@ -537,7 +625,7 @@ impl<'a> DatadirModification<'a> {
///
/// This inserts the directory metadata entries that are assumed to
/// always exist.
pub fn init_empty(&mut self) -> Result<()> {
pub fn init_empty(&mut self) -> anyhow::Result<()> {
let buf = DbDirectory::ser(&DbDirectory {
dbdirs: HashMap::new(),
})?;
@@ -570,8 +658,8 @@ impl<'a> DatadirModification<'a> {
rel: RelTag,
blknum: BlockNumber,
rec: NeonWalRecord,
) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
Ok(())
}
@@ -583,7 +671,7 @@ impl<'a> DatadirModification<'a> {
segno: u32,
blknum: BlockNumber,
rec: NeonWalRecord,
) -> Result<()> {
) -> anyhow::Result<()> {
self.put(
slru_block_to_key(kind, segno, blknum),
Value::WalRecord(rec),
@@ -597,8 +685,8 @@ impl<'a> DatadirModification<'a> {
rel: RelTag,
blknum: BlockNumber,
img: Bytes,
) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
self.put(rel_block_to_key(rel, blknum), Value::Image(img));
Ok(())
}
@@ -609,26 +697,26 @@ impl<'a> DatadirModification<'a> {
segno: u32,
blknum: BlockNumber,
img: Bytes,
) -> Result<()> {
) -> anyhow::Result<()> {
self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img));
Ok(())
}
/// Store a relmapper file (pg_filenode.map) in the repository
pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> Result<()> {
pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> anyhow::Result<()> {
// Add it to the directory (if it doesn't exist already)
let buf = self.get(DBDIR_KEY)?;
let buf = self.get(DBDIR_KEY).no_ondemand_download()?;
let mut dbdir = DbDirectory::des(&buf)?;
let r = dbdir.dbdirs.insert((spcnode, dbnode), true);
if r == None || r == Some(false) {
if r.is_none() || r == Some(false) {
// The dbdir entry didn't exist, or it contained a
// 'false'. The 'insert' call already updated it with
// 'true', now write the updated 'dbdirs' map back.
let buf = DbDirectory::ser(&dbdir)?;
self.put(DBDIR_KEY, Value::Image(buf.into()));
}
if r == None {
if r.is_none() {
// Create RelDirectory
let buf = RelDirectory::ser(&RelDirectory {
rels: HashSet::new(),
@@ -643,12 +731,12 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> Result<()> {
pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> anyhow::Result<()> {
// Add it to the directory entry
let buf = self.get(TWOPHASEDIR_KEY)?;
let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?;
let mut dir = TwoPhaseDirectory::des(&buf)?;
if !dir.xids.insert(xid) {
bail!("twophase file for xid {} already exists", xid);
anyhow::bail!("twophase file for xid {} already exists", xid);
}
self.put(
TWOPHASEDIR_KEY,
@@ -659,23 +747,26 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
pub fn put_control_file(&mut self, img: Bytes) -> Result<()> {
pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
self.put(CONTROLFILE_KEY, Value::Image(img));
Ok(())
}
pub fn put_checkpoint(&mut self, img: Bytes) -> Result<()> {
pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> {
self.put(CHECKPOINT_KEY, Value::Image(img));
Ok(())
}
pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> {
pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> {
let req_lsn = self.tline.get_last_record_lsn();
let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn, true)?;
let total_blocks = self
.tline
.get_db_size(spcnode, dbnode, req_lsn, true)
.no_ondemand_download()?;
// Remove entry from dbdir
let buf = self.get(DBDIR_KEY)?;
let buf = self.get(DBDIR_KEY).no_ondemand_download()?;
let mut dir = DbDirectory::des(&buf)?;
if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
let buf = DbDirectory::ser(&dir)?;
@@ -698,11 +789,11 @@ impl<'a> DatadirModification<'a> {
/// Create a relation fork.
///
/// 'nblocks' is the initial size.
pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
// It's possible that this is the first rel for this db in this
// tablespace. Create the reldir entry for it if so.
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY)?)?;
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).no_ondemand_download()?)?;
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
// Didn't exist. Update dbdir
@@ -714,12 +805,12 @@ impl<'a> DatadirModification<'a> {
RelDirectory::default()
} else {
// reldir already exists, fetch it
RelDirectory::des(&self.get(rel_dir_key)?)?
RelDirectory::des(&self.get(rel_dir_key).no_ondemand_download()?)?
};
// Add the new relation to the rel directory entry, and write it back
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
bail!("rel {} already exists", rel);
anyhow::bail!("rel {rel} already exists");
}
self.put(
rel_dir_key,
@@ -742,13 +833,17 @@ impl<'a> DatadirModification<'a> {
}
/// Truncate relation
pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
let last_lsn = self.tline.get_last_record_lsn();
if self.tline.get_rel_exists(rel, last_lsn, true)? {
if self
.tline
.get_rel_exists(rel, last_lsn, true)
.no_ondemand_download()?
{
let size_key = rel_size_to_key(rel);
// Fetch the old size first
let old_size = self.get(size_key)?.get_u32_le();
let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
// Update the entry with the new size.
let buf = nblocks.to_le_bytes();
@@ -768,12 +863,12 @@ impl<'a> DatadirModification<'a> {
/// Extend relation
/// If new size is smaller, do nothing.
pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
// Put size
let size_key = rel_size_to_key(rel);
let old_size = self.get(size_key)?.get_u32_le();
let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
// only extend relation here. never decrease the size
if nblocks > old_size {
@@ -789,12 +884,12 @@ impl<'a> DatadirModification<'a> {
}
/// Drop a relation.
pub fn put_rel_drop(&mut self, rel: RelTag) -> Result<()> {
ensure!(rel.relnode != 0, "invalid relnode");
pub fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
// Remove it from the directory entry
let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
let buf = self.get(dir_key)?;
let buf = self.get(dir_key).no_ondemand_download()?;
let mut dir = RelDirectory::des(&buf)?;
if dir.rels.remove(&(rel.relnode, rel.forknum)) {
@@ -805,7 +900,7 @@ impl<'a> DatadirModification<'a> {
// update logical size
let size_key = rel_size_to_key(rel);
let old_size = self.get(size_key)?.get_u32_le();
let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
self.pending_nblocks -= old_size as i64;
// Remove enty from relation size cache
@@ -822,14 +917,14 @@ impl<'a> DatadirModification<'a> {
kind: SlruKind,
segno: u32,
nblocks: BlockNumber,
) -> Result<()> {
) -> anyhow::Result<()> {
// Add it to the directory entry
let dir_key = slru_dir_to_key(kind);
let buf = self.get(dir_key)?;
let buf = self.get(dir_key).no_ondemand_download()?;
let mut dir = SlruSegmentDirectory::des(&buf)?;
if !dir.segments.insert(segno) {
bail!("slru segment {:?}/{} already exists", kind, segno);
anyhow::bail!("slru segment {kind:?}/{segno} already exists");
}
self.put(
dir_key,
@@ -852,7 +947,7 @@ impl<'a> DatadirModification<'a> {
kind: SlruKind,
segno: u32,
nblocks: BlockNumber,
) -> Result<()> {
) -> anyhow::Result<()> {
// Put size
let size_key = slru_segment_size_to_key(kind, segno);
let buf = nblocks.to_le_bytes();
@@ -861,10 +956,10 @@ impl<'a> DatadirModification<'a> {
}
/// This method is used for marking truncated SLRU files
pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> Result<()> {
pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> {
// Remove it from the directory entry
let dir_key = slru_dir_to_key(kind);
let buf = self.get(dir_key)?;
let buf = self.get(dir_key).no_ondemand_download()?;
let mut dir = SlruSegmentDirectory::des(&buf)?;
if !dir.segments.remove(&segno) {
@@ -882,15 +977,15 @@ impl<'a> DatadirModification<'a> {
}
/// Drop a relmapper file (pg_filenode.map)
pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<()> {
pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> {
// TODO
Ok(())
}
/// This method is used for marking truncated SLRU files
pub fn drop_twophase_file(&mut self, xid: TransactionId) -> Result<()> {
pub fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
// Remove it from the directory entry
let buf = self.get(TWOPHASEDIR_KEY)?;
let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?;
let mut dir = TwoPhaseDirectory::des(&buf)?;
if !dir.xids.remove(&xid) {
@@ -925,7 +1020,7 @@ impl<'a> DatadirModification<'a> {
/// retains all the metadata, but data pages are flushed. That's again OK
/// for bulk import, where you are just loading data pages and won't try to
/// modify the same pages twice.
pub fn flush(&mut self) -> Result<()> {
pub fn flush(&mut self) -> anyhow::Result<()> {
// Unless we have accumulated a decent amount of changes, it's not worth it
// to scan through the pending_updates list.
let pending_nblocks = self.pending_nblocks;
@@ -936,7 +1031,7 @@ impl<'a> DatadirModification<'a> {
let writer = self.tline.writer();
// Flush relation and SLRU data blocks, keep metadata.
let mut result: Result<()> = Ok(());
let mut result: anyhow::Result<()> = Ok(());
self.pending_updates.retain(|&key, value| {
if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
result = writer.put(key, self.lsn, value);
@@ -984,7 +1079,7 @@ impl<'a> DatadirModification<'a> {
// Internal helper functions to batch the modifications
fn get(&self, key: Key) -> Result<Bytes> {
fn get(&self, key: Key) -> PageReconstructResult<Bytes> {
// Have we already updated the same key? Read the pending updated
// version in that case.
//
@@ -992,14 +1087,14 @@ impl<'a> DatadirModification<'a> {
// value that has been removed, deletion only avoids leaking storage.
if let Some(value) = self.pending_updates.get(&key) {
if let Value::Image(img) = value {
Ok(img.clone())
PageReconstructResult::Success(img.clone())
} else {
// Currently, we never need to read back a WAL record that we
// inserted in the same "transaction". All the metadata updates
// work directly with Images, and we never need to read actual
// data pages. We could handle this if we had to, by calling
// the walredo manager, but let's keep it simple for now.
bail!("unexpected pending WAL record");
PageReconstructResult::from(anyhow::anyhow!("unexpected pending WAL record"))
}
} else {
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
@@ -1327,7 +1422,7 @@ fn twophase_key_range(xid: TransactionId) -> Range<Key> {
field2: 0,
field3: 0,
field4: 0,
field5: if overflowed { 1 } else { 0 },
field5: u8::from(overflowed),
field6: next_xid,
}
}
@@ -1354,7 +1449,7 @@ const CHECKPOINT_KEY: Key = Key {
// Reverse mappings for a few Keys.
// These are needed by WAL redo manager.
pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> {
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
Ok(match key.field1 {
0x00 => (
RelTag {
@@ -1365,7 +1460,7 @@ pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> {
},
key.field6,
),
_ => bail!("unexpected value kind 0x{:02x}", key.field1),
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
})
}
@@ -1384,21 +1479,21 @@ pub fn is_rel_vm_block_key(key: Key) -> bool {
&& key.field6 != 0xffffffff
}
pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
Ok(match key.field1 {
0x01 => {
let kind = match key.field2 {
0x00 => SlruKind::Clog,
0x01 => SlruKind::MultiXactMembers,
0x02 => SlruKind::MultiXactOffsets,
_ => bail!("unrecognized slru kind 0x{:02x}", key.field2),
_ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
};
let segno = key.field4;
let blknum = key.field6;
(kind, segno, blknum)
}
_ => bail!("unexpected value kind 0x{:02x}", key.field1),
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
})
}
@@ -1413,7 +1508,7 @@ pub fn create_test_timeline(
tenant: &crate::tenant::Tenant,
timeline_id: utils::id::TimelineId,
pg_version: u32,
) -> Result<std::sync::Arc<Timeline>> {
) -> anyhow::Result<std::sync::Arc<Timeline>> {
let tline = tenant
.create_empty_timeline(timeline_id, Lsn(8), pg_version)?
.initialize()?;

View File

@@ -1,38 +0,0 @@
//! Helper functions to delete files from remote storage with a RemoteStorage
use anyhow::Context;
use std::path::Path;
use tracing::debug;
use remote_storage::GenericRemoteStorage;
pub(super) async fn delete_layer(
storage: &GenericRemoteStorage,
local_layer_path: &Path,
) -> anyhow::Result<()> {
fail::fail_point!("before-delete-layer", |_| {
anyhow::bail!("failpoint before-delete-layer")
});
debug!(
"Deleting layer from remote storage: {:?}",
local_layer_path.display()
);
let storage_path = storage
.remote_object_id(local_layer_path)
.with_context(|| {
format!(
"Failed to get the layer storage path for local path '{}'",
local_layer_path.display()
)
})?;
// XXX: If the deletion fails because the object already didn't exist,
// it would be good to just issue a warning but consider it success.
// https://github.com/neondatabase/neon/issues/2934
storage.delete(&storage_path).await.with_context(|| {
format!(
"Failed to delete remote layer from storage at '{:?}'",
storage_path
)
})
}

View File

@@ -1,257 +0,0 @@
//! Helper functions to download files from remote storage with a RemoteStorage
use std::collections::HashSet;
use std::path::Path;
use anyhow::{bail, Context};
use futures::stream::{FuturesUnordered, StreamExt};
use tokio::fs;
use tokio::io::AsyncWriteExt;
use tracing::debug;
use crate::config::PageServerConf;
use crate::storage_sync::index::LayerFileMetadata;
use remote_storage::{DownloadError, GenericRemoteStorage};
use utils::crashsafe::path_with_suffix_extension;
use utils::id::{TenantId, TimelineId};
use super::index::IndexPart;
use super::RelativePath;
async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
fs::File::open(path).await?.sync_all().await
}
///
/// If 'metadata' is given, we will validate that the downloaded file's size matches that
/// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
///
/// Returns the size of the downloaded file.
pub async fn download_layer_file<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
tenant_id: TenantId,
timeline_id: TimelineId,
path: &'a RelativePath,
layer_metadata: &'a LayerFileMetadata,
) -> anyhow::Result<u64> {
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
let local_path = path.to_local_path(&timeline_path);
let layer_storage_path = storage.remote_object_id(&local_path).with_context(|| {
format!(
"Failed to get the layer storage path for local path '{}'",
local_path.display()
)
})?;
// Perform a rename inspired by durable_rename from file_utils.c.
// The sequence:
// write(tmp)
// fsync(tmp)
// rename(tmp, new)
// fsync(new)
// fsync(parent)
// For more context about durable_rename check this email from postgres mailing list:
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
// TODO: this doesn't use the cached fd for some reason?
let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
format!(
"Failed to create a destination file for layer '{}'",
temp_file_path.display()
)
})?;
let mut download = storage
.download(&layer_storage_path)
.await
.with_context(|| {
format!(
"Failed to open a download stream for layer with remote storage path '{layer_storage_path:?}'"
)
})?;
let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
format!(
"Failed to download layer with remote storage path '{layer_storage_path:?}' into file '{}'", temp_file_path.display()
)
})?;
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
// A file will not be closed immediately when it goes out of scope if there are any IO operations
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
// you should call flush before dropping it.
//
// From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
// we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
// But for additional safety lets check/wait for any pending operations.
destination_file.flush().await.with_context(|| {
format!(
"failed to flush source file at {}",
temp_file_path.display()
)
})?;
match layer_metadata.file_size() {
Some(expected) if expected != bytes_amount => {
anyhow::bail!(
"According to layer file metadata should had downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
temp_file_path.display()
);
}
Some(_) | None => {
// matches, or upgrading from an earlier IndexPart version
}
}
// not using sync_data because it can lose file size update
destination_file.sync_all().await.with_context(|| {
format!(
"failed to fsync source file at {}",
temp_file_path.display()
)
})?;
drop(destination_file);
fail::fail_point!("remote-storage-download-pre-rename", |_| {
bail!("remote-storage-download-pre-rename failpoint triggered")
});
fs::rename(&temp_file_path, &local_path).await?;
fsync_path(&local_path)
.await
.with_context(|| format!("Could not fsync layer file {}", local_path.display(),))?;
tracing::info!("download complete: {}", local_path.display());
Ok(bytes_amount)
}
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
pub fn is_temp_download_file(path: &Path) -> bool {
let extension = path.extension().map(|pname| {
pname
.to_str()
.expect("paths passed to this function must be valid Rust strings")
});
match extension {
Some(TEMP_DOWNLOAD_EXTENSION) => true,
Some(_) => false,
None => false,
}
}
/// List timelines of given tenant in remote storage
pub async fn list_remote_timelines<'a>(
storage: &'a GenericRemoteStorage,
conf: &'static PageServerConf,
tenant_id: TenantId,
) -> anyhow::Result<Vec<(TimelineId, IndexPart)>> {
let tenant_path = conf.timelines_path(&tenant_id);
let tenant_storage_path = storage.remote_object_id(&tenant_path).with_context(|| {
format!(
"Failed to get tenant storage path for local path '{}'",
tenant_path.display()
)
})?;
let timelines = storage
.list_prefixes(Some(&tenant_storage_path))
.await
.with_context(|| {
format!(
"Failed to list tenant storage path {tenant_storage_path:?} to get remote timelines to download"
)
})?;
if timelines.is_empty() {
anyhow::bail!("no timelines found on the remote storage")
}
let mut timeline_ids = HashSet::new();
let mut part_downloads = FuturesUnordered::new();
for timeline_remote_storage_key in timelines {
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
})?;
let timeline_id: TimelineId = object_name.parse().with_context(|| {
format!("failed to parse object name into timeline id '{object_name}'")
})?;
// list_prefixes returns all files with the prefix. If we haven't seen this timeline ID
// yet, launch a download task for it.
if !timeline_ids.contains(&timeline_id) {
timeline_ids.insert(timeline_id);
let storage_clone = storage.clone();
part_downloads.push(async move {
(
timeline_id,
download_index_part(conf, &storage_clone, tenant_id, timeline_id).await,
)
});
}
}
// Wait for all the download tasks to complete.
let mut timeline_parts = Vec::new();
while let Some((timeline_id, part_upload_result)) = part_downloads.next().await {
let index_part = part_upload_result
.with_context(|| format!("Failed to fetch index part for timeline {timeline_id}"))?;
debug!("Successfully fetched index part for timeline {timeline_id}");
timeline_parts.push((timeline_id, index_part));
}
Ok(timeline_parts)
}
pub async fn download_index_part(
conf: &'static PageServerConf,
storage: &GenericRemoteStorage,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<IndexPart, DownloadError> {
let index_part_path = conf
.metadata_path(timeline_id, tenant_id)
.with_file_name(IndexPart::FILE_NAME);
let part_storage_path = storage
.remote_object_id(&index_part_path)
.with_context(|| {
format!(
"Failed to get the index part storage path for local path '{}'",
index_part_path.display()
)
})
.map_err(DownloadError::BadInput)?;
let mut index_part_download = storage.download(&part_storage_path).await?;
let mut index_part_bytes = Vec::new();
tokio::io::copy(
&mut index_part_download.download_stream,
&mut index_part_bytes,
)
.await
.with_context(|| {
format!(
"Failed to download an index part into file '{}'",
index_part_path.display()
)
})
.map_err(DownloadError::Other)?;
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
.with_context(|| {
format!(
"Failed to deserialize index part file into file '{}'",
index_part_path.display()
)
})
.map_err(DownloadError::Other)?;
Ok(index_part)
}

View File

@@ -25,7 +25,6 @@
//! the current task has been requested to shut down. You can use that with
//! Tokio select!().
//!
//!
//! TODO: This would be a good place to also handle panics in a somewhat sane way.
//! Depending on what task panics, we might want to kill the whole server, or
//! only a single tenant or timeline.
@@ -36,6 +35,7 @@
#![allow(clippy::declare_interior_mutable_const)]
use std::collections::HashMap;
use std::fmt;
use std::future::Future;
use std::panic::AssertUnwindSafe;
use std::sync::atomic::{AtomicU64, Ordering};
@@ -43,9 +43,9 @@ use std::sync::{Arc, Mutex};
use futures::FutureExt;
use tokio::runtime::Runtime;
use tokio::sync::watch;
use tokio::task::JoinHandle;
use tokio::task_local;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, warn};
@@ -71,7 +71,7 @@ use crate::shutdown_pageserver;
//
// WAL receiver runtime:
// - used to handle WAL receiver connections.
// - and to receiver updates from etcd
// - and to receiver updates from storage_broker
//
// Background runtime
// - layer flushing
@@ -135,22 +135,28 @@ pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
.expect("Failed to create background op runtime")
});
#[derive(Debug, Clone, Copy)]
pub struct PageserverTaskId(u64);
impl fmt::Display for PageserverTaskId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.0.fmt(f)
}
}
/// Each task that we track is associated with a "task ID". It's just an
/// increasing number that we assign. Note that it is different from tokio::task::Id.
static NEXT_TASK_ID: Lazy<AtomicU64> = Lazy::new(|| AtomicU64::new(1));
static NEXT_TASK_ID: AtomicU64 = AtomicU64::new(1);
/// Global registry of tasks
static TASKS: Lazy<Mutex<HashMap<u64, Arc<PageServerTask>>>> =
Lazy::new(|| Mutex::new(HashMap::new()));
task_local! {
// There is a Tokio watch channel for each task, which can be used to signal the
// task that it needs to shut down. This task local variable holds the receiving
// end of the channel. The sender is kept in the global registry, so that anyone
// can send the signal to request task shutdown.
static SHUTDOWN_RX: watch::Receiver<bool>;
// This is a cancellation token which will be cancelled when a task needs to shut down. The
// root token is kept in the global registry, so that anyone can send the signal to request
// task shutdown.
static SHUTDOWN_TOKEN: CancellationToken;
// Each task holds reference to its own PageServerTask here.
static CURRENT_TASK: Arc<PageServerTask>;
@@ -178,7 +184,7 @@ pub enum TaskKind {
PageRequestHandler,
// Manages the WAL receiver connection for one timeline. It subscribes to
// events from etcd, decides which safekeeper to connect to. It spawns a
// events from storage_broker, decides which safekeeper to connect to. It spawns a
// separate WalReceiverConnection task to handle each connection.
WalReceiverManager,
@@ -200,11 +206,20 @@ pub enum TaskKind {
// Task that uploads a file to remote storage
RemoteUploadTask,
// Task that downloads a file from remote storage
RemoteDownloadTask,
// task that handles the initial downloading of all tenants
InitialLoad,
// task that handles attaching a tenant
Attach,
// task that handhes metrics collection
MetricsCollection,
// task that drives downloading layers
DownloadAllRemoteLayers,
}
#[derive(Default)]
@@ -226,8 +241,8 @@ struct PageServerTask {
name: String,
// To request task shutdown, send 'true' to the channel to notify the task.
shutdown_tx: watch::Sender<bool>,
// To request task shutdown, just cancel this token.
cancel: CancellationToken,
mutable: Mutex<MutableTaskState>,
}
@@ -247,13 +262,13 @@ pub fn spawn<F>(
where
F: Future<Output = anyhow::Result<()>> + Send + 'static,
{
let (shutdown_tx, shutdown_rx) = watch::channel(false);
let cancel = CancellationToken::new();
let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed);
let task = Arc::new(PageServerTask {
task_id: PageserverTaskId(task_id),
kind,
name: name.to_string(),
shutdown_tx,
cancel: cancel.clone(),
mutable: Mutex::new(MutableTaskState {
tenant_id,
timeline_id,
@@ -271,7 +286,7 @@ where
task_name,
task_id,
task_cloned,
shutdown_rx,
cancel,
shutdown_process_on_error,
future,
));
@@ -288,7 +303,7 @@ async fn task_wrapper<F>(
task_name: String,
task_id: u64,
task: Arc<PageServerTask>,
shutdown_rx: watch::Receiver<bool>,
shutdown_token: CancellationToken,
shutdown_process_on_error: bool,
future: F,
) where
@@ -296,9 +311,9 @@ async fn task_wrapper<F>(
{
debug!("Starting task '{}'", task_name);
let result = SHUTDOWN_RX
let result = SHUTDOWN_TOKEN
.scope(
shutdown_rx,
shutdown_token,
CURRENT_TASK.scope(task, {
// We use AssertUnwindSafe here so that the payload function
// doesn't need to be UnwindSafe. We don't do anything after the
@@ -408,7 +423,7 @@ pub async fn shutdown_tasks(
&& (tenant_id.is_none() || task_mut.tenant_id == tenant_id)
&& (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
{
let _ = task.shutdown_tx.send_replace(true);
task.cancel.cancel();
victim_tasks.push(Arc::clone(task));
}
}
@@ -436,24 +451,35 @@ pub fn current_task_kind() -> Option<TaskKind> {
CURRENT_TASK.try_with(|ct| ct.kind).ok()
}
pub fn current_task_id() -> Option<PageserverTaskId> {
CURRENT_TASK.try_with(|ct| ct.task_id).ok()
}
/// A Future that can be used to check if the current task has been requested to
/// shut down.
pub async fn shutdown_watcher() {
let mut shutdown_rx = SHUTDOWN_RX
.try_with(|rx| rx.clone())
let token = SHUTDOWN_TOKEN
.try_with(|t| t.clone())
.expect("shutdown_requested() called in an unexpected task or thread");
while !*shutdown_rx.borrow() {
if shutdown_rx.changed().await.is_err() {
break;
}
}
token.cancelled().await;
}
/// Clone the current task's cancellation token, which can be moved across tasks.
///
/// When the task which is currently executing is shutdown, the cancellation token will be
/// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or
/// `tokio::task::JoinSet::spawn`.
pub fn shutdown_token() -> CancellationToken {
SHUTDOWN_TOKEN
.try_with(|t| t.clone())
.expect("shutdown_token() called in an unexpected task or thread")
}
/// Has the current task been requested to shut down?
pub fn is_shutdown_requested() -> bool {
if let Ok(shutdown_rx) = SHUTDOWN_RX.try_with(|rx| rx.clone()) {
*shutdown_rx.borrow()
if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) {
cancel.is_cancelled()
} else {
if !cfg!(test) {
warn!("is_shutdown_requested() called in an unexpected task or thread");

File diff suppressed because it is too large Load Diff

View File

@@ -5,7 +5,6 @@
use crate::page_cache;
use crate::page_cache::{ReadBufResult, PAGE_SZ};
use bytes::Bytes;
use once_cell::sync::Lazy;
use std::ops::{Deref, DerefMut};
use std::os::unix::fs::FileExt;
use std::sync::atomic::AtomicU64;
@@ -61,7 +60,7 @@ where
///
/// ```no_run
/// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
/// # let reader: FileBlockReader<std::fs::File> = todo!();
/// # let reader: FileBlockReader<std::fs::File> = unimplemented!("stub");
/// let cursor = reader.block_cursor();
/// let buf = cursor.read_blk(1);
/// // do stuff with 'buf'
@@ -117,7 +116,7 @@ where
}
}
static NEXT_ID: Lazy<AtomicU64> = Lazy::new(|| AtomicU64::new(1));
static NEXT_ID: AtomicU64 = AtomicU64::new(1);
/// An adapter for reading a (virtual) file using the page cache.
///

View File

@@ -185,14 +185,16 @@ impl TenantConfOpt {
if let Some(max_lsn_wal_lag) = other.max_lsn_wal_lag {
self.max_lsn_wal_lag = Some(max_lsn_wal_lag);
}
if let Some(trace_read_requests) = other.trace_read_requests {
self.trace_read_requests = Some(trace_read_requests);
}
}
}
impl TenantConf {
pub fn default() -> TenantConf {
impl Default for TenantConf {
fn default() -> Self {
use defaults::*;
TenantConf {
Self {
checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
.expect("cannot parse default checkpoint timeout"),
@@ -217,29 +219,4 @@ impl TenantConf {
trace_read_requests: false,
}
}
pub fn dummy_conf() -> Self {
TenantConf {
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
checkpoint_timeout: Duration::from_secs(600),
compaction_target_size: 4 * 1024 * 1024,
compaction_period: Duration::from_secs(10),
compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD,
gc_horizon: defaults::DEFAULT_GC_HORIZON,
gc_period: Duration::from_secs(10),
image_creation_threshold: defaults::DEFAULT_IMAGE_CREATION_THRESHOLD,
pitr_interval: Duration::from_secs(60 * 60),
walreceiver_connect_timeout: humantime::parse_duration(
defaults::DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
)
.unwrap(),
lagging_wal_timeout: humantime::parse_duration(
defaults::DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT,
)
.unwrap(),
max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
.unwrap(),
trace_read_requests: false,
}
}
}

View File

@@ -139,7 +139,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
off += keys_len as u64;
let values_off = off as usize;
let values_len = num_children as usize * VALUE_SZ as usize;
let values_len = num_children as usize * VALUE_SZ;
//off += values_len as u64;
let prefix = &buf[prefix_off..prefix_off + prefix_len as usize];
@@ -177,7 +177,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
while low < high {
let mid = low + size / 2;
let key_off = mid as usize * self.suffix_len as usize;
let key_off = mid * self.suffix_len as usize;
let suffix = &self.keys[key_off..key_off + self.suffix_len as usize];
// Does this match?
keybuf[self.prefix_len as usize..].copy_from_slice(suffix);
@@ -328,7 +328,7 @@ where
while idx < node.num_children as usize {
let suffix = &node.keys[key_off..key_off + suffix_len];
keybuf[prefix_len..].copy_from_slice(suffix);
let value = node.value(idx as usize);
let value = node.value(idx);
#[allow(clippy::collapsible_if)]
if node.level == 0 {
// leaf
@@ -368,7 +368,7 @@ where
key_off -= suffix_len;
let suffix = &node.keys[key_off..key_off + suffix_len];
keybuf[prefix_len..].copy_from_slice(suffix);
let value = node.value(idx as usize);
let value = node.value(idx);
#[allow(clippy::collapsible_if)]
if node.level == 0 {
// leaf
@@ -629,7 +629,7 @@ impl<const L: usize> BuildNode<L> {
self.keys.extend(&key[self.prefix.len()..]);
self.values.extend(value.0);
assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
self.size += self.suffix_len + VALUE_SZ;
@@ -674,7 +674,7 @@ impl<const L: usize> BuildNode<L> {
self.size -= prefix_len * self.num_children as usize;
self.size += prefix_len;
assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
true
@@ -684,7 +684,7 @@ impl<const L: usize> BuildNode<L> {
/// Serialize the node to on-disk format.
///
fn pack(&self) -> Bytes {
assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
assert!(self.num_children > 0);
@@ -940,7 +940,7 @@ mod tests {
let t = -(f64::ln(u));
let key_int = (t * 1000000.0) as u128;
all_data.insert(key_int as u128, idx as u64);
all_data.insert(key_int, idx as u64);
}
// Build a tree from it

View File

@@ -91,7 +91,7 @@ impl EphemeralFile {
break;
}
off += n as usize;
off += n;
}
Ok(())
}

View File

@@ -12,7 +12,7 @@
use crate::metrics::NUM_ONDISK_LAYERS;
use crate::repository::Key;
use crate::tenant::inmemory_layer::InMemoryLayer;
use crate::tenant::storage_layer::InMemoryLayer;
use crate::tenant::storage_layer::Layer;
use anyhow::Result;
use std::collections::VecDeque;
@@ -25,8 +25,7 @@ use super::bst_layer_map::RetroactiveLayerMap;
///
/// LayerMap tracks what layers exist on a timeline.
///
#[derive(Default)]
pub struct LayerMap {
pub struct LayerMap<L: ?Sized> {
//
// 'open_layer' holds the current InMemoryLayer that is accepting new
// records. If it is None, 'next_open_layer_at' will be set instead, indicating
@@ -47,20 +46,35 @@ pub struct LayerMap {
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
/// Index of the historic layers optimized for search
index: RetroactiveLayerMap<Arc<dyn Layer>>,
index: RetroactiveLayerMap<Arc<L>>,
/// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
/// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
l0_delta_layers: Vec<Arc<dyn Layer>>,
l0_delta_layers: Vec<Arc<L>>,
}
impl<L: ?Sized> Default for LayerMap<L> {
fn default() -> Self {
Self {
open_layer: None,
next_open_layer_at: None,
frozen_layers: VecDeque::default(),
l0_delta_layers: Vec::default(),
index: RetroactiveLayerMap::default(),
}
}
}
/// Return value of LayerMap::search
pub struct SearchResult {
pub layer: Arc<dyn Layer>,
pub struct SearchResult<L: ?Sized> {
pub layer: Arc<L>,
pub lsn_floor: Lsn,
}
impl LayerMap {
impl<L> LayerMap<L>
where
L: ?Sized + Layer,
{
///
/// Find the latest layer that covers the given 'key', with lsn <
/// 'end_lsn'.
@@ -72,39 +86,39 @@ impl LayerMap {
/// contain the version, even if it's missing from the returned
/// layer.
///
pub fn search(&self, key: Key, end_lsn: Lsn) -> Result<Option<SearchResult>> {
pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult<L>> {
match self.index.query(key.to_i128(), end_lsn.0 - 1) {
(None, None) => Ok(None),
(None, None) => None,
(None, Some(image)) => {
let lsn_floor = image.get_lsn_range().start;
Ok(Some(SearchResult {
Some(SearchResult {
layer: image,
lsn_floor,
}))
})
}
(Some(delta), None) => {
let lsn_floor = delta.get_lsn_range().start;
Ok(Some(SearchResult {
Some(SearchResult {
layer: delta,
lsn_floor,
}))
})
}
(Some(delta), Some(image)) => {
let img_lsn = image.get_lsn_range().start;
let image_is_newer = image.get_lsn_range().end > delta.get_lsn_range().end;
let image_exact_match = Lsn(img_lsn.0 + 1) == end_lsn;
if image_is_newer || image_exact_match {
Ok(Some(SearchResult {
Some(SearchResult {
layer: image,
lsn_floor: img_lsn,
}))
})
} else {
let lsn_floor =
std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
Ok(Some(SearchResult {
Some(SearchResult {
layer: delta,
lsn_floor,
}))
})
}
}
}
@@ -113,7 +127,7 @@ impl LayerMap {
///
/// Insert an on-disk layer
///
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
pub fn insert_historic(&mut self, layer: Arc<L>) {
let kr = layer.get_key_range();
let lr = layer.get_lsn_range();
self.index.insert(
@@ -140,7 +154,7 @@ impl LayerMap {
///
/// This should be called when the corresponding file on disk has been deleted.
///
pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
pub fn remove_historic(&mut self, layer: Arc<L>) {
let kr = layer.get_key_range();
let lr = layer.get_lsn_range();
self.index.remove(
@@ -182,7 +196,7 @@ impl LayerMap {
let start = key.start.to_i128();
let end = key.end.to_i128();
let layer_covers = |layer: Option<Arc<dyn Layer>>| match layer {
let layer_covers = |layer: Option<Arc<L>>| match layer {
Some(layer) => layer.get_lsn_range().start >= lsn.start,
None => false,
};
@@ -202,7 +216,7 @@ impl LayerMap {
return Ok(true);
}
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<dyn Layer>> {
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
self.index.iter()
}
@@ -218,7 +232,7 @@ impl LayerMap {
&self,
key_range: &Range<Key>,
lsn: Lsn,
) -> Result<Vec<(Range<Key>, Option<Arc<dyn Layer>>)>> {
) -> Result<Vec<(Range<Key>, Option<Arc<L>>)>> {
let version = match self.index.get_version(lsn.0 - 1) {
Some(v) => v,
None => return Ok(vec![]),
@@ -228,7 +242,7 @@ impl LayerMap {
let end = key_range.end.to_i128();
// Initialize loop variables
let mut coverage: Vec<(Range<Key>, Option<Arc<dyn Layer>>)> = vec![];
let mut coverage: Vec<(Range<Key>, Option<Arc<L>>)> = vec![];
let mut current_key = start.clone();
let mut current_val = version.query(start).1;
@@ -308,7 +322,7 @@ impl LayerMap {
}
/// Return all L0 delta layers
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<dyn Layer>>> {
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
Ok(self.l0_delta_layers.clone())
}

View File

@@ -255,8 +255,7 @@ pub fn save_metadata(
// fsync the parent directory to ensure the directory entry is durable
if first_save {
let timeline_dir = File::open(
&path
.parent()
path.parent()
.expect("Metadata should always have a parent dir"),
)?;
timeline_dir.sync_all()?;

View File

@@ -0,0 +1,494 @@
//! This module acts as a switchboard to access different repositories managed by this
//! page server.
use std::collections::{hash_map, HashMap};
use std::ffi::OsStr;
use std::path::Path;
use std::sync::Arc;
use tokio::fs;
use anyhow::Context;
use once_cell::sync::Lazy;
use tokio::sync::RwLock;
use tracing::*;
use remote_storage::GenericRemoteStorage;
use utils::crashsafe;
use crate::config::PageServerConf;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::TenantConfOpt;
use crate::tenant::{Tenant, TenantState};
use crate::IGNORED_TENANT_FILE_NAME;
use utils::fs_ext::PathExt;
use utils::id::{TenantId, TimelineId};
static TENANTS: Lazy<RwLock<HashMap<TenantId, Arc<Tenant>>>> =
Lazy::new(|| RwLock::new(HashMap::new()));
/// Initialize repositories with locally available timelines.
/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
/// are scheduled for download and added to the tenant once download is completed.
#[instrument(skip(conf, remote_storage))]
pub async fn init_tenant_mgr(
conf: &'static PageServerConf,
remote_storage: Option<GenericRemoteStorage>,
) -> anyhow::Result<()> {
// Scan local filesystem for attached tenants
let mut number_of_tenants = 0;
let tenants_dir = conf.tenants_path();
let mut dir_entries = fs::read_dir(&tenants_dir)
.await
.with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
loop {
match dir_entries.next_entry().await {
Ok(None) => break,
Ok(Some(dir_entry)) => {
let tenant_dir_path = dir_entry.path();
if crate::is_temporary(&tenant_dir_path) {
info!(
"Found temporary tenant directory, removing: {}",
tenant_dir_path.display()
);
if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
error!(
"Failed to remove temporary directory '{}': {:?}",
tenant_dir_path.display(),
e
);
}
} else {
// This case happens if we crash during attach before creating the attach marker file
let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
})?;
if is_empty {
info!("removing empty tenant directory {tenant_dir_path:?}");
if let Err(e) = fs::remove_dir(&tenant_dir_path).await {
error!(
"Failed to remove empty tenant directory '{}': {e:#}",
tenant_dir_path.display()
)
}
continue;
}
let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
if tenant_ignore_mark_file.exists() {
info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
continue;
}
match schedule_local_tenant_processing(
conf,
&tenant_dir_path,
remote_storage.clone(),
) {
Ok(tenant) => {
TENANTS.write().await.insert(tenant.tenant_id(), tenant);
number_of_tenants += 1;
}
Err(e) => {
error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}");
}
}
}
}
Err(e) => {
// On error, print it, but continue with the other tenants. If we error out
// here, the pageserver startup fails altogether, causing outage for *all*
// tenants. That seems worse.
error!(
"Failed to list tenants dir entry in directory {tenants_dir:?}, reason: {e:?}"
);
}
}
}
info!("Processed {number_of_tenants} local tenants at startup");
Ok(())
}
pub fn schedule_local_tenant_processing(
conf: &'static PageServerConf,
tenant_path: &Path,
remote_storage: Option<GenericRemoteStorage>,
) -> anyhow::Result<Arc<Tenant>> {
anyhow::ensure!(
tenant_path.is_dir(),
"Cannot load tenant from path {tenant_path:?}, it either does not exist or not a directory"
);
anyhow::ensure!(
!crate::is_temporary(tenant_path),
"Cannot load tenant from temporary path {tenant_path:?}"
);
anyhow::ensure!(
!tenant_path.is_empty_dir().with_context(|| {
format!("Failed to check whether {tenant_path:?} is an empty dir")
})?,
"Cannot load tenant from empty directory {tenant_path:?}"
);
let tenant_id = tenant_path
.file_name()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TenantId>()
.with_context(|| {
format!("Could not parse tenant id out of the tenant dir name in path {tenant_path:?}")
})?;
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
anyhow::ensure!(
!conf.tenant_ignore_mark_file_path(tenant_id).exists(),
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
);
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
if let Some(remote_storage) = remote_storage {
Tenant::spawn_attach(conf, tenant_id, remote_storage)
} else {
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
Tenant::create_broken_tenant(conf, tenant_id)
}
} else {
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
// Start loading the tenant into memory. It will initially be in Loading state.
Tenant::spawn_load(conf, tenant_id, remote_storage)
};
Ok(tenant)
}
///
/// Shut down all tenants. This runs as part of pageserver shutdown.
///
pub async fn shutdown_all_tenants() {
let tenants_to_shut_down = {
let mut m = TENANTS.write().await;
let mut tenants_to_shut_down = Vec::with_capacity(m.len());
for (_, tenant) in m.drain() {
if tenant.is_active() {
// updates tenant state, forbidding new GC and compaction iterations from starting
tenant.set_stopping();
tenants_to_shut_down.push(tenant)
}
}
drop(m);
tenants_to_shut_down
};
// Shut down all existing walreceiver connections and stop accepting the new ones.
task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await;
// Ok, no background tasks running anymore. Flush any remaining data in
// memory to disk.
//
// We assume that any incoming connections that might request pages from
// the tenant have already been terminated by the caller, so there
// should be no more activity in any of the repositories.
//
// On error, log it but continue with the shutdown for other tenants.
for tenant in tenants_to_shut_down {
let tenant_id = tenant.tenant_id();
debug!("shutdown tenant {tenant_id}");
if let Err(err) = tenant.freeze_and_flush().await {
error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
}
}
}
pub async fn create_tenant(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
tenant_id: TenantId,
remote_storage: Option<GenericRemoteStorage>,
) -> anyhow::Result<Option<Arc<Tenant>>> {
match TENANTS.write().await.entry(tenant_id) {
hash_map::Entry::Occupied(_) => {
debug!("tenant {tenant_id} already exists");
Ok(None)
}
hash_map::Entry::Vacant(v) => {
// Hold the write_tenants() lock, since all of this is local IO.
// If this section ever becomes contentious, introduce a new `TenantState::Creating`.
let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?;
let created_tenant =
schedule_local_tenant_processing(conf, &tenant_directory, remote_storage)?;
let crated_tenant_id = created_tenant.tenant_id();
anyhow::ensure!(
tenant_id == crated_tenant_id,
"loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})",
);
v.insert(Arc::clone(&created_tenant));
Ok(Some(created_tenant))
}
}
}
pub async fn update_tenant_config(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
tenant_id: TenantId,
) -> anyhow::Result<()> {
info!("configuring tenant {tenant_id}");
get_tenant(tenant_id, true)
.await?
.update_tenant_config(tenant_conf);
Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?;
Ok(())
}
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result<Arc<Tenant>> {
let m = TENANTS.read().await;
let tenant = m
.get(&tenant_id)
.with_context(|| format!("Tenant {tenant_id} not found in the local state"))?;
if active_only && !tenant.is_active() {
anyhow::bail!(
"Tenant {tenant_id} is not active. Current state: {:?}",
tenant.current_state()
)
} else {
Ok(Arc::clone(tenant))
}
}
pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> {
match get_tenant(tenant_id, true).await {
Ok(tenant) => {
tenant.delete_timeline(timeline_id).await?;
}
Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"),
}
Ok(())
}
pub async fn detach_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
) -> anyhow::Result<()> {
remove_tenant_from_memory(tenant_id, async {
let local_tenant_directory = conf.tenant_path(&tenant_id);
fs::remove_dir_all(&local_tenant_directory)
.await
.with_context(|| {
format!("Failed to remove local tenant directory {local_tenant_directory:?}")
})?;
Ok(())
})
.await
}
pub async fn load_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
remote_storage: Option<GenericRemoteStorage>,
) -> anyhow::Result<()> {
run_if_no_tenant_in_memory(tenant_id, |vacant_entry| {
let tenant_path = conf.tenant_path(&tenant_id);
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
if tenant_ignore_mark.exists() {
std::fs::remove_file(&tenant_ignore_mark)
.with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
}
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage)
.with_context(|| {
format!("Failed to schedule tenant processing in path {tenant_path:?}")
})?;
vacant_entry.insert(new_tenant);
Ok(())
}).await
}
pub async fn ignore_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
) -> anyhow::Result<()> {
remove_tenant_from_memory(tenant_id, async {
let ignore_mark_file = conf.tenant_ignore_mark_file_path(tenant_id);
fs::File::create(&ignore_mark_file)
.await
.context("Failed to create ignore mark file")
.and_then(|_| {
crashsafe::fsync_file_and_parent(&ignore_mark_file)
.context("Failed to fsync ignore mark file")
})
.with_context(|| format!("Failed to crate ignore mark for tenant {tenant_id}"))?;
Ok(())
})
.await
}
///
/// Get list of tenants, for the mgmt API
///
pub async fn list_tenants() -> Vec<(TenantId, TenantState)> {
TENANTS
.read()
.await
.iter()
.map(|(id, tenant)| (*id, tenant.current_state()))
.collect()
}
/// Execute Attach mgmt API command.
///
/// Downloading all the tenant data is performed in the background, this merely
/// spawns the background task and returns quickly.
pub async fn attach_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
remote_storage: GenericRemoteStorage,
) -> anyhow::Result<()> {
run_if_no_tenant_in_memory(tenant_id, |vacant_entry| {
let tenant_path = conf.tenant_path(&tenant_id);
anyhow::ensure!(
!tenant_path.exists(),
"Cannot attach tenant {tenant_id}, local tenant directory already exists"
);
let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage);
vacant_entry.insert(tenant);
Ok(())
})
.await
}
async fn run_if_no_tenant_in_memory<F, V>(tenant_id: TenantId, run: F) -> anyhow::Result<V>
where
F: FnOnce(hash_map::VacantEntry<TenantId, Arc<Tenant>>) -> anyhow::Result<V>,
{
match TENANTS.write().await.entry(tenant_id) {
hash_map::Entry::Occupied(e) => {
anyhow::bail!(
"tenant {tenant_id} already exists, state: {:?}",
e.get().current_state()
)
}
hash_map::Entry::Vacant(v) => run(v),
}
}
/// Stops and removes the tenant from memory, if it's not [`TenantState::Stopping`] already, bails otherwise.
/// Allows to remove other tenant resources manually, via `tenant_cleanup`.
/// If the cleanup fails, tenant will stay in memory in [`TenantState::Broken`] state, and another removal
/// operation would be needed to remove it.
async fn remove_tenant_from_memory<V, F>(
tenant_id: TenantId,
tenant_cleanup: F,
) -> anyhow::Result<V>
where
F: std::future::Future<Output = anyhow::Result<V>>,
{
// It's important to keep the tenant in memory after the final cleanup, to avoid cleanup races.
// The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
// tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to
// avoid holding the lock for the entire process.
{
let tenants_accessor = TENANTS.write().await;
match tenants_accessor.get(&tenant_id) {
Some(tenant) => match tenant.current_state() {
TenantState::Attaching
| TenantState::Loading
| TenantState::Broken
| TenantState::Active => tenant.set_stopping(),
TenantState::Stopping => {
anyhow::bail!("Tenant {tenant_id} is stopping already")
}
},
None => anyhow::bail!("Tenant not found for id {tenant_id}"),
}
}
// shutdown all tenant and timeline tasks: gc, compaction, page service)
// No new tasks will be started for this tenant because it's in `Stopping` state.
// Hence, once we're done here, the `tenant_cleanup` callback can mutate tenant on-disk state freely.
task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
match tenant_cleanup
.await
.with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
{
Ok(hook_value) => {
let mut tenants_accessor = TENANTS.write().await;
if tenants_accessor.remove(&tenant_id).is_none() {
warn!("Tenant {tenant_id} got removed from memory before operation finished");
}
Ok(hook_value)
}
Err(e) => {
let tenants_accessor = TENANTS.read().await;
match tenants_accessor.get(&tenant_id) {
Some(tenant) => tenant.set_broken(),
None => warn!("Tenant {tenant_id} got removed from memory"),
}
Err(e)
}
}
}
#[cfg(feature = "testing")]
use {
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
utils::http::error::ApiError,
};
#[cfg(feature = "testing")]
pub async fn immediate_gc(
tenant_id: TenantId,
timeline_id: TimelineId,
gc_req: TimelineGcRequest,
) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
let guard = TENANTS.read().await;
let tenant = guard
.get(&tenant_id)
.map(Arc::clone)
.with_context(|| format!("Tenant {tenant_id} not found"))
.map_err(ApiError::NotFound)?;
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
// Use tenant's pitr setting
let pitr = tenant.get_pitr_interval();
// Run in task_mgr to avoid race with detach operation
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::GarbageCollector,
Some(tenant_id),
Some(timeline_id),
&format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
false,
async move {
fail::fail_point!("immediate_gc_task_pre");
let result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr)
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
.await;
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
// better once the types support it.
match task_done.send(result) {
Ok(_) => (),
Err(result) => error!("failed to send gc result: {result:?}"),
}
Ok(())
}
);
// drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
drop(guard);
Ok(wait_task_done)
}

View File

@@ -32,7 +32,8 @@
//! the corresponding remote operation with the timeline's [`RemoteTimelineClient`]:
//!
//! - [`RemoteTimelineClient::schedule_layer_file_upload`] when we've created a new layer file.
//! - [`RemoteTimelineClient::schedule_index_upload`] when we've updated the timeline metadata file.
//! - [`RemoteTimelineClient::schedule_index_upload_for_metadata_update`] when we've updated the timeline metadata file.
//! - [`RemoteTimelineClient::schedule_index_upload_for_file_changes`] to upload an updated index file, after we've scheduled file uploads
//! - [`RemoteTimelineClient::schedule_layer_file_deletion`] when we've deleted one or more layer files.
//!
//! Internally, these functions create [`UploadOp`]s and put them in a queue.
@@ -57,7 +58,7 @@
//! To have a consistent remote structure, it's important that uploads and
//! deletions are performed in the right order. For example, the index file
//! contains a list of layer files, so it must not be uploaded until all the
//! layer files that are in its list have been succesfully uploaded.
//! layer files that are in its list have been successfully uploaded.
//!
//! The contract between client and its user is that the user is responsible of
//! scheduling operations in an order that keeps the remote consistent as
@@ -139,7 +140,7 @@
//! Note that if we crash during file deletion between the index update
//! that removes the file from the list of files, and deleting the remote file,
//! the file is leaked in the remote storage. Similarly, if a new file is created
//! and uploaded, but the pageserver dies permantently before updating the
//! and uploaded, but the pageserver dies permanently before updating the
//! remote index file, the new file is leaked in remote storage. We accept and
//! tolerate that for now.
//! Note further that we cannot easily fix this by scheduling deletes for every
@@ -147,31 +148,43 @@
//! following two cases:
//! - (1) We had the file locally, deleted it locally, scheduled a remote delete,
//! but crashed before it finished remotely.
//! - (2) We never had the file locally because we were still in tenant attach
//! when we crashed. (Similar case for on-demand download in the future.)
//! - (2) We never had the file locally because we haven't on-demand downloaded
//! it yet.
//!
//! # Downloads (= Tenant Attach)
//! # Downloads
//!
//! In addition to the upload queue, [`RemoteTimelineClient`] has functions for
//! downloading files from the remote storage. Downloads are performed immediately,
//! independently of the uploads.
//! downloading files from the remote storage. Downloads are performed immediately
//! against the `RemoteStorage`, independently of the upload queue.
//!
//! When we attach a tenant, we perform the following steps:
//! - create `Tenant` object in `TenantState::Attaching` state
//! - List timelines that are present in remote storage, and download their remote [`IndexPart`]s
//! - For each timeline, create `Timeline` struct and a `RemoteTimelineClient`, and initialize the client's upload queue with its `IndexPart`
//! - eagerly download all the remote layers using the client's download APIs
//! - transition tenant from `TenantState::Attaching` to `TenantState::Active` state.
//! - List timelines that are present in remote storage, and for each:
//! - download their remote [`IndexPart`]s
//! - create `Timeline` struct and a `RemoteTimelineClient`
//! - initialize the client's upload queue with its `IndexPart`
//! - create [`RemoteLayer`] instances for layers that are referenced by `IndexPart`
//! but not present locally
//! - schedule uploads for layers that are only present locally.
//! - if the remote `IndexPart`'s metadata was newer than the metadata in
//! the local filesystem, write the remote metadata to the local filesystem
//! - After the above is done for each timeline, open the tenant for business by
//! transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
//! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
//!
//! Most of the above happens in [`Timeline::reconcile_with_remote`].
//! Most of the above steps happen in [`Timeline::reconcile_with_remote`] or its callers.
//! We keep track of the fact that a client is in `Attaching` state in a marker
//! file on the local disk.
//! However, the distinction is moot for storage sync since we call
//! `reconcile_with_remote` for tenants both with and without the marker file.
//!
//! In the future, downloading will be done on-demand and `reconcile_with_remote`
//! will only be responsible for re-scheduling upload ops after a crash of an
//! `Active` tenant.
//! file on the local disk. This is critical because, when we restart the pageserver,
//! we do not want to do the `List timelines` step for each tenant that has already
//! been successfully attached (for performance & cost reasons).
//! Instead, for a tenant without the attach marker file, we assume that the
//! local state is in sync or ahead of the remote state. This includes the list
//! of all of the tenant's timelines, which is particularly critical to be up-to-date:
//! if there's a timeline on the remote that the pageserver doesn't know about,
//! the GC will not consider its branch point, leading to data loss.
//! So, for a tenant with the attach marker file, we know that we do not yet have
//! persisted all the remote timeline's metadata files locally. To exclude the
//! risk above, we re-run the procedure for such tenants
//!
//! # Operating Without Remote Storage
//!
@@ -194,39 +207,51 @@ mod upload;
// re-export these
pub use download::{is_temp_download_file, list_remote_timelines};
use std::collections::{HashMap, VecDeque};
use std::fmt::Debug;
use std::ops::DerefMut;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, Mutex};
use anyhow::ensure;
use remote_storage::{DownloadError, GenericRemoteStorage};
use std::ops::DerefMut;
use tokio::runtime::Runtime;
use tracing::{error, info, warn};
use tracing::{debug, info, warn};
use tracing::{info_span, Instrument};
use utils::lsn::Lsn;
use self::index::IndexPart;
use crate::metrics::MeasureRemoteOp;
use crate::metrics::RemoteOpFileKind;
use crate::metrics::RemoteOpKind;
use crate::metrics::REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS;
use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics};
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::{
config::PageServerConf,
storage_sync::index::{LayerFileMetadata, RelativePath},
task_mgr,
task_mgr::TaskKind,
task_mgr::BACKGROUND_RUNTIME,
tenant::metadata::TimelineMetadata,
tenant::upload_queue::{
UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
},
{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS},
};
use utils::id::{TenantId, TimelineId};
use self::index::IndexPart;
use super::storage_layer::LayerFileName;
// Occasional network issues and such can cause remote operations to fail, and
// that's expected. If a download fails, we log it at info-level, and retry.
// But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
// level instead, as repeated failures can mean a more serious problem. If it
// fails more than FAILED_DOWNLOAD_RETRIES times, we give up
const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
const FAILED_DOWNLOAD_RETRIES: u32 = 10;
// Similarly log failed uploads and deletions at WARN level, after this many
// retries. Uploads and deletions are retried forever, though.
const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
/// A client for accessing a timeline's data in remote storage.
///
/// This takes care of managing the number of connections, and balancing them
@@ -256,207 +281,42 @@ pub struct RemoteTimelineClient {
upload_queue: Mutex<UploadQueue>,
metrics: Arc<RemoteTimelineClientMetrics>,
storage_impl: GenericRemoteStorage,
}
// clippy warns that Uninitialized is much smaller than Initialized, which wastes
// memory for Uninitialized variants. Doesn't matter in practice, there are not
// that many upload queues in a running pageserver, and most of them are initialized
// anyway.
#[allow(clippy::large_enum_variant)]
enum UploadQueue {
Uninitialized,
Initialized(UploadQueueInitialized),
Stopped(UploadQueueStopped),
}
impl UploadQueue {
fn as_str(&self) -> &'static str {
match self {
UploadQueue::Uninitialized => "Uninitialized",
UploadQueue::Initialized(_) => "Initialized",
UploadQueue::Stopped(_) => "Stopped",
}
}
}
/// This keeps track of queued and in-progress tasks.
struct UploadQueueInitialized {
/// Counter to assign task IDs
task_counter: u64,
/// All layer files stored in the remote storage, taking into account all
/// in-progress and queued operations
latest_files: HashMap<RelativePath, LayerFileMetadata>,
/// Metadata stored in the remote storage, taking into account all
/// in-progress and queued operations.
/// DANGER: do not return to outside world, e.g., safekeepers.
latest_metadata: TimelineMetadata,
/// `disk_consistent_lsn` from the last metadata file that was successfully
/// uploaded. `Lsn(0)` if nothing was uploaded yet.
/// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
/// Safekeeper can rely on it to make decisions for WAL storage.
last_uploaded_consistent_lsn: Lsn,
// Breakdown of different kinds of tasks currently in-progress
num_inprogress_layer_uploads: usize,
num_inprogress_metadata_uploads: usize,
num_inprogress_deletions: usize,
/// Tasks that are currently in-progress. In-progress means that a tokio Task
/// has been launched for it. An in-progress task can be busy uploading, but it can
/// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can
/// be waiting for retry in `exponential_backoff`.
inprogress_tasks: HashMap<u64, Arc<UploadTask>>,
/// Queued operations that have not been launched yet. They might depend on previous
/// tasks to finish. For example, metadata upload cannot be performed before all
/// preceding layer file uploads have completed.
queued_operations: VecDeque<UploadOp>,
}
struct UploadQueueStopped {
last_uploaded_consistent_lsn: Lsn,
}
impl UploadQueue {
fn initialize_empty_remote(
&mut self,
metadata: &TimelineMetadata,
) -> anyhow::Result<&mut UploadQueueInitialized> {
match self {
UploadQueue::Uninitialized => (),
UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
anyhow::bail!("already initialized, state {}", self.as_str())
}
}
info!("initializing upload queue for empty remote");
let state = UploadQueueInitialized {
// As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
latest_files: Default::default(),
latest_metadata: metadata.clone(),
// We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
// safekeepers from garbage-collecting anything.
last_uploaded_consistent_lsn: Lsn(0),
// what follows are boring default initializations
task_counter: Default::default(),
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
inprogress_tasks: Default::default(),
queued_operations: Default::default(),
};
*self = UploadQueue::Initialized(state);
Ok(self.initialized_mut().expect("we just set it"))
}
fn initialize_with_current_remote_index_part(
&mut self,
index_part: &IndexPart,
) -> anyhow::Result<&mut UploadQueueInitialized> {
match self {
UploadQueue::Uninitialized => (),
UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
anyhow::bail!("already initialized, state {}", self.as_str())
}
}
let mut files = HashMap::new();
for path in &index_part.timeline_layers {
let layer_metadata = index_part
.layer_metadata
.get(path)
.map(LayerFileMetadata::from)
.unwrap_or(LayerFileMetadata::MISSING);
files.insert(path.clone(), layer_metadata);
}
let index_part_metadata = index_part.parse_metadata()?;
info!(
"initializing upload queue with remote index_part.disk_consistent_lsn: {}",
index_part_metadata.disk_consistent_lsn()
);
let state = UploadQueueInitialized {
latest_files: files,
latest_metadata: index_part_metadata.clone(),
last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
// what follows are boring default initializations
task_counter: 0,
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
inprogress_tasks: Default::default(),
queued_operations: Default::default(),
};
*self = UploadQueue::Initialized(state);
Ok(self.initialized_mut().expect("we just set it"))
}
fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
match self {
UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
anyhow::bail!("queue is in state {}", self.as_str())
}
UploadQueue::Initialized(x) => Ok(x),
}
}
}
/// An in-progress upload or delete task.
#[derive(Debug)]
struct UploadTask {
/// Unique ID of this task. Used as the key in `inprogress_tasks` above.
task_id: u64,
retries: AtomicU32,
op: UploadOp,
}
#[derive(Debug)]
enum UploadOp {
/// Upload a layer file
UploadLayer(PathBuf, LayerFileMetadata),
/// Upload the metadata file
UploadMetadata(IndexPart, Lsn),
/// Delete a file.
Delete(RemoteOpFileKind, PathBuf),
/// Barrier. When the barrier operation is reached,
Barrier(tokio::sync::watch::Sender<()>),
}
impl std::fmt::Display for UploadOp {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
UploadOp::UploadLayer(path, metadata) => write!(
f,
"UploadLayer({}, size={:?})",
path.display(),
metadata.file_size()
),
UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
UploadOp::Delete(_, path) => write!(f, "Delete({})", path.display()),
UploadOp::Barrier(_) => write!(f, "Barrier"),
}
}
}
impl RemoteTimelineClient {
///
/// Create a remote storage client for given timeline
///
/// Note: the caller must initialize the upload queue before any uploads can be scheduled,
/// by calling init_upload_queue.
///
pub fn new(
remote_storage: GenericRemoteStorage,
conf: &'static PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> anyhow::Result<RemoteTimelineClient> {
Ok(RemoteTimelineClient {
conf,
runtime: &BACKGROUND_RUNTIME,
tenant_id,
timeline_id,
storage_impl: remote_storage,
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
})
}
/// Initialize the upload queue for a remote storage that already received
/// an index file upload, i.e., it's not empty.
/// The given `index_part` must be the one on the remote.
pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_with_current_remote_index_part(index_part)?;
self.update_remote_physical_size_gauge(Some(index_part));
Ok(())
}
@@ -468,6 +328,7 @@ impl RemoteTimelineClient {
) -> anyhow::Result<()> {
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_empty_remote(local_metadata)?;
self.update_remote_physical_size_gauge(None);
Ok(())
}
@@ -479,6 +340,24 @@ impl RemoteTimelineClient {
}
}
fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
current_remote_index_part
.layer_metadata
.values()
// If we don't have the file size for the layer, don't account for it in the metric.
.map(|ilmd| ilmd.file_size.unwrap_or(0))
.sum()
} else {
0
};
self.metrics.remote_physical_size_gauge().set(size);
}
pub fn get_remote_physical_size(&self) -> u64 {
self.metrics.remote_physical_size_gauge().get()
}
//
// Download operations.
//
@@ -499,6 +378,7 @@ impl RemoteTimelineClient {
self.timeline_id,
RemoteOpFileKind::Index,
RemoteOpKind::Download,
Arc::clone(&self.metrics),
)
.await
}
@@ -510,7 +390,7 @@ impl RemoteTimelineClient {
/// On success, returns the size of the downloaded file.
pub async fn download_layer_file(
&self,
path: &RelativePath,
layer_file_name: &LayerFileName,
layer_metadata: &LayerFileMetadata,
) -> anyhow::Result<u64> {
let downloaded_size = download::download_layer_file(
@@ -518,7 +398,7 @@ impl RemoteTimelineClient {
&self.storage_impl,
self.tenant_id,
self.timeline_id,
path,
layer_file_name,
layer_metadata,
)
.measure_remote_op(
@@ -526,6 +406,7 @@ impl RemoteTimelineClient {
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Download,
Arc::clone(&self.metrics),
)
.await?;
@@ -536,13 +417,23 @@ impl RemoteTimelineClient {
let new_metadata = LayerFileMetadata::new(downloaded_size);
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
if let Some(upgraded) = upload_queue.latest_files.get_mut(path) {
upgraded.merge(&new_metadata);
if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) {
if upgraded.merge(&new_metadata) {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
}
// If we don't do an index file upload inbetween here and restart,
// the value will go back down after pageserver restart, since we will
// have lost this data point.
// But, we upload index part fairly frequently, and restart pageserver rarely.
// So, by accounting eagerly, we present a most-of-the-time-more-accurate value sooner.
self.metrics
.remote_physical_size_gauge()
.add(downloaded_size);
} else {
// The file should exist, since we just downloaded it.
warn!(
"downloaded file {:?} not found in local copy of the index file",
path
layer_file_name
);
}
}
@@ -554,14 +445,20 @@ impl RemoteTimelineClient {
//
///
/// Launch an index-file upload operation in the background.
/// Launch an index-file upload operation in the background, with
/// updated metadata.
///
/// The upload will be added to the queue immediately, but it
/// won't be performed until all previosuly scheduled layer file
/// upload operations have completed successfully. This is to
/// ensure that when the index file claims that layers X, Y and Z
/// exist in remote storage, they really do.
pub fn schedule_index_upload(
/// exist in remote storage, they really do. To wait for the upload
/// to complete, use `wait_completion`.
///
/// If there were any changes to the list of files, i.e. if any
/// layer file uploads were scheduled, since the last index file
/// upload, those will be included too.
pub fn schedule_index_upload_for_metadata_update(
self: &Arc<Self>,
metadata: &TimelineMetadata,
) -> anyhow::Result<()> {
@@ -572,26 +469,60 @@ impl RemoteTimelineClient {
// ahead of what's _actually_ on the remote during index upload.
upload_queue.latest_metadata = metadata.clone();
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
self.schedule_index_upload(upload_queue, metadata_bytes);
Ok(())
}
///
/// Launch an index-file upload operation in the background, if necessary.
///
/// Use this function to schedule the update of the index file after
/// scheduling file uploads or deletions. If no file uploads or deletions
/// have been scheduled since the last index file upload, this does
/// nothing.
///
/// Like schedule_index_upload_for_metadata_update(), this merely adds
/// the upload to the upload queue and returns quickly.
pub fn schedule_index_upload_for_file_changes(self: &Arc<Self>) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
self.schedule_index_upload(upload_queue, metadata_bytes);
}
Ok(())
}
/// Launch an index-file upload operation in the background (internal function)
fn schedule_index_upload(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
metadata_bytes: Vec<u8>,
) {
info!(
"scheduling metadata upload with {} files ({} changed)",
upload_queue.latest_files.len(),
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
);
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
let index_part = IndexPart::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
upload_queue.latest_metadata.to_bytes()?,
metadata_bytes,
);
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
self.update_upload_queue_unfinished_metric(1, &op);
upload_queue.queued_operations.push_back(op);
info!(
"scheduled metadata upload with {} files",
upload_queue.latest_files.len()
);
upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
// Launch the task immediately, if possible
self.launch_queued_tasks(upload_queue);
Ok(())
}
///
@@ -599,7 +530,7 @@ impl RemoteTimelineClient {
///
pub fn schedule_layer_file_upload(
self: &Arc<Self>,
path: &Path,
layer_file_name: &LayerFileName,
layer_metadata: &LayerFileMetadata,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
@@ -612,20 +543,19 @@ impl RemoteTimelineClient {
"file size not initialized in metadata"
);
let relative_path = RelativePath::from_local_path(
&self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
path,
)?;
upload_queue
.latest_files
.insert(relative_path, layer_metadata.clone());
.insert(layer_file_name.clone(), layer_metadata.clone());
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
let op = UploadOp::UploadLayer(PathBuf::from(path), layer_metadata.clone());
let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
self.update_upload_queue_unfinished_metric(1, &op);
upload_queue.queued_operations.push_back(op);
info!("scheduled layer file upload {}", path.display());
info!(
"scheduled layer file upload {}",
layer_file_name.file_name()
);
// Launch the task immediately, if possible
self.launch_queued_tasks(upload_queue);
@@ -635,25 +565,21 @@ impl RemoteTimelineClient {
///
/// Launch a delete operation in the background.
///
/// The deletion won't actually be performed, until all preceding
/// upload operations have completed succesfully.
pub fn schedule_layer_file_deletion(self: &Arc<Self>, paths: &[PathBuf]) -> anyhow::Result<()> {
/// Note: This schedules an index file upload before the deletions. The
/// deletion won't actually be performed, until any previously scheduled
/// upload operations, and the index file upload, have completed
/// succesfully.
///
pub fn schedule_layer_file_deletion(
self: &Arc<Self>,
names: &[LayerFileName],
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
// Convert the paths into RelativePaths, and gather other information we need.
let mut relative_paths = Vec::with_capacity(paths.len());
for path in paths {
relative_paths.push(RelativePath::from_local_path(
&self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
path,
)?);
}
// Deleting layers doesn't affect the values stored in TimelineMetadata,
// so we don't need update it. Just serialize it.
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
// Update the remote index file, removing the to-be-deleted files from the index,
// before deleting the actual files.
@@ -663,25 +589,21 @@ impl RemoteTimelineClient {
// from latest_files, but not yet scheduled for deletion. Use a closure
// to syntactically forbid ? or bail! calls here.
let no_bail_here = || {
for relative_path in relative_paths {
upload_queue.latest_files.remove(&relative_path);
for name in names {
upload_queue.latest_files.remove(name);
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
}
let index_part = IndexPart::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
metadata_bytes,
);
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
self.update_upload_queue_unfinished_metric(1, &op);
upload_queue.queued_operations.push_back(op);
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue, metadata_bytes);
}
// schedule the actual deletions
for path in paths {
let op = UploadOp::Delete(RemoteOpFileKind::Layer, PathBuf::from(path));
for name in names {
let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone());
self.update_upload_queue_unfinished_metric(1, &op);
upload_queue.queued_operations.push_back(op);
info!("scheduled layer file deletion {}", path.display());
info!("scheduled layer file deletion {}", name.file_name());
}
// Launch the tasks immediately, if possible
@@ -753,7 +675,7 @@ impl RemoteTimelineClient {
// We can launch this task. Remove it from the queue first.
let next_op = upload_queue.queued_operations.pop_front().unwrap();
info!("starting op: {}", next_op);
debug!("starting op: {}", next_op);
// Update the counters
match next_op {
@@ -837,18 +759,28 @@ impl RemoteTimelineClient {
}
let upload_result: anyhow::Result<()> = match &task.op {
UploadOp::UploadLayer(ref path, ref layer_metadata) => {
upload::upload_timeline_layer(&self.storage_impl, path, layer_metadata)
.measure_remote_op(
self.tenant_id,
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Upload,
)
.await
UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
let path = &self
.conf
.timeline_path(&self.timeline_id, &self.tenant_id)
.join(layer_file_name.file_name());
upload::upload_timeline_layer(
self.conf,
&self.storage_impl,
path,
layer_metadata,
)
.measure_remote_op(
self.tenant_id,
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Upload,
Arc::clone(&self.metrics),
)
.await
}
UploadOp::UploadMetadata(ref index_part, _lsn) => {
upload::upload_index_part(
let res = upload::upload_index_part(
self.conf,
&self.storage_impl,
self.tenant_id,
@@ -860,16 +792,26 @@ impl RemoteTimelineClient {
self.timeline_id,
RemoteOpFileKind::Index,
RemoteOpKind::Upload,
Arc::clone(&self.metrics),
)
.await
.await;
if res.is_ok() {
self.update_remote_physical_size_gauge(Some(index_part));
}
res
}
UploadOp::Delete(metric_file_kind, ref path) => {
delete::delete_layer(&self.storage_impl, path)
UploadOp::Delete(metric_file_kind, ref layer_file_name) => {
let path = &self
.conf
.timeline_path(&self.timeline_id, &self.tenant_id)
.join(layer_file_name.file_name());
delete::delete_layer(self.conf, &self.storage_impl, path)
.measure_remote_op(
self.tenant_id,
self.timeline_id,
*metric_file_kind,
RemoteOpKind::Delete,
Arc::clone(&self.metrics),
)
.await
}
@@ -888,10 +830,22 @@ impl RemoteTimelineClient {
Err(e) => {
let retries = task.retries.fetch_add(1, Ordering::SeqCst);
error!(
"failed to perform remote task {}, will retry (attempt {}): {:?}",
task.op, retries, e
);
// Uploads can fail due to rate limits (IAM, S3), spurious network problems,
// or other external reasons. Such issues are relatively regular, so log them
// at info level at first, and only WARN if the operation fails repeatedly.
//
// (See similar logic for downloads in `download::download_retry`)
if retries < FAILED_UPLOAD_WARN_THRESHOLD {
info!(
"failed to perform remote task {}, will retry (attempt {}): {:#}",
task.op, retries, e
);
} else {
warn!(
"failed to perform remote task {}, will retry (attempt {}): {:?}",
task.op, retries, e
);
}
// sleep until it's time to retry, or we're cancelled
tokio::select! {
@@ -913,7 +867,7 @@ impl RemoteTimelineClient {
task.op, retries
);
} else {
info!("remote task {} completed successfully", task.op);
debug!("remote task {} completed successfully", task.op);
}
// The task has completed succesfully. Remove it from the in-progress list.
@@ -960,14 +914,8 @@ impl RemoteTimelineClient {
return;
}
};
REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
file_kind.as_str(),
op_kind.as_str(),
])
.unwrap()
self.metrics
.unfinished_tasks(&file_kind, &op_kind)
.add(delta)
}
@@ -1032,34 +980,12 @@ impl RemoteTimelineClient {
}
}
///
/// Create a remote storage client for given timeline
///
/// Note: the caller must initialize the upload queue before any uploads can be scheduled,
/// by calling init_upload_queue.
///
pub fn create_remote_timeline_client(
remote_storage: GenericRemoteStorage,
conf: &'static PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> anyhow::Result<RemoteTimelineClient> {
Ok(RemoteTimelineClient {
conf,
runtime: &BACKGROUND_RUNTIME,
tenant_id,
timeline_id,
storage_impl: remote_storage,
upload_queue: Mutex::new(UploadQueue::Uninitialized),
})
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
use std::collections::HashSet;
use std::{collections::HashSet, path::Path};
use utils::lsn::Lsn;
pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
@@ -1083,15 +1009,11 @@ mod tests {
TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap()
}
fn assert_file_list(a: &HashSet<RelativePath>, b: &[&str]) {
let xx = PathBuf::from("");
let mut avec: Vec<String> = a
.iter()
.map(|x| x.to_local_path(&xx).to_string_lossy().into())
.collect();
fn assert_file_list(a: &HashSet<LayerFileName>, b: &[&str]) {
let mut avec: Vec<String> = a.iter().map(|x| x.file_name()).collect();
avec.sort();
let mut bvec = b.to_owned();
let mut bvec = b.to_vec();
bvec.sort_unstable();
assert_eq!(avec, bvec);
@@ -1159,8 +1081,7 @@ mod tests {
println!("workdir: {}", harness.conf.workdir.display());
let storage_impl =
GenericRemoteStorage::from_config(harness.conf.workdir.clone(), &storage_config)?;
let storage_impl = GenericRemoteStorage::from_config(&storage_config)?;
let client = Arc::new(RemoteTimelineClient {
conf: harness.conf,
runtime,
@@ -1168,6 +1089,10 @@ mod tests {
timeline_id: TIMELINE_ID,
storage_impl,
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(
&harness.tenant_id,
&TIMELINE_ID,
)),
});
let remote_timeline_dir =
@@ -1184,11 +1109,11 @@ mod tests {
std::fs::write(timeline_path.join("bar"), &content_bar)?;
client.schedule_layer_file_upload(
&timeline_path.join("foo"),
&LayerFileName::Test("foo".to_owned()),
&LayerFileMetadata::new(content_foo.len() as u64),
)?;
client.schedule_layer_file_upload(
&timeline_path.join("bar"),
&LayerFileName::Test("bar".to_owned()),
&LayerFileMetadata::new(content_bar.len() as u64),
)?;
@@ -1199,15 +1124,19 @@ mod tests {
assert!(upload_queue.queued_operations.is_empty());
assert!(upload_queue.inprogress_tasks.len() == 2);
assert!(upload_queue.num_inprogress_layer_uploads == 2);
// also check that `latest_file_changes` was updated
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2);
}
// Schedule upload of index. Check that it is queued
let metadata = dummy_metadata(Lsn(0x20));
client.schedule_index_upload(&metadata)?;
client.schedule_index_upload_for_metadata_update(&metadata)?;
{
let mut guard = client.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut().unwrap();
assert!(upload_queue.queued_operations.len() == 1);
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
}
// Wait for the uploads to finish
@@ -1230,10 +1159,10 @@ mod tests {
let content_baz = dummy_contents("baz");
std::fs::write(timeline_path.join("baz"), &content_baz)?;
client.schedule_layer_file_upload(
&timeline_path.join("baz"),
&LayerFileName::Test("baz".to_owned()),
&LayerFileMetadata::new(content_baz.len() as u64),
)?;
client.schedule_layer_file_deletion(&[timeline_path.join("foo")])?;
client.schedule_layer_file_deletion(&[LayerFileName::Test("foo".to_owned())])?;
{
let mut guard = client.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut().unwrap();
@@ -1243,6 +1172,7 @@ mod tests {
assert!(upload_queue.inprogress_tasks.len() == 1);
assert!(upload_queue.num_inprogress_layer_uploads == 1);
assert!(upload_queue.num_inprogress_deletions == 0);
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
}
assert_remote_files(&["foo", "bar", "index_part.json"], &remote_timeline_dir);

View File

@@ -0,0 +1,28 @@
//! Helper functions to delete files from remote storage with a RemoteStorage
use anyhow::Context;
use std::path::Path;
use tracing::debug;
use remote_storage::GenericRemoteStorage;
use crate::config::PageServerConf;
pub(super) async fn delete_layer<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
local_layer_path: &'a Path,
) -> anyhow::Result<()> {
fail::fail_point!("before-delete-layer", |_| {
anyhow::bail!("failpoint before-delete-layer")
});
debug!("Deleting layer from remote storage: {local_layer_path:?}",);
let path_to_delete = conf.remote_path(local_layer_path)?;
// XXX: If the deletion fails because the object already didn't exist,
// it would be good to just issue a warning but consider it success.
// https://github.com/neondatabase/neon/issues/2934
storage.delete(&path_to_delete).await.with_context(|| {
format!("Failed to delete remote layer from storage at {path_to_delete:?}")
})
}

View File

@@ -0,0 +1,331 @@
//! Helper functions to download files from remote storage with a RemoteStorage
//!
//! The functions in this module retry failed operations automatically, according
//! to the FAILED_DOWNLOAD_RETRIES constant.
use std::collections::HashSet;
use std::future::Future;
use std::path::Path;
use anyhow::{anyhow, Context};
use futures::stream::{FuturesUnordered, StreamExt};
use tokio::fs;
use tokio::io::AsyncWriteExt;
use tracing::{debug, error, info, info_span, warn, Instrument};
use crate::config::PageServerConf;
use crate::tenant::storage_layer::LayerFileName;
use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
use remote_storage::{DownloadError, GenericRemoteStorage};
use utils::crashsafe::path_with_suffix_extension;
use utils::id::{TenantId, TimelineId};
use super::index::{IndexPart, IndexPartUnclean, LayerFileMetadata};
use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
fs::File::open(path).await?.sync_all().await
}
///
/// If 'metadata' is given, we will validate that the downloaded file's size matches that
/// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
///
/// Returns the size of the downloaded file.
pub async fn download_layer_file<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
tenant_id: TenantId,
timeline_id: TimelineId,
layer_file_name: &'a LayerFileName,
layer_metadata: &'a LayerFileMetadata,
) -> Result<u64, DownloadError> {
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
let local_path = timeline_path.join(layer_file_name.file_name());
let remote_path = conf
.remote_path(&local_path)
.map_err(DownloadError::Other)?;
// Perform a rename inspired by durable_rename from file_utils.c.
// The sequence:
// write(tmp)
// fsync(tmp)
// rename(tmp, new)
// fsync(new)
// fsync(parent)
// For more context about durable_rename check this email from postgres mailing list:
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
let (mut destination_file, bytes_amount) = download_retry(
|| async {
// TODO: this doesn't use the cached fd for some reason?
let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
format!(
"Failed to create a destination file for layer '{}'",
temp_file_path.display()
)
})
.map_err(DownloadError::Other)?;
let mut download = storage.download(&remote_path).await.with_context(|| {
format!(
"Failed to open a download stream for layer with remote storage path '{remote_path:?}'"
)
})
.map_err(DownloadError::Other)?;
let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
})
.map_err(DownloadError::Other)?;
Ok((destination_file, bytes_amount))
},
&format!("download {remote_path:?}"),
).await?;
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
// A file will not be closed immediately when it goes out of scope if there are any IO operations
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
// you should call flush before dropping it.
//
// From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
// we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
// But for additional safety lets check/wait for any pending operations.
destination_file
.flush()
.await
.with_context(|| {
format!(
"failed to flush source file at {}",
temp_file_path.display()
)
})
.map_err(DownloadError::Other)?;
match layer_metadata.file_size() {
Some(expected) if expected != bytes_amount => {
return Err(DownloadError::Other(anyhow!(
"According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
temp_file_path.display()
)));
}
Some(_) | None => {
// matches, or upgrading from an earlier IndexPart version
}
}
// not using sync_data because it can lose file size update
destination_file
.sync_all()
.await
.with_context(|| {
format!(
"failed to fsync source file at {}",
temp_file_path.display()
)
})
.map_err(DownloadError::Other)?;
drop(destination_file);
fail::fail_point!("remote-storage-download-pre-rename", |_| {
Err(DownloadError::Other(anyhow!(
"remote-storage-download-pre-rename failpoint triggered"
)))
});
fs::rename(&temp_file_path, &local_path)
.await
.with_context(|| {
format!(
"Could not rename download layer file to {}",
local_path.display(),
)
})
.map_err(DownloadError::Other)?;
fsync_path(&local_path)
.await
.with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
.map_err(DownloadError::Other)?;
tracing::info!("download complete: {}", local_path.display());
Ok(bytes_amount)
}
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
pub fn is_temp_download_file(path: &Path) -> bool {
let extension = path.extension().map(|pname| {
pname
.to_str()
.expect("paths passed to this function must be valid Rust strings")
});
match extension {
Some(TEMP_DOWNLOAD_EXTENSION) => true,
Some(_) => false,
None => false,
}
}
/// List timelines of given tenant in remote storage
pub async fn list_remote_timelines<'a>(
storage: &'a GenericRemoteStorage,
conf: &'static PageServerConf,
tenant_id: TenantId,
) -> anyhow::Result<Vec<(TimelineId, IndexPart)>> {
let tenant_path = conf.timelines_path(&tenant_id);
let tenant_storage_path = conf.remote_path(&tenant_path)?;
fail::fail_point!("storage-sync-list-remote-timelines", |_| {
anyhow::bail!("storage-sync-list-remote-timelines");
});
let timelines = download_retry(
|| storage.list_prefixes(Some(&tenant_storage_path)),
&format!("list prefixes for {tenant_path:?}"),
)
.await?;
if timelines.is_empty() {
anyhow::bail!("no timelines found on the remote storage")
}
let mut timeline_ids = HashSet::new();
let mut part_downloads = FuturesUnordered::new();
for timeline_remote_storage_key in timelines {
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
})?;
let timeline_id: TimelineId = object_name.parse().with_context(|| {
format!("failed to parse object name into timeline id '{object_name}'")
})?;
// list_prefixes returns all files with the prefix. If we haven't seen this timeline ID
// yet, launch a download task for it.
if !timeline_ids.contains(&timeline_id) {
timeline_ids.insert(timeline_id);
let storage_clone = storage.clone();
part_downloads.push(async move {
(
timeline_id,
download_index_part(conf, &storage_clone, tenant_id, timeline_id)
.instrument(info_span!("download_index_part", timeline=%timeline_id))
.await,
)
});
}
}
// Wait for all the download tasks to complete.
let mut timeline_parts = Vec::new();
while let Some((timeline_id, part_upload_result)) = part_downloads.next().await {
let index_part = part_upload_result
.with_context(|| format!("Failed to fetch index part for timeline {timeline_id}"))?;
debug!("Successfully fetched index part for timeline {timeline_id}");
timeline_parts.push((timeline_id, index_part));
}
Ok(timeline_parts)
}
pub async fn download_index_part(
conf: &'static PageServerConf,
storage: &GenericRemoteStorage,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<IndexPart, DownloadError> {
let index_part_path = conf
.metadata_path(timeline_id, tenant_id)
.with_file_name(IndexPart::FILE_NAME);
let part_storage_path = conf
.remote_path(&index_part_path)
.map_err(DownloadError::BadInput)?;
let index_part_bytes = download_retry(
|| async {
let mut index_part_download = storage.download(&part_storage_path).await?;
let mut index_part_bytes = Vec::new();
tokio::io::copy(
&mut index_part_download.download_stream,
&mut index_part_bytes,
)
.await
.with_context(|| {
format!("Failed to download an index part into file {index_part_path:?}")
})
.map_err(DownloadError::Other)?;
Ok(index_part_bytes)
},
&format!("download {part_storage_path:?}"),
)
.await?;
let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes)
.with_context(|| {
format!("Failed to deserialize index part file into file {index_part_path:?}")
})
.map_err(DownloadError::Other)?;
let index_part = index_part.remove_unclean_layer_file_names();
Ok(index_part)
}
///
/// Helper function to handle retries for a download operation.
///
/// Remote operations can fail due to rate limits (IAM, S3), spurious network
/// problems, or other external reasons. Retry FAILED_DOWNLOAD_RETRIES times,
/// with backoff.
///
/// (See similar logic for uploads in `perform_upload_task`)
async fn download_retry<T, O, F>(mut op: O, description: &str) -> Result<T, DownloadError>
where
O: FnMut() -> F,
F: Future<Output = Result<T, DownloadError>>,
{
let mut attempts = 0;
loop {
let result = op().await;
match result {
Ok(_) => {
if attempts > 0 {
info!("{description} succeeded after {attempts} retries");
}
return result;
}
// These are "permanent" errors that should not be retried.
Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => {
return result;
}
// Assume that any other failure might be transient, and the operation might
// succeed if we just keep trying.
Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => {
info!("{description} failed, will retry (attempt {attempts}): {err:#}");
}
Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => {
warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
}
Err(DownloadError::Other(ref err)) => {
// Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
error!("{description} still failed after {attempts} retries, giving up: {err:?}");
return result;
}
}
// sleep and retry
exponential_backoff(
attempts,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
)
.await;
attempts += 1;
}
}

View File

@@ -2,46 +2,16 @@
//! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about
//! remote timeline layers and its metadata.
use std::{
collections::{HashMap, HashSet},
path::{Path, PathBuf},
};
use std::collections::{HashMap, HashSet};
use anyhow::{Context, Ok};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use tracing::warn;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::{metadata::TimelineMetadata, storage_layer::LayerFileName};
use utils::lsn::Lsn;
/// A part of the filesystem path, that needs a root to become a path again.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct RelativePath(String);
impl RelativePath {
/// Attempts to strip off the base from path, producing a relative path or an error.
pub fn from_local_path(timeline_path: &Path, path: &Path) -> anyhow::Result<RelativePath> {
let relative = path.strip_prefix(timeline_path).with_context(|| {
format!(
"path '{}' is not relative to base '{}'",
path.display(),
timeline_path.display()
)
})?;
Ok(Self::from_filename(relative))
}
pub fn from_filename(path: &Path) -> RelativePath {
RelativePath(path.to_string_lossy().to_string())
}
pub fn to_local_path(&self, timeline_path: &Path) -> PathBuf {
timeline_path.join(&self.0)
}
}
/// Metadata gathered for each of the layer files.
///
/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
@@ -78,9 +48,17 @@ impl LayerFileMetadata {
/// Metadata has holes due to version upgrades. This method is called to upgrade self with the
/// other value.
///
/// This is called on the possibly outdated version.
pub fn merge(&mut self, other: &Self) {
self.file_size = other.file_size.or(self.file_size);
/// This is called on the possibly outdated version. Returns true if any changes
/// were made.
pub fn merge(&mut self, other: &Self) -> bool {
let mut changed = false;
if self.file_size != other.file_size {
self.file_size = other.file_size.or(self.file_size);
changed = true;
}
changed
}
}
@@ -92,26 +70,25 @@ impl LayerFileMetadata {
/// remember to add a test case for the changed version.
#[serde_as]
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
pub struct IndexPart {
pub struct IndexPartImpl<L>
where
L: std::hash::Hash + PartialEq + Eq,
{
/// Debugging aid describing the version of this type.
#[serde(default)]
version: usize,
/// Each of the layers present on remote storage.
/// Layer names, which are stored on the remote storage.
///
/// Additional metadata can might exist in `layer_metadata`.
pub timeline_layers: HashSet<RelativePath>,
pub timeline_layers: HashSet<L>,
/// FIXME: unused field. This should be removed, but that changes the on-disk format,
/// so we need to make sure we're backwards- (and maybe forwards-) compatible
missing_layers: HashSet<RelativePath>,
/// Per layer file metadata, which can be present for a present or missing layer file.
/// Per layer file name metadata, which can be present for a present or missing layer file.
///
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
/// that latest version stores.
#[serde(default)]
pub layer_metadata: HashMap<RelativePath, IndexLayerMetadata>,
#[serde(default = "HashMap::default")]
pub layer_metadata: HashMap<L, IndexLayerMetadata>,
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
// It's duplicated here for convenience.
@@ -120,6 +97,101 @@ pub struct IndexPart {
metadata_bytes: Vec<u8>,
}
// TODO seems like another part of the remote storage file format
// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
pub type IndexPart = IndexPartImpl<LayerFileName>;
pub type IndexPartUnclean = IndexPartImpl<UncleanLayerFileName>;
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
pub enum UncleanLayerFileName {
Clean(LayerFileName),
BackupFile(String),
}
impl<'de> serde::Deserialize<'de> for UncleanLayerFileName {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
deserializer.deserialize_string(UncleanLayerFileNameVisitor)
}
}
struct UncleanLayerFileNameVisitor;
impl<'de> serde::de::Visitor<'de> for UncleanLayerFileNameVisitor {
type Value = UncleanLayerFileName;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(
formatter,
"a string that is a valid LayerFileName or '.old' backup file name"
)
}
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
let maybe_clean: Result<LayerFileName, _> = v.parse();
match maybe_clean {
Ok(clean) => Ok(UncleanLayerFileName::Clean(clean)),
Err(e) => {
if v.ends_with(".old") || v == "metadata_backup" {
Ok(UncleanLayerFileName::BackupFile(v.to_owned()))
} else {
Err(E::custom(e))
}
}
}
}
}
impl UncleanLayerFileName {
fn into_clean(self) -> Option<LayerFileName> {
match self {
UncleanLayerFileName::Clean(clean) => Some(clean),
UncleanLayerFileName::BackupFile(_) => None,
}
}
}
impl IndexPartUnclean {
pub fn remove_unclean_layer_file_names(self) -> IndexPart {
let IndexPartUnclean {
version,
timeline_layers,
layer_metadata,
disk_consistent_lsn,
metadata_bytes,
} = self;
IndexPart {
version,
timeline_layers: timeline_layers
.into_iter()
.filter_map(|unclean_file_name| match unclean_file_name {
UncleanLayerFileName::Clean(clean_name) => Some(clean_name),
UncleanLayerFileName::BackupFile(backup_file_name) => {
// For details see https://github.com/neondatabase/neon/issues/3024
warn!(
"got backup file on the remote storage, ignoring it {backup_file_name}"
);
None
}
})
.collect(),
layer_metadata: layer_metadata
.into_iter()
.filter_map(|(l, m)| l.into_clean().map(|l| (l, m)))
.collect(),
disk_consistent_lsn,
metadata_bytes,
}
}
}
impl IndexPart {
/// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
/// used to understand later versions.
@@ -129,23 +201,22 @@ impl IndexPart {
pub const FILE_NAME: &'static str = "index_part.json";
pub fn new(
layers_and_metadata: HashMap<RelativePath, LayerFileMetadata>,
layers_and_metadata: HashMap<LayerFileName, LayerFileMetadata>,
disk_consistent_lsn: Lsn,
metadata_bytes: Vec<u8>,
) -> Self {
let mut timeline_layers = HashSet::new();
let mut layer_metadata = HashMap::new();
let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len());
let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len());
separate_paths_and_metadata(
&layers_and_metadata,
&mut timeline_layers,
&mut layer_metadata,
);
for (remote_name, metadata) in &layers_and_metadata {
timeline_layers.insert(remote_name.to_owned());
let metadata = IndexLayerMetadata::from(metadata);
layer_metadata.insert(remote_name.to_owned(), metadata);
}
Self {
version: Self::LATEST_VERSION,
timeline_layers,
missing_layers: HashSet::new(),
layer_metadata,
disk_consistent_lsn,
metadata_bytes,
@@ -160,7 +231,7 @@ impl IndexPart {
/// Serialized form of [`LayerFileMetadata`].
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
pub struct IndexLayerMetadata {
file_size: Option<u64>,
pub(super) file_size: Option<u64>,
}
impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
@@ -171,18 +242,6 @@ impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
}
}
fn separate_paths_and_metadata(
input: &HashMap<RelativePath, LayerFileMetadata>,
output: &mut HashSet<RelativePath>,
layer_metadata: &mut HashMap<RelativePath, IndexLayerMetadata>,
) {
for (path, metadata) in input {
let metadata = IndexLayerMetadata::from(metadata);
layer_metadata.insert(path.clone(), metadata);
output.insert(path.clone());
}
}
#[cfg(test)]
mod tests {
use super::*;
@@ -191,21 +250,20 @@ mod tests {
fn v0_indexpart_is_parsed() {
let example = r#"{
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
"missing_layers":["not_a_real_layer_but_adding_coverage"],
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
}"#;
let expected = IndexPart {
version: 0,
timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(),
missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(),
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
layer_metadata: HashMap::default(),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
};
let part = serde_json::from_str::<IndexPart>(example).unwrap();
let part: IndexPartUnclean = serde_json::from_str(example).unwrap();
let part = part.remove_unclean_layer_file_names();
assert_eq!(part, expected);
}
@@ -214,10 +272,9 @@ mod tests {
let example = r#"{
"version":1,
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
"missing_layers":["not_a_real_layer_but_adding_coverage"],
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
"LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
@@ -226,13 +283,12 @@ mod tests {
let expected = IndexPart {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 1,
timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(),
missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(),
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
layer_metadata: HashMap::from([
(RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned()), IndexLayerMetadata {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: Some(25600000),
}),
(RelativePath("not_a_real_layer_but_adding_coverage".to_owned()), IndexLayerMetadata {
(LayerFileName::new_test("not_a_real_layer_but_adding_coverage"), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: Some(9007199254741001),
@@ -242,7 +298,46 @@ mod tests {
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
};
let part = serde_json::from_str::<IndexPart>(example).unwrap();
let part = serde_json::from_str::<IndexPartUnclean>(example)
.unwrap()
.remove_unclean_layer_file_names();
assert_eq!(part, expected);
}
#[test]
fn v1_indexpart_is_parsed_with_optional_missing_layers() {
let example = r#"{
"version":1,
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
"missing_layers":["This shouldn't fail deserialization"],
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
}"#;
let expected = IndexPart {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 1,
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: Some(25600000),
}),
(LayerFileName::new_test("not_a_real_layer_but_adding_coverage"), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: Some(9007199254741001),
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
};
let part = serde_json::from_str::<IndexPartUnclean>(example).unwrap();
let part = part.remove_unclean_layer_file_names();
assert_eq!(part, expected);
}
}

View File

@@ -5,12 +5,12 @@ use fail::fail_point;
use std::path::Path;
use tokio::fs;
use super::index::IndexPart;
use crate::config::PageServerConf;
use crate::storage_sync::LayerFileMetadata;
use crate::{config::PageServerConf, tenant::remote_timeline_client::index::IndexPart};
use remote_storage::GenericRemoteStorage;
use utils::id::{TenantId, TimelineId};
use super::index::LayerFileMetadata;
/// Serializes and uploads the given index part data to the remote storage.
pub(super) async fn upload_index_part<'a>(
conf: &'static PageServerConf,
@@ -30,12 +30,9 @@ pub(super) async fn upload_index_part<'a>(
let index_part_path = conf
.metadata_path(timeline_id, tenant_id)
.with_file_name(IndexPart::FILE_NAME);
let storage_path = conf.remote_path(&index_part_path)?;
storage
.upload_storage_object(
Box::new(index_part_bytes),
index_part_size,
&index_part_path,
)
.upload_storage_object(Box::new(index_part_bytes), index_part_size, &storage_path)
.await
.with_context(|| format!("Failed to upload index part for '{tenant_id} / {timeline_id}'"))
}
@@ -44,36 +41,26 @@ pub(super) async fn upload_index_part<'a>(
/// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload.
///
/// On an error, bumps the retries count and reschedules the entire task.
pub(super) async fn upload_timeline_layer(
storage: &GenericRemoteStorage,
source_path: &Path,
known_metadata: &LayerFileMetadata,
pub(super) async fn upload_timeline_layer<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
source_path: &'a Path,
known_metadata: &'a LayerFileMetadata,
) -> anyhow::Result<()> {
fail_point!("before-upload-layer", |_| {
bail!("failpoint before-upload-layer")
});
let storage_path = storage.remote_object_id(source_path).with_context(|| {
format!(
"Failed to get the layer storage path for local path '{}'",
source_path.display()
)
})?;
let storage_path = conf.remote_path(source_path)?;
let source_file = fs::File::open(&source_path).await.with_context(|| {
format!(
"Failed to open a source file for layer '{}'",
source_path.display()
)
})?;
let source_file = fs::File::open(&source_path)
.await
.with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?;
let fs_size = source_file
.metadata()
.await
.with_context(|| {
format!(
"Failed to get the source file metadata for layer '{}'",
source_path.display()
)
format!("Failed to get the source file metadata for layer {source_path:?}")
})?
.len();

View File

@@ -3,8 +3,11 @@ use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use anyhow::Context;
use tokio::sync::oneshot::error::RecvError;
use tokio::sync::Semaphore;
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
use super::Tenant;
use utils::id::TimelineId;
use utils::lsn::Lsn;
@@ -67,6 +70,7 @@ pub(super) async fn gather_inputs(
let timelines = tenant
.refresh_gc_info()
.await
.context("Failed to refresh gc_info before gathering inputs")?;
if timelines.is_empty() {
@@ -93,8 +97,6 @@ pub(super) async fn gather_inputs(
// used to determine the `retention_period` for the size model
let mut max_cutoff_distance = None;
// this will probably conflict with on-demand downloaded layers, or at least force them all
// to be downloaded
for timeline in timelines {
let last_record_lsn = timeline.get_last_record_lsn();
@@ -212,11 +214,30 @@ pub(super) async fn gather_inputs(
let mut have_any_error = false;
while let Some(res) = joinset.join_next().await {
// each of these come with Result<Result<_, JoinError>, JoinError>
// each of these come with Result<anyhow::Result<_>, JoinError>
// because of spawn + spawn_blocking
let res = res.and_then(|inner| inner);
match res {
Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size))) => {
Err(join_error) if join_error.is_cancelled() => {
unreachable!("we are not cancelling any of the futures, nor should be");
}
Err(join_error) => {
// cannot really do anything, as this panic is likely a bug
error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}");
have_any_error = true;
}
Ok(Err(recv_result_error)) => {
// cannot really do anything, as this panic is likely a bug
error!("failed to receive logical size query result: {recv_result_error:#}");
have_any_error = true;
}
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
warn!(
timeline_id=%timeline.timeline_id,
"failed to calculate logical size at {lsn}: {error:#}"
);
have_any_error = true;
}
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {
debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
logical_size_cache.insert((timeline.timeline_id, lsn), size);
@@ -228,21 +249,6 @@ pub(super) async fn gather_inputs(
command: Command::Update(size),
});
}
Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error))) => {
warn!(
timeline_id=%timeline.timeline_id,
"failed to calculate logical size at {lsn}: {error:#}"
);
have_any_error = true;
}
Err(join_error) if join_error.is_cancelled() => {
unreachable!("we are not cancelling any of the futures, nor should be");
}
Err(join_error) => {
// cannot really do anything, as this panic is likely a bug
error!("logical size query panicked: {join_error:#}");
have_any_error = true;
}
}
}
@@ -351,7 +357,7 @@ enum LsnKind {
struct TimelineAtLsnSizeResult(
Arc<crate::tenant::Timeline>,
utils::lsn::Lsn,
anyhow::Result<u64>,
Result<u64, CalculateLogicalSizeError>,
);
#[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))]
@@ -359,17 +365,15 @@ async fn calculate_logical_size(
limit: Arc<tokio::sync::Semaphore>,
timeline: Arc<crate::tenant::Timeline>,
lsn: utils::lsn::Lsn,
) -> Result<TimelineAtLsnSizeResult, tokio::task::JoinError> {
let permit = tokio::sync::Semaphore::acquire_owned(limit)
) -> Result<TimelineAtLsnSizeResult, RecvError> {
let _permit = tokio::sync::Semaphore::acquire_owned(limit)
.await
.expect("global semaphore should not had been closed");
tokio::task::spawn_blocking(move || {
let _permit = permit;
let size_res = timeline.calculate_logical_size(lsn);
TimelineAtLsnSizeResult(timeline, lsn, size_res)
})
.await
let size_res = timeline
.spawn_ondemand_logical_size_calculation(lsn)
.await?;
Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
}
#[test]

View File

@@ -1,6 +1,10 @@
//!
//! Common traits and structs for layers
//!
mod delta_layer;
mod filename;
mod image_layer;
mod inmemory_layer;
mod remote_layer;
use crate::repository::{Key, Value};
use crate::walrecord::NeonWalRecord;
@@ -8,12 +12,19 @@ use anyhow::Result;
use bytes::Bytes;
use std::ops::Range;
use std::path::PathBuf;
use std::sync::Arc;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
pub use filename::{DeltaFileName, ImageFileName, LayerFileName, PathOrConf};
pub use image_layer::{ImageLayer, ImageLayerWriter};
pub use inmemory_layer::InMemoryLayer;
pub use remote_layer::RemoteLayer;
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
where
T: PartialOrd<T>,
@@ -69,26 +80,9 @@ pub enum ValueReconstructResult {
Missing,
}
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
/// range of LSNs.
///
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
/// layers are used to ingest incoming WAL, and provide fast access to the
/// recent page versions. On-disk layers are stored as files on disk, and are
/// immutable. This trait presents the common functionality of in-memory and
/// on-disk layers.
///
/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
/// A delta layer contains all modifications within a range of LSNs and keys.
/// An image layer is a snapshot of all the data in a key-range, at a single
/// LSN
///
/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
/// required by [`LayerMap`].
pub trait Layer: Send + Sync {
fn get_tenant_id(&self) -> TenantId;
/// Identify the timeline this layer belongs to
fn get_timeline_id(&self) -> TimelineId;
/// Range of keys that this layer covers
fn get_key_range(&self) -> Range<Key>;
@@ -100,13 +94,11 @@ pub trait Layer: Send + Sync {
/// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
fn get_lsn_range(&self) -> Range<Lsn>;
/// Filename used to store this layer on disk. (Even in-memory layers
/// implement this, to print a handy unique identifier for the layer for
/// log messages, even though they're never not on disk.)
fn filename(&self) -> PathBuf;
/// If a layer has a corresponding file on a local filesystem, return its absolute path.
fn local_path(&self) -> Option<PathBuf>;
/// Does this layer only contain some data for the key-range (incremental),
/// or does it contain a version of every page? This is important to know
/// for garbage collecting old layers: an incremental layer depends on
/// the previous non-incremental layer.
fn is_incremental(&self) -> bool;
///
/// Return data needed to reconstruct given page at LSN.
@@ -127,35 +119,88 @@ pub trait Layer: Send + Sync {
reconstruct_data: &mut ValueReconstructState,
) -> Result<ValueReconstructResult>;
/// Does this layer only contain some data for the key-range (incremental),
/// or does it contain a version of every page? This is important to know
/// for garbage collecting old layers: an incremental layer depends on
/// the previous non-incremental layer.
fn is_incremental(&self) -> bool;
/// A short ID string that uniquely identifies the given layer within a [`LayerMap`].
fn short_id(&self) -> String;
/// Returns true for layers that are represented in memory.
fn is_in_memory(&self) -> bool;
/// Dump summary of the contents of the layer to stdout
fn dump(&self, verbose: bool) -> Result<()>;
}
/// Returned by [`Layer::iter`]
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i>;
/// Returned by [`Layer::key_iter`]
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
/// range of LSNs.
///
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
/// layers are used to ingest incoming WAL, and provide fast access to the
/// recent page versions. On-disk layers are stored as files on disk, and are
/// immutable. This trait presents the common functionality of in-memory and
/// on-disk layers.
///
/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
/// A delta layer contains all modifications within a range of LSNs and keys.
/// An image layer is a snapshot of all the data in a key-range, at a single
/// LSN
///
pub trait PersistentLayer: Layer {
fn get_tenant_id(&self) -> TenantId;
/// Identify the timeline this layer belongs to
fn get_timeline_id(&self) -> TimelineId;
/// File name used for this layer, both in the pageserver's local filesystem
/// state as well as in the remote storage.
fn filename(&self) -> LayerFileName;
// Path to the layer file in the local filesystem.
// `None` for `RemoteLayer`.
fn local_path(&self) -> Option<PathBuf>;
/// Iterate through all keys and values stored in the layer
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_>;
fn iter(&self) -> Result<LayerIter<'_>>;
/// Iterate through all keys stored in the layer. Returns key, lsn and value size
/// It is used only for compaction and so is currently implemented only for DeltaLayer
fn key_iter(&self) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + '_> {
fn key_iter(&self) -> Result<LayerKeyIter<'_>> {
panic!("Not implemented")
}
/// Permanently remove this layer from disk.
fn delete(&self) -> Result<()>;
/// Dump summary of the contents of the layer to stdout
fn dump(&self, verbose: bool) -> Result<()>;
fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
None
}
fn is_remote_layer(&self) -> bool {
false
}
/// Returns None if the layer file size is not known.
///
/// Should not change over the lifetime of the layer object because
/// current_physical_size is computed as the som of this value.
fn file_size(&self) -> Option<u64>;
}
pub fn downcast_remote_layer(
layer: &Arc<dyn PersistentLayer>,
) -> Option<std::sync::Arc<RemoteLayer>> {
if layer.is_remote_layer() {
Arc::clone(layer).downcast_remote_layer()
} else {
None
}
}
impl std::fmt::Debug for dyn Layer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Layer")
.field("filename", &self.filename())
.field("short_id", &self.short_id())
.finish()
}
}

View File

@@ -29,15 +29,16 @@ use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::filename::{DeltaFileName, PathOrConf};
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
use crate::tenant::storage_layer::{
PersistentLayer, ValueReconstructResult, ValueReconstructState,
};
use crate::virtual_file::VirtualFile;
use crate::{walrecord, TEMP_FILE_SUFFIX};
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
use anyhow::{bail, ensure, Context, Result};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::fs;
use std::fs::{self, File};
use std::io::{BufWriter, Write};
use std::io::{Seek, SeekFrom};
use std::ops::Range;
@@ -52,6 +53,8 @@ use utils::{
lsn::Lsn,
};
use super::{DeltaFileName, Layer, LayerFileName, LayerIter, LayerKeyIter, PathOrConf};
///
/// Header stored in the beginning of the file
///
@@ -178,6 +181,8 @@ pub struct DeltaLayer {
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
pub file_size: u64,
inner: RwLock<DeltaLayerInner>,
}
@@ -194,14 +199,6 @@ pub struct DeltaLayerInner {
}
impl Layer for DeltaLayer {
fn get_tenant_id(&self) -> TenantId {
self.tenant_id
}
fn get_timeline_id(&self) -> TimelineId {
self.timeline_id
}
fn get_key_range(&self) -> Range<Key> {
self.key_range.clone()
}
@@ -209,13 +206,86 @@ impl Layer for DeltaLayer {
fn get_lsn_range(&self) -> Range<Lsn> {
self.lsn_range.clone()
}
fn filename(&self) -> PathBuf {
PathBuf::from(self.layer_name().to_string())
fn is_incremental(&self) -> bool {
true
}
fn local_path(&self) -> Option<PathBuf> {
Some(self.path())
fn short_id(&self) -> String {
self.filename().file_name()
}
/// debugging function to print out the contents of the layer
fn dump(&self, verbose: bool) -> Result<()> {
println!(
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
self.tenant_id,
self.timeline_id,
self.key_range.start,
self.key_range.end,
self.lsn_range.start,
self.lsn_range.end
);
if !verbose {
return Ok(());
}
let inner = self.load()?;
println!(
"index_start_blk: {}, root {}",
inner.index_start_blk, inner.index_root_blk
);
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
file,
);
tree_reader.dump()?;
let mut cursor = file.block_cursor();
// A subroutine to dump a single blob
let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
let buf = cursor.read_blob(blob_ref.pos())?;
let val = Value::des(&buf)?;
let desc = match val {
Value::Image(img) => {
format!(" img {} bytes", img.len())
}
Value::WalRecord(rec) => {
let wal_desc = walrecord::describe_wal_record(&rec)?;
format!(
" rec {} bytes will_init: {} {}",
buf.len(),
rec.will_init(),
wal_desc
)
}
};
Ok(desc)
};
tree_reader.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|delta_key, val| {
let blob_ref = BlobRef(val);
let key = DeltaKey::extract_key_from_buf(delta_key);
let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
let desc = match dump_blob(blob_ref) {
Ok(desc) => desc,
Err(err) => format!("ERROR: {}", err),
};
println!(" key {} at {}: {}", key, lsn, desc);
true
},
)?;
Ok(())
}
fn get_value_reconstruct_data(
@@ -302,29 +372,38 @@ impl Layer for DeltaLayer {
Ok(ValueReconstructResult::Complete)
}
}
}
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = anyhow::Result<(Key, Lsn, Value)>> + 'a> {
let inner = match self.load() {
Ok(inner) => inner,
Err(e) => panic!("Failed to load a delta layer: {e:?}"),
};
match DeltaValueIter::new(inner) {
Ok(iter) => Box::new(iter),
Err(err) => Box::new(std::iter::once(Err(err))),
}
impl PersistentLayer for DeltaLayer {
fn get_tenant_id(&self) -> TenantId {
self.tenant_id
}
fn key_iter<'a>(&'a self) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'a> {
let inner = match self.load() {
Ok(inner) => inner,
Err(e) => panic!("Failed to load a delta layer: {e:?}"),
};
fn get_timeline_id(&self) -> TimelineId {
self.timeline_id
}
match DeltaKeyIter::new(inner) {
fn filename(&self) -> LayerFileName {
self.layer_name().into()
}
fn local_path(&self) -> Option<PathBuf> {
Some(self.path())
}
fn iter(&self) -> Result<LayerIter<'_>> {
let inner = self.load().context("load delta layer")?;
Ok(match DeltaValueIter::new(inner) {
Ok(iter) => Box::new(iter),
Err(e) => panic!("Layer index is corrupted: {e:?}"),
}
Err(err) => Box::new(std::iter::once(Err(err))),
})
}
fn key_iter(&self) -> Result<LayerKeyIter<'_>> {
let inner = self.load()?;
Ok(Box::new(
DeltaKeyIter::new(inner).context("Layer index is corrupted")?,
))
}
fn delete(&self) -> Result<()> {
@@ -333,87 +412,8 @@ impl Layer for DeltaLayer {
Ok(())
}
fn is_incremental(&self) -> bool {
true
}
fn is_in_memory(&self) -> bool {
false
}
/// debugging function to print out the contents of the layer
fn dump(&self, verbose: bool) -> Result<()> {
println!(
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
self.tenant_id,
self.timeline_id,
self.key_range.start,
self.key_range.end,
self.lsn_range.start,
self.lsn_range.end
);
if !verbose {
return Ok(());
}
let inner = self.load()?;
println!(
"index_start_blk: {}, root {}",
inner.index_start_blk, inner.index_root_blk
);
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
file,
);
tree_reader.dump()?;
let mut cursor = file.block_cursor();
// A subroutine to dump a single blob
let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
let buf = cursor.read_blob(blob_ref.pos())?;
let val = Value::des(&buf)?;
let desc = match val {
Value::Image(img) => {
format!(" img {} bytes", img.len())
}
Value::WalRecord(rec) => {
let wal_desc = walrecord::describe_wal_record(&rec)?;
format!(
" rec {} bytes will_init: {} {}",
buf.len(),
rec.will_init(),
wal_desc
)
}
};
Ok(desc)
};
tree_reader.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|delta_key, val| {
let blob_ref = BlobRef(val);
let key = DeltaKey::extract_key_from_buf(delta_key);
let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
let desc = match dump_blob(blob_ref) {
Ok(desc) => desc,
Err(err) => format!("ERROR: {}", err),
};
println!(" key {} at {}: {}", key, lsn, desc);
true
},
)?;
Ok(())
fn file_size(&self) -> Option<u64> {
Some(self.file_size)
}
}
@@ -511,8 +511,8 @@ impl DeltaLayer {
}
}
PathOrConf::Path(path) => {
let actual_filename = Path::new(path.file_name().unwrap());
let expected_filename = self.filename();
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
let expected_filename = self.filename().file_name();
if actual_filename != expected_filename {
println!(
@@ -539,6 +539,7 @@ impl DeltaLayer {
timeline_id: TimelineId,
tenant_id: TenantId,
filename: &DeltaFileName,
file_size: u64,
) -> DeltaLayer {
DeltaLayer {
path_or_conf: PathOrConf::Conf(conf),
@@ -546,6 +547,7 @@ impl DeltaLayer {
tenant_id,
key_range: filename.key_range.clone(),
lsn_range: filename.lsn_range.clone(),
file_size,
inner: RwLock::new(DeltaLayerInner {
loaded: false,
file: None,
@@ -558,21 +560,23 @@ impl DeltaLayer {
/// Create a DeltaLayer struct representing an existing file on disk.
///
/// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
pub fn new_for_path<F>(path: &Path, file: F) -> Result<Self>
where
F: FileExt,
{
pub fn new_for_path(path: &Path, file: File) -> Result<Self> {
let mut summary_buf = Vec::new();
summary_buf.resize(PAGE_SZ, 0);
file.read_exact_at(&mut summary_buf, 0)?;
let summary = Summary::des_prefix(&summary_buf)?;
let metadata = file
.metadata()
.context("get file metadata to determine size")?;
Ok(DeltaLayer {
path_or_conf: PathOrConf::Path(path.to_path_buf()),
timeline_id: summary.timeline_id,
tenant_id: summary.tenant_id,
key_range: summary.key_range,
lsn_range: summary.lsn_range,
file_size: metadata.len(),
inner: RwLock::new(DeltaLayerInner {
loaded: false,
file: None,
@@ -729,6 +733,10 @@ impl DeltaLayerWriterInner {
file.seek(SeekFrom::Start(0))?;
Summary::ser_into(&summary, &mut file)?;
let metadata = file
.metadata()
.context("get file metadata to determine size")?;
// Note: Because we opened the file in write-only mode, we cannot
// reuse the same VirtualFile for reading later. That's why we don't
// set inner.file here. The first read will have to re-open it.
@@ -738,6 +746,7 @@ impl DeltaLayerWriterInner {
timeline_id: self.timeline_id,
key_range: self.key_start..key_end,
lsn_range: self.lsn_range.clone(),
file_size: metadata.len(),
inner: RwLock::new(DeltaLayerInner {
loaded: false,
file: None,

View File

@@ -7,11 +7,12 @@ use std::cmp::Ordering;
use std::fmt;
use std::ops::Range;
use std::path::PathBuf;
use std::str::FromStr;
use utils::lsn::Lsn;
// Note: Timeline::load_layer_map() relies on this sort order
#[derive(Debug, PartialEq, Eq, Clone)]
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub struct DeltaFileName {
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
@@ -101,7 +102,7 @@ impl fmt::Display for DeltaFileName {
}
}
#[derive(Debug, PartialEq, Eq, Clone)]
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub struct ImageFileName {
pub key_range: Range<Key>,
pub lsn: Lsn,
@@ -172,6 +173,103 @@ impl fmt::Display for ImageFileName {
)
}
}
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
pub enum LayerFileName {
Image(ImageFileName),
Delta(DeltaFileName),
#[cfg(test)]
Test(String),
}
impl LayerFileName {
pub fn file_name(&self) -> String {
match self {
LayerFileName::Image(fname) => format!("{fname}"),
LayerFileName::Delta(fname) => format!("{fname}"),
#[cfg(test)]
LayerFileName::Test(fname) => fname.to_string(),
}
}
#[cfg(test)]
pub(crate) fn new_test(name: &str) -> LayerFileName {
LayerFileName::Test(name.to_owned())
}
}
impl From<ImageFileName> for LayerFileName {
fn from(fname: ImageFileName) -> Self {
LayerFileName::Image(fname)
}
}
impl From<DeltaFileName> for LayerFileName {
fn from(fname: DeltaFileName) -> Self {
LayerFileName::Delta(fname)
}
}
// include a `/` in the name as an additional layer of robustness
// because `/` chars are not allowed in UNIX paths
#[cfg(test)]
const LAYER_FILE_NAME_TEST_PREFIX: &str = "LAYER_FILE_NAME::test/";
impl FromStr for LayerFileName {
type Err = String;
fn from_str(value: &str) -> Result<Self, Self::Err> {
#[cfg(test)]
if let Some(value) = value.strip_prefix(LAYER_FILE_NAME_TEST_PREFIX) {
return Ok(LayerFileName::Test(value.to_owned()));
}
let delta = DeltaFileName::parse_str(value);
let image = ImageFileName::parse_str(value);
let ok = match (delta, image) {
(None, None) => {
return Err(format!(
"neither delta nor image layer file name: {value:?}"
))
}
(Some(delta), None) => LayerFileName::Delta(delta),
(None, Some(image)) => LayerFileName::Image(image),
(Some(_), Some(_)) => unreachable!(),
};
Ok(ok)
}
}
impl serde::Serialize for LayerFileName {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
match self {
LayerFileName::Image(fname) => serializer.serialize_str(&format!("{}", fname)),
LayerFileName::Delta(fname) => serializer.serialize_str(&format!("{}", fname)),
#[cfg(test)]
LayerFileName::Test(t) => {
serializer.serialize_str(&format!("{LAYER_FILE_NAME_TEST_PREFIX}{t}"))
}
}
}
}
struct LayerFileNameVisitor;
impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor {
type Value = LayerFileName;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
write!(
formatter,
"a string that is a valid image or delta layer file name"
)
}
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
v.parse().map_err(|e| E::custom(e))
}
}
/// Helper enum to hold a PageServerConf, or a path
///

View File

@@ -21,12 +21,13 @@
//! actual page images are stored in the "values" part.
use crate::config::PageServerConf;
use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, Value, KEY_SIZE};
use crate::repository::{Key, KEY_SIZE};
use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::filename::{ImageFileName, PathOrConf};
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
use crate::tenant::storage_layer::{
PersistentLayer, ValueReconstructResult, ValueReconstructState,
};
use crate::virtual_file::VirtualFile;
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
use anyhow::{bail, ensure, Context, Result};
@@ -34,10 +35,11 @@ use bytes::Bytes;
use hex;
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::fs;
use std::fs::{self, File};
use std::io::Write;
use std::io::{Seek, SeekFrom};
use std::ops::Range;
use std::os::unix::prelude::FileExt;
use std::path::{Path, PathBuf};
use std::sync::{RwLock, RwLockReadGuard};
use tracing::*;
@@ -48,6 +50,9 @@ use utils::{
lsn::Lsn,
};
use super::filename::{ImageFileName, LayerFileName, PathOrConf};
use super::{Layer, LayerIter};
///
/// Header stored in the beginning of the file
///
@@ -100,6 +105,7 @@ pub struct ImageLayer {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub key_range: Range<Key>,
pub file_size: u64,
// This entry contains an image of all pages as of this LSN
pub lsn: Lsn,
@@ -120,22 +126,6 @@ pub struct ImageLayerInner {
}
impl Layer for ImageLayer {
fn filename(&self) -> PathBuf {
PathBuf::from(self.layer_name().to_string())
}
fn local_path(&self) -> Option<PathBuf> {
Some(self.path())
}
fn get_tenant_id(&self) -> TenantId {
self.tenant_id
}
fn get_timeline_id(&self) -> TimelineId {
self.timeline_id
}
fn get_key_range(&self) -> Range<Key> {
self.key_range.clone()
}
@@ -144,58 +134,12 @@ impl Layer for ImageLayer {
// End-bound is exclusive
self.lsn..(self.lsn + 1)
}
/// Look up given page in the file
fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
) -> anyhow::Result<ValueReconstructResult> {
assert!(self.key_range.contains(&key));
assert!(lsn_range.start >= self.lsn);
assert!(lsn_range.end >= self.lsn);
let inner = self.load()?;
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
key.write_to_byte_slice(&mut keybuf);
if let Some(offset) = tree_reader.get(&keybuf)? {
let blob = file.block_cursor().read_blob(offset).with_context(|| {
format!(
"failed to read value from data file {} at offset {}",
self.filename().display(),
offset
)
})?;
let value = Bytes::from(blob);
reconstruct_state.img = Some((self.lsn, value));
Ok(ValueReconstructResult::Complete)
} else {
Ok(ValueReconstructResult::Missing)
}
}
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
todo!();
}
fn delete(&self) -> Result<()> {
// delete underlying file
fs::remove_file(self.path())?;
Ok(())
}
fn is_incremental(&self) -> bool {
false
}
fn is_in_memory(&self) -> bool {
false
fn short_id(&self) -> String {
self.filename().file_name()
}
/// debugging function to print out the contents of the layer
@@ -223,6 +167,72 @@ impl Layer for ImageLayer {
Ok(())
}
/// Look up given page in the file
fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
) -> anyhow::Result<ValueReconstructResult> {
assert!(self.key_range.contains(&key));
assert!(lsn_range.start >= self.lsn);
assert!(lsn_range.end >= self.lsn);
let inner = self.load()?;
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
key.write_to_byte_slice(&mut keybuf);
if let Some(offset) = tree_reader.get(&keybuf)? {
let blob = file.block_cursor().read_blob(offset).with_context(|| {
format!(
"failed to read value from data file {} at offset {}",
self.path().display(),
offset
)
})?;
let value = Bytes::from(blob);
reconstruct_state.img = Some((self.lsn, value));
Ok(ValueReconstructResult::Complete)
} else {
Ok(ValueReconstructResult::Missing)
}
}
}
impl PersistentLayer for ImageLayer {
fn filename(&self) -> LayerFileName {
self.layer_name().into()
}
fn local_path(&self) -> Option<PathBuf> {
Some(self.path())
}
fn get_tenant_id(&self) -> TenantId {
self.tenant_id
}
fn get_timeline_id(&self) -> TimelineId {
self.timeline_id
}
fn iter(&self) -> Result<LayerIter<'_>> {
unimplemented!();
}
fn delete(&self) -> Result<()> {
// delete underlying file
fs::remove_file(self.path())?;
Ok(())
}
fn file_size(&self) -> Option<u64> {
Some(self.file_size)
}
}
impl ImageLayer {
@@ -314,8 +324,8 @@ impl ImageLayer {
}
}
PathOrConf::Path(path) => {
let actual_filename = Path::new(path.file_name().unwrap());
let expected_filename = self.filename();
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
let expected_filename = self.filename().file_name();
if actual_filename != expected_filename {
println!(
@@ -339,6 +349,7 @@ impl ImageLayer {
timeline_id: TimelineId,
tenant_id: TenantId,
filename: &ImageFileName,
file_size: u64,
) -> ImageLayer {
ImageLayer {
path_or_conf: PathOrConf::Conf(conf),
@@ -346,6 +357,7 @@ impl ImageLayer {
tenant_id,
key_range: filename.key_range.clone(),
lsn: filename.lsn,
file_size,
inner: RwLock::new(ImageLayerInner {
loaded: false,
file: None,
@@ -358,21 +370,21 @@ impl ImageLayer {
/// Create an ImageLayer struct representing an existing file on disk.
///
/// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
pub fn new_for_path<F>(path: &Path, file: F) -> Result<ImageLayer>
where
F: std::os::unix::prelude::FileExt,
{
pub fn new_for_path(path: &Path, file: File) -> Result<ImageLayer> {
let mut summary_buf = Vec::new();
summary_buf.resize(PAGE_SZ, 0);
file.read_exact_at(&mut summary_buf, 0)?;
let summary = Summary::des_prefix(&summary_buf)?;
let metadata = file
.metadata()
.context("get file metadata to determine size")?;
Ok(ImageLayer {
path_or_conf: PathOrConf::Path(path.to_path_buf()),
timeline_id: summary.timeline_id,
tenant_id: summary.tenant_id,
key_range: summary.key_range,
lsn: summary.lsn,
file_size: metadata.len(),
inner: RwLock::new(ImageLayerInner {
file: None,
loaded: false,
@@ -518,6 +530,10 @@ impl ImageLayerWriterInner {
file.seek(SeekFrom::Start(0))?;
Summary::ser_into(&summary, &mut file)?;
let metadata = file
.metadata()
.context("get metadata to determine file size")?;
// Note: Because we open the file in write-only mode, we cannot
// reuse the same VirtualFile for reading later. That's why we don't
// set inner.file here. The first read will have to re-open it.
@@ -527,6 +543,7 @@ impl ImageLayerWriterInner {
tenant_id: self.tenant_id,
key_range: self.key_range.clone(),
lsn: self.lsn,
file_size: metadata.len(),
inner: RwLock::new(ImageLayerInner {
loaded: false,
file: None,
@@ -551,7 +568,7 @@ impl ImageLayerWriterInner {
lsn: self.lsn,
},
);
std::fs::rename(self.path, &final_path)?;
std::fs::rename(self.path, final_path)?;
trace!("created image layer {}", layer.path().display());

View File

@@ -8,11 +8,10 @@ use crate::config::PageServerConf;
use crate::repository::{Key, Value};
use crate::tenant::blob_io::{BlobCursor, BlobWriter};
use crate::tenant::block_io::BlockReader;
use crate::tenant::delta_layer::{DeltaLayer, DeltaLayerWriter};
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
use crate::walrecord;
use anyhow::{bail, ensure, Result};
use anyhow::{ensure, Result};
use std::cell::RefCell;
use std::collections::HashMap;
use tracing::*;
@@ -26,9 +25,10 @@ use utils::{
// while being able to use std::fmt::Write's methods
use std::fmt::Write as _;
use std::ops::Range;
use std::path::PathBuf;
use std::sync::RwLock;
use super::{DeltaLayer, DeltaLayerWriter, Layer};
thread_local! {
/// A buffer for serializing object during [`InMemoryLayer::put_value`].
/// This buffer is reused for each serialization to avoid additional malloc calls.
@@ -75,33 +75,13 @@ impl InMemoryLayerInner {
}
}
impl Layer for InMemoryLayer {
// An in-memory layer can be spilled to disk into ephemeral file,
// This function is used only for debugging, so we don't need to be very precise.
// Construct a filename as if it was a delta layer.
fn filename(&self) -> PathBuf {
let inner = self.inner.read().unwrap();
let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
PathBuf::from(format!(
"inmem-{:016X}-{:016X}",
self.start_lsn.0, end_lsn.0
))
}
fn local_path(&self) -> Option<PathBuf> {
None
}
fn get_tenant_id(&self) -> TenantId {
self.tenant_id
}
fn get_timeline_id(&self) -> TimelineId {
impl InMemoryLayer {
pub fn get_timeline_id(&self) -> TimelineId {
self.timeline_id
}
}
impl Layer for InMemoryLayer {
fn get_key_range(&self) -> Range<Key> {
Key::MIN..Key::MAX
}
@@ -117,72 +97,16 @@ impl Layer for InMemoryLayer {
self.start_lsn..end_lsn
}
/// Look up given value in the layer.
fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
) -> anyhow::Result<ValueReconstructResult> {
ensure!(lsn_range.start >= self.start_lsn);
let mut need_image = true;
let inner = self.inner.read().unwrap();
let mut reader = inner.file.block_cursor();
// Scan the page versions backwards, starting from `lsn`.
if let Some(vec_map) = inner.index.get(&key) {
let slice = vec_map.slice_range(lsn_range);
for (entry_lsn, pos) in slice.iter().rev() {
let buf = reader.read_blob(*pos)?;
let value = Value::des(&buf)?;
match value {
Value::Image(img) => {
reconstruct_state.img = Some((*entry_lsn, img));
return Ok(ValueReconstructResult::Complete);
}
Value::WalRecord(rec) => {
let will_init = rec.will_init();
reconstruct_state.records.push((*entry_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
}
}
}
}
// release lock on 'inner'
// If an older page image is needed to reconstruct the page, let the
// caller know.
if need_image {
Ok(ValueReconstructResult::Continue)
} else {
Ok(ValueReconstructResult::Complete)
}
}
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
todo!();
}
/// Nothing to do here. When you drop the last reference to the layer, it will
/// be deallocated.
fn delete(&self) -> Result<()> {
bail!("can't delete an InMemoryLayer")
}
fn is_incremental(&self) -> bool {
// in-memory layer is always considered incremental.
true
}
fn is_in_memory(&self) -> bool {
true
fn short_id(&self) -> String {
let inner = self.inner.read().unwrap();
let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
format!("inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
}
/// debugging function to print out the contents of the layer
@@ -235,6 +159,55 @@ impl Layer for InMemoryLayer {
Ok(())
}
/// Look up given value in the layer.
fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
) -> anyhow::Result<ValueReconstructResult> {
ensure!(lsn_range.start >= self.start_lsn);
let mut need_image = true;
let inner = self.inner.read().unwrap();
let mut reader = inner.file.block_cursor();
// Scan the page versions backwards, starting from `lsn`.
if let Some(vec_map) = inner.index.get(&key) {
let slice = vec_map.slice_range(lsn_range);
for (entry_lsn, pos) in slice.iter().rev() {
let buf = reader.read_blob(*pos)?;
let value = Value::des(&buf)?;
match value {
Value::Image(img) => {
reconstruct_state.img = Some((*entry_lsn, img));
return Ok(ValueReconstructResult::Complete);
}
Value::WalRecord(rec) => {
let will_init = rec.will_init();
reconstruct_state.records.push((*entry_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
}
}
}
}
// release lock on 'inner'
// If an older page image is needed to reconstruct the page, let the
// caller know.
if need_image {
Ok(ValueReconstructResult::Continue)
} else {
Ok(ValueReconstructResult::Complete)
}
}
}
impl InMemoryLayer {

View File

@@ -0,0 +1,210 @@
//! A RemoteLayer is an in-memory placeholder for a layer file that exists
//! in remote storage.
//!
use crate::config::PageServerConf;
use crate::repository::Key;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
use anyhow::{bail, Result};
use std::ops::Range;
use std::path::PathBuf;
use std::sync::Arc;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use super::filename::{DeltaFileName, ImageFileName, LayerFileName};
use super::image_layer::ImageLayer;
use super::{DeltaLayer, LayerIter, LayerKeyIter, PersistentLayer};
#[derive(Debug)]
pub struct RemoteLayer {
tenantid: TenantId,
timelineid: TimelineId,
key_range: Range<Key>,
lsn_range: Range<Lsn>,
pub file_name: LayerFileName,
pub layer_metadata: LayerFileMetadata,
is_delta: bool,
is_incremental: bool,
pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
}
impl Layer for RemoteLayer {
fn get_key_range(&self) -> Range<Key> {
self.key_range.clone()
}
fn get_lsn_range(&self) -> Range<Lsn> {
self.lsn_range.clone()
}
fn get_value_reconstruct_data(
&self,
_key: Key,
_lsn_range: Range<Lsn>,
_reconstruct_state: &mut ValueReconstructState,
) -> Result<ValueReconstructResult> {
bail!(
"layer {} needs to be downloaded",
self.filename().file_name()
);
}
fn is_incremental(&self) -> bool {
self.is_incremental
}
/// debugging function to print out the contents of the layer
fn dump(&self, _verbose: bool) -> Result<()> {
println!(
"----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
self.tenantid,
self.timelineid,
self.key_range.start,
self.key_range.end,
self.lsn_range.start,
self.lsn_range.end
);
Ok(())
}
fn short_id(&self) -> String {
self.filename().file_name()
}
}
impl PersistentLayer for RemoteLayer {
fn get_tenant_id(&self) -> TenantId {
self.tenantid
}
fn get_timeline_id(&self) -> TimelineId {
self.timelineid
}
fn filename(&self) -> LayerFileName {
if self.is_delta {
DeltaFileName {
key_range: self.key_range.clone(),
lsn_range: self.lsn_range.clone(),
}
.into()
} else {
ImageFileName {
key_range: self.key_range.clone(),
lsn: self.lsn_range.start,
}
.into()
}
}
fn local_path(&self) -> Option<PathBuf> {
None
}
fn iter(&self) -> Result<LayerIter<'_>> {
bail!("cannot iterate a remote layer");
}
fn key_iter(&self) -> Result<LayerKeyIter<'_>> {
bail!("cannot iterate a remote layer");
}
fn delete(&self) -> Result<()> {
Ok(())
}
fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
Some(self)
}
fn is_remote_layer(&self) -> bool {
true
}
fn file_size(&self) -> Option<u64> {
self.layer_metadata.file_size()
}
}
impl RemoteLayer {
pub fn new_img(
tenantid: TenantId,
timelineid: TimelineId,
fname: &ImageFileName,
layer_metadata: &LayerFileMetadata,
) -> RemoteLayer {
RemoteLayer {
tenantid,
timelineid,
key_range: fname.key_range.clone(),
lsn_range: fname.lsn..(fname.lsn + 1),
is_delta: false,
is_incremental: false,
file_name: fname.to_owned().into(),
layer_metadata: layer_metadata.clone(),
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
}
}
pub fn new_delta(
tenantid: TenantId,
timelineid: TimelineId,
fname: &DeltaFileName,
layer_metadata: &LayerFileMetadata,
) -> RemoteLayer {
RemoteLayer {
tenantid,
timelineid,
key_range: fname.key_range.clone(),
lsn_range: fname.lsn_range.clone(),
is_delta: true,
is_incremental: true,
file_name: fname.to_owned().into(),
layer_metadata: layer_metadata.clone(),
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
}
}
/// Create a Layer struct representing this layer, after it has been downloaded.
pub fn create_downloaded_layer(
&self,
conf: &'static PageServerConf,
file_size: u64,
) -> Arc<dyn PersistentLayer> {
if self.is_delta {
let fname = DeltaFileName {
key_range: self.key_range.clone(),
lsn_range: self.lsn_range.clone(),
};
Arc::new(DeltaLayer::new(
conf,
self.timelineid,
self.tenantid,
&fname,
file_size,
))
} else {
let fname = ImageFileName {
key_range: self.key_range.clone(),
lsn: self.lsn_range.start,
};
Arc::new(ImageLayer::new(
conf,
self.timelineid,
self.tenantid,
&fname,
file_size,
))
}
}
}

View File

@@ -8,8 +8,8 @@ use std::time::Duration;
use crate::metrics::TENANT_TASK_EVENTS;
use crate::task_mgr;
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::mgr;
use crate::tenant::{Tenant, TenantState};
use crate::tenant_mgr;
use tracing::*;
use utils::id::TenantId;
@@ -127,7 +127,7 @@ async fn gc_loop(tenant_id: TenantId) {
} else {
// Run gc
if gc_horizon > 0 {
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval()).await
{
sleep_duration = wait_duration;
error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
@@ -155,7 +155,7 @@ async fn wait_for_active_tenant(
wait: Duration,
) -> ControlFlow<(), Arc<Tenant>> {
let tenant = loop {
match tenant_mgr::get_tenant(tenant_id, false) {
match mgr::get_tenant(tenant_id, false).await {
Ok(tenant) => break tenant,
Err(e) => {
error!("Failed to get a tenant {tenant_id}: {e:#}");

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,213 @@
use crate::metrics::RemoteOpFileKind;
use super::storage_layer::LayerFileName;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use std::collections::{HashMap, VecDeque};
use std::fmt::Debug;
use std::sync::Arc;
use tracing::info;
use std::sync::atomic::AtomicU32;
use utils::lsn::Lsn;
// clippy warns that Uninitialized is much smaller than Initialized, which wastes
// memory for Uninitialized variants. Doesn't matter in practice, there are not
// that many upload queues in a running pageserver, and most of them are initialized
// anyway.
#[allow(clippy::large_enum_variant)]
pub(crate) enum UploadQueue {
Uninitialized,
Initialized(UploadQueueInitialized),
Stopped(UploadQueueStopped),
}
impl UploadQueue {
fn as_str(&self) -> &'static str {
match self {
UploadQueue::Uninitialized => "Uninitialized",
UploadQueue::Initialized(_) => "Initialized",
UploadQueue::Stopped(_) => "Stopped",
}
}
}
/// This keeps track of queued and in-progress tasks.
pub(crate) struct UploadQueueInitialized {
/// Counter to assign task IDs
pub(crate) task_counter: u64,
/// All layer files stored in the remote storage, taking into account all
/// in-progress and queued operations
pub(crate) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
/// How many file uploads or deletions been scheduled, since the
/// last (scheduling of) metadata index upload?
pub(crate) latest_files_changes_since_metadata_upload_scheduled: u64,
/// Metadata stored in the remote storage, taking into account all
/// in-progress and queued operations.
/// DANGER: do not return to outside world, e.g., safekeepers.
pub(crate) latest_metadata: TimelineMetadata,
/// `disk_consistent_lsn` from the last metadata file that was successfully
/// uploaded. `Lsn(0)` if nothing was uploaded yet.
/// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
/// Safekeeper can rely on it to make decisions for WAL storage.
pub(crate) last_uploaded_consistent_lsn: Lsn,
// Breakdown of different kinds of tasks currently in-progress
pub(crate) num_inprogress_layer_uploads: usize,
pub(crate) num_inprogress_metadata_uploads: usize,
pub(crate) num_inprogress_deletions: usize,
/// Tasks that are currently in-progress. In-progress means that a tokio Task
/// has been launched for it. An in-progress task can be busy uploading, but it can
/// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can
/// be waiting for retry in `exponential_backoff`.
pub(crate) inprogress_tasks: HashMap<u64, Arc<UploadTask>>,
/// Queued operations that have not been launched yet. They might depend on previous
/// tasks to finish. For example, metadata upload cannot be performed before all
/// preceding layer file uploads have completed.
pub(crate) queued_operations: VecDeque<UploadOp>,
}
pub(crate) struct UploadQueueStopped {
pub(crate) last_uploaded_consistent_lsn: Lsn,
}
impl UploadQueue {
pub(crate) fn initialize_empty_remote(
&mut self,
metadata: &TimelineMetadata,
) -> anyhow::Result<&mut UploadQueueInitialized> {
match self {
UploadQueue::Uninitialized => (),
UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
anyhow::bail!("already initialized, state {}", self.as_str())
}
}
info!("initializing upload queue for empty remote");
let state = UploadQueueInitialized {
// As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
latest_files: HashMap::new(),
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: metadata.clone(),
// We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
// safekeepers from garbage-collecting anything.
last_uploaded_consistent_lsn: Lsn(0),
// what follows are boring default initializations
task_counter: 0,
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
inprogress_tasks: HashMap::new(),
queued_operations: VecDeque::new(),
};
*self = UploadQueue::Initialized(state);
Ok(self.initialized_mut().expect("we just set it"))
}
pub(crate) fn initialize_with_current_remote_index_part(
&mut self,
index_part: &IndexPart,
) -> anyhow::Result<&mut UploadQueueInitialized> {
match self {
UploadQueue::Uninitialized => (),
UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
anyhow::bail!("already initialized, state {}", self.as_str())
}
}
let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
for layer_name in &index_part.timeline_layers {
let layer_metadata = index_part
.layer_metadata
.get(layer_name)
.map(LayerFileMetadata::from)
.unwrap_or(LayerFileMetadata::MISSING);
files.insert(layer_name.to_owned(), layer_metadata);
}
let index_part_metadata = index_part.parse_metadata()?;
info!(
"initializing upload queue with remote index_part.disk_consistent_lsn: {}",
index_part_metadata.disk_consistent_lsn()
);
let state = UploadQueueInitialized {
latest_files: files,
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: index_part_metadata.clone(),
last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
// what follows are boring default initializations
task_counter: 0,
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
inprogress_tasks: HashMap::new(),
queued_operations: VecDeque::new(),
};
*self = UploadQueue::Initialized(state);
Ok(self.initialized_mut().expect("we just set it"))
}
pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
match self {
UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
anyhow::bail!("queue is in state {}", self.as_str())
}
UploadQueue::Initialized(x) => Ok(x),
}
}
}
/// An in-progress upload or delete task.
#[derive(Debug)]
pub(crate) struct UploadTask {
/// Unique ID of this task. Used as the key in `inprogress_tasks` above.
pub(crate) task_id: u64,
pub(crate) retries: AtomicU32,
pub(crate) op: UploadOp,
}
#[derive(Debug)]
pub(crate) enum UploadOp {
/// Upload a layer file
UploadLayer(LayerFileName, LayerFileMetadata),
/// Upload the metadata file
UploadMetadata(IndexPart, Lsn),
/// Delete a file.
Delete(RemoteOpFileKind, LayerFileName),
/// Barrier. When the barrier operation is reached,
Barrier(tokio::sync::watch::Sender<()>),
}
impl std::fmt::Display for UploadOp {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
UploadOp::UploadLayer(path, metadata) => {
write!(
f,
"UploadLayer({}, size={:?})",
path.file_name(),
metadata.file_size()
)
}
UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()),
UploadOp::Barrier(_) => write!(f, "Barrier"),
}
}
}

View File

@@ -1,427 +0,0 @@
//! This module acts as a switchboard to access different repositories managed by this
//! page server.
use std::collections::hash_map;
use std::ffi::OsStr;
use std::fs;
use std::path::Path;
use std::sync::Arc;
use anyhow::Context;
use tracing::*;
use remote_storage::GenericRemoteStorage;
use crate::config::PageServerConf;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::{Tenant, TenantState};
use crate::tenant_config::TenantConfOpt;
use utils::fs_ext::PathExt;
use utils::id::{TenantId, TimelineId};
mod tenants_state {
use once_cell::sync::Lazy;
use std::{
collections::HashMap,
sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard},
};
use utils::id::TenantId;
use crate::tenant::Tenant;
static TENANTS: Lazy<RwLock<HashMap<TenantId, Arc<Tenant>>>> =
Lazy::new(|| RwLock::new(HashMap::new()));
pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap<TenantId, Arc<Tenant>>> {
TENANTS
.read()
.expect("Failed to read() tenants lock, it got poisoned")
}
pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap<TenantId, Arc<Tenant>>> {
TENANTS
.write()
.expect("Failed to write() tenants lock, it got poisoned")
}
}
/// Initialize repositories with locally available timelines.
/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
/// are scheduled for download and added to the tenant once download is completed.
pub fn init_tenant_mgr(
conf: &'static PageServerConf,
remote_storage: Option<GenericRemoteStorage>,
) -> anyhow::Result<()> {
let _entered = info_span!("init_tenant_mgr").entered();
// Scan local filesystem for attached tenants
let mut number_of_tenants = 0;
let tenants_dir = conf.tenants_path();
for dir_entry in std::fs::read_dir(&tenants_dir)
.with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
{
match &dir_entry {
Ok(dir_entry) => {
let tenant_dir_path = dir_entry.path();
if crate::is_temporary(&tenant_dir_path) {
info!(
"Found temporary tenant directory, removing: {}",
tenant_dir_path.display()
);
if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) {
error!(
"Failed to remove temporary directory '{}': {:?}",
tenant_dir_path.display(),
e
);
}
} else {
match load_local_tenant(conf, &tenant_dir_path, remote_storage.clone()) {
Ok(Some(tenant)) => {
tenants_state::write_tenants().insert(tenant.tenant_id(), tenant);
number_of_tenants += 1;
}
Ok(None) => {
// This case happens if we crash during attach before creating the attach marker file
if let Err(e) = std::fs::remove_dir(&tenant_dir_path) {
error!(
"Failed to remove empty tenant directory '{}': {e:#}",
tenant_dir_path.display()
)
}
}
Err(e) => {
error!(
"Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}",
tenants_dir.display(),
dir_entry,
e
);
}
}
}
}
Err(e) => {
// On error, print it, but continue with the other tenants. If we error out
// here, the pageserver startup fails altogether, causing outage for *all*
// tenants. That seems worse.
error!(
"Failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
dir_entry,
tenants_dir.display(),
e,
);
}
}
}
info!("Processed {number_of_tenants} local tenants at startup");
Ok(())
}
fn load_local_tenant(
conf: &'static PageServerConf,
tenant_path: &Path,
remote_storage: Option<GenericRemoteStorage>,
) -> anyhow::Result<Option<Arc<Tenant>>> {
if !tenant_path.is_dir() {
anyhow::bail!("tenant_path is not a directory: {tenant_path:?}")
}
let is_empty = tenant_path
.is_empty_dir()
.context("check whether tenant_path is an empty dir")?;
if is_empty {
info!("skipping empty tenant directory {tenant_path:?}");
return Ok(None);
}
let tenant_id = tenant_path
.file_name()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TenantId>()
.context("Could not parse tenant id out of the tenant dir name")?;
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
if let Some(remote_storage) = remote_storage {
Tenant::spawn_attach(conf, tenant_id, &remote_storage)
} else {
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
Tenant::create_broken_tenant(conf, tenant_id)
}
} else {
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
// Start loading the tenant into memory. It will initially be in Loading state.
Tenant::spawn_load(conf, tenant_id, remote_storage)
};
Ok(Some(tenant))
}
///
/// Shut down all tenants. This runs as part of pageserver shutdown.
///
pub async fn shutdown_all_tenants() {
let tenants_to_shut_down = {
let mut m = tenants_state::write_tenants();
let mut tenants_to_shut_down = Vec::with_capacity(m.len());
for (_, tenant) in m.drain() {
if tenant.is_active() {
// updates tenant state, forbidding new GC and compaction iterations from starting
tenant.set_stopping();
tenants_to_shut_down.push(tenant)
}
}
drop(m);
tenants_to_shut_down
};
// Shut down all existing walreceiver connections and stop accepting the new ones.
task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await;
// Ok, no background tasks running anymore. Flush any remaining data in
// memory to disk.
//
// We assume that any incoming connections that might request pages from
// the tenant have already been terminated by the caller, so there
// should be no more activity in any of the repositories.
//
// On error, log it but continue with the shutdown for other tenants.
for tenant in tenants_to_shut_down {
let tenant_id = tenant.tenant_id();
debug!("shutdown tenant {tenant_id}");
if let Err(err) = tenant.checkpoint().await {
error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
}
}
}
pub fn create_tenant(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
tenant_id: TenantId,
remote_storage: Option<GenericRemoteStorage>,
) -> anyhow::Result<Option<Arc<Tenant>>> {
match tenants_state::write_tenants().entry(tenant_id) {
hash_map::Entry::Occupied(_) => {
debug!("tenant {tenant_id} already exists");
Ok(None)
}
hash_map::Entry::Vacant(v) => {
// Hold the write_tenants() lock, since all of this is local IO.
// If this section ever becomes contentious, introduce a new `TenantState::Creating`.
let tenant_directory =
super::tenant::create_tenant_files(conf, tenant_conf, tenant_id)?;
let created_tenant = load_local_tenant(conf, &tenant_directory, remote_storage)?;
match created_tenant {
None => {
// We get None in case the directory is empty.
// This shouldn't happen here, because we just created the directory.
// So, skip any cleanup work for now, we don't know how we reached this state.
anyhow::bail!("we just created the tenant directory, it can't be empty");
}
Some(tenant) => {
anyhow::ensure!(
tenant_id == tenant.tenant_id(),
"loaded created tenant has unexpected tenant id (expect {} != actual {})",
tenant_id,
tenant.tenant_id()
);
v.insert(Arc::clone(&tenant));
Ok(Some(tenant))
}
}
}
}
}
pub fn update_tenant_config(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
tenant_id: TenantId,
) -> anyhow::Result<()> {
info!("configuring tenant {tenant_id}");
get_tenant(tenant_id, true)?.update_tenant_config(tenant_conf);
Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?;
Ok(())
}
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
pub fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result<Arc<Tenant>> {
let m = tenants_state::read_tenants();
let tenant = m
.get(&tenant_id)
.with_context(|| format!("Tenant {tenant_id} not found in the local state"))?;
if active_only && !tenant.is_active() {
anyhow::bail!(
"Tenant {tenant_id} is not active. Current state: {:?}",
tenant.current_state()
)
} else {
Ok(Arc::clone(tenant))
}
}
pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> {
// Start with the shutdown of timeline tasks (this shuts down the walreceiver)
// It is important that we do not take locks here, and do not check whether the timeline exists
// because if we hold tenants_state::write_tenants() while awaiting for the tasks to join
// we cannot create new timelines and tenants, and that can take quite some time,
// it can even become stuck due to a bug making whole pageserver unavailable for some operations
// so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation
// and then try to actually remove timeline from inmemory state and this is the point when concurrent requests
// will synchronize and either fail with the not found error or succeed
debug!("waiting for wal receiver to shutdown");
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
Some(tenant_id),
Some(timeline_id),
)
.await;
debug!("wal receiver shutdown confirmed");
info!("waiting for timeline tasks to shutdown");
task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await;
info!("timeline task shutdown completed");
match get_tenant(tenant_id, true) {
Ok(tenant) => {
tenant.delete_timeline(timeline_id).await?;
}
Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"),
}
Ok(())
}
pub async fn detach_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
) -> anyhow::Result<()> {
let tenant = match {
let mut tenants_accessor = tenants_state::write_tenants();
tenants_accessor.remove(&tenant_id)
} {
Some(tenant) => tenant,
None => anyhow::bail!("Tenant not found for id {tenant_id}"),
};
tenant.set_stopping();
// shutdown all tenant and timeline tasks: gc, compaction, page service)
task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
// If removal fails there will be no way to successfully retry detach,
// because the tenant no longer exists in the in-memory map. And it needs to be removed from it
// before we remove files, because it contains references to tenant
// which references ephemeral files which are deleted on drop. So if we keep these references,
// we will attempt to remove files which no longer exist. This can be fixed by having shutdown
// mechanism for tenant that will clean temporary data to avoid any references to ephemeral files
let local_tenant_directory = conf.tenant_path(&tenant_id);
fs::remove_dir_all(&local_tenant_directory).with_context(|| {
format!(
"Failed to remove local tenant directory '{}'",
local_tenant_directory.display()
)
})?;
Ok(())
}
///
/// Get list of tenants, for the mgmt API
///
pub fn list_tenants() -> Vec<(TenantId, TenantState)> {
tenants_state::read_tenants()
.iter()
.map(|(id, tenant)| (*id, tenant.current_state()))
.collect()
}
/// Execute Attach mgmt API command.
///
/// Downloading all the tenant data is performed in the background, this merely
/// spawns the background task and returns quickly.
pub async fn attach_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
remote_storage: &GenericRemoteStorage,
) -> anyhow::Result<()> {
match tenants_state::write_tenants().entry(tenant_id) {
hash_map::Entry::Occupied(e) => {
// Cannot attach a tenant that already exists. The error message depends on
// the state it's in.
match e.get().current_state() {
TenantState::Attaching => {
anyhow::bail!("tenant {tenant_id} attach is already in progress")
}
current_state => {
anyhow::bail!("tenant already exists, current state: {current_state:?}")
}
}
}
hash_map::Entry::Vacant(v) => {
let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage);
v.insert(tenant);
Ok(())
}
}
}
#[cfg(feature = "testing")]
use {
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
utils::http::error::ApiError,
};
#[cfg(feature = "testing")]
pub fn immediate_gc(
tenant_id: TenantId,
timeline_id: TimelineId,
gc_req: TimelineGcRequest,
) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
let guard = tenants_state::read_tenants();
let tenant = guard
.get(&tenant_id)
.map(Arc::clone)
.with_context(|| format!("Tenant {tenant_id} not found"))
.map_err(ApiError::NotFound)?;
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
// Use tenant's pitr setting
let pitr = tenant.get_pitr_interval();
// Run in task_mgr to avoid race with detach operation
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::GarbageCollector,
Some(tenant_id),
Some(timeline_id),
&format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
false,
async move {
fail::fail_point!("immediate_gc_task_pre");
let result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
.await;
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
// better once the types support it.
match task_done.send(result) {
Ok(_) => (),
Err(result) => error!("failed to send gc result: {result:?}"),
}
Ok(())
}
);
// drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
drop(guard);
Ok(wait_task_done)
}

View File

@@ -12,7 +12,7 @@
//!
use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME};
use once_cell::sync::OnceCell;
use std::fs::{File, OpenOptions};
use std::fs::{self, File, OpenOptions};
use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
@@ -240,6 +240,10 @@ impl VirtualFile {
self.with_file("fsync", |file| file.sync_all())?
}
pub fn metadata(&self) -> Result<fs::Metadata, Error> {
self.with_file("metadata", |file| file.metadata())?
}
/// Helper function that looks up the underlying File for this VirtualFile,
/// opening it and evicting some other File if necessary. It calls 'func'
/// with the physical File.

File diff suppressed because it is too large Load Diff

View File

@@ -6,7 +6,7 @@
//! hence WAL receiver needs to react on such events.
//!
//! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming.
//! For that, it watches specific keys in etcd broker and pulls the relevant data periodically.
//! For that, it watches specific keys in storage_broker and pulls the relevant data periodically.
//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other.
//! Without this data, no WAL streaming is possible currently.
//!
@@ -26,57 +26,52 @@ mod walreceiver_connection;
use crate::config::PageServerConf;
use crate::task_mgr::WALRECEIVER_RUNTIME;
use anyhow::{ensure, Context};
use etcd_broker::Client;
use itertools::Itertools;
use anyhow::Context;
use once_cell::sync::OnceCell;
use std::future::Future;
use storage_broker::BrokerClientChannel;
use tokio::sync::watch;
use tracing::*;
use url::Url;
pub use connection_manager::spawn_connection_manager_task;
static ETCD_CLIENT: OnceCell<Client> = OnceCell::new();
static BROKER_CLIENT: OnceCell<BrokerClientChannel> = OnceCell::new();
///
/// Initialize the etcd client. This must be called once at page server startup.
/// Initialize the broker client. This must be called once at page server startup.
///
pub async fn init_etcd_client(conf: &'static PageServerConf) -> anyhow::Result<()> {
let etcd_endpoints = conf.broker_endpoints.clone();
ensure!(
!etcd_endpoints.is_empty(),
"Cannot start wal receiver: etcd endpoints are empty"
);
pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> {
let broker_endpoint = conf.broker_endpoint.clone();
let etcd_client = Client::connect(etcd_endpoints.clone(), None)
.await
.context("Failed to connect to etcd")?;
// Note: we do not attempt connecting here (but validate endpoints sanity).
let broker_client =
storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context(
format!(
"Failed to create broker client to {}",
&conf.broker_endpoint
),
)?;
// FIXME: Should we still allow the pageserver to start, if etcd
// doesn't work? It could still serve GetPage requests, with the
// data it has locally and from what it can download from remote
// storage
if ETCD_CLIENT.set(etcd_client).is_err() {
panic!("etcd already initialized");
if BROKER_CLIENT.set(broker_client).is_err() {
panic!("broker already initialized");
}
info!(
"Initialized etcd client with endpoints: {}",
etcd_endpoints.iter().map(Url::to_string).join(", ")
"Initialized broker client with endpoints: {}",
broker_endpoint
);
Ok(())
}
///
/// Get a handle to the etcd client
/// Get a handle to the broker client
///
pub fn get_etcd_client() -> &'static etcd_broker::Client {
ETCD_CLIENT.get().expect("etcd client not initialized")
pub fn get_broker_client() -> &'static BrokerClientChannel {
BROKER_CLIENT.get().expect("broker client not initialized")
}
pub fn is_etcd_client_initialized() -> bool {
ETCD_CLIENT.get().is_some()
pub fn is_broker_client_initialized() -> bool {
BROKER_CLIENT.get().is_some()
}
/// A handle of an asynchronous task.
@@ -134,15 +129,21 @@ impl<E: Clone> TaskHandle<E> {
match self.events_receiver.changed().await {
Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()),
Err(_task_channel_part_dropped) => {
TaskEvent::End(match self.join_handle.take() {
TaskEvent::End(match self.join_handle.as_mut() {
Some(jh) => {
if !jh.is_finished() {
warn!("sender is dropped while join handle is still alive");
}
jh.await
let res = jh
.await
.map_err(|e| anyhow::anyhow!("Failed to join task: {e}"))
.and_then(|x| x)
.and_then(|x| x);
// For cancellation-safety, drop join_handle only after successful .await.
self.join_handle = None;
res
}
None => {
// Another option is to have an enum, join handle or result and give away the reference to it

View File

@@ -1,21 +1,15 @@
//! WAL receiver logic that ensures the pageserver gets connectected to safekeeper,
//! that contains the latest WAL to stream and this connection does not go stale.
//!
//! To achieve that, a etcd broker is used: safekepers propagate their timelines' state in it,
//! To achieve that, a storage broker is used: safekepers propagate their timelines' state in it,
//! the manager subscribes for changes and accumulates those to query the one with the biggest Lsn for connection.
//! Current connection state is tracked too, to ensure it's not getting stale.
//!
//! After every connection or etcd update fetched, the state gets updated correspondingly and rechecked for the new conneciton leader,
//! After every connection or storage broker update fetched, the state gets updated correspondingly and rechecked for the new conneciton leader,
//! then a [re]connection happens, if necessary.
//! Only WAL streaming task expects to be finished, other loops (etcd, connection management) never exit unless cancelled explicitly via the dedicated channel.
//! Only WAL streaming task expects to be finished, other loops (storage broker, connection management) never exit unless cancelled explicitly via the dedicated channel.
use std::{
collections::{hash_map, HashMap},
num::NonZeroU64,
ops::ControlFlow,
sync::Arc,
time::Duration,
};
use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration};
use crate::task_mgr::TaskKind;
use crate::task_mgr::WALRECEIVER_RUNTIME;
@@ -23,16 +17,18 @@ use crate::tenant::Timeline;
use crate::{task_mgr, walreceiver::TaskStateUpdate};
use anyhow::Context;
use chrono::{NaiveDateTime, Utc};
use etcd_broker::{
subscription_key::SubscriptionKey, subscription_value::SkTimelineInfo, BrokerSubscription,
BrokerUpdate, Client,
};
use pageserver_api::models::TimelineState;
use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use storage_broker::BrokerClientChannel;
use storage_broker::Streaming;
use tokio::{select, sync::watch};
use tracing::*;
use crate::{
exponential_backoff, walreceiver::get_etcd_client, DEFAULT_BASE_BACKOFF_SECONDS,
exponential_backoff, walreceiver::get_broker_client, DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
};
use postgres_connection::{parse_host_port, PgConnectionConfig};
@@ -45,14 +41,13 @@ use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle};
/// Spawns the loop to take care of the timeline's WAL streaming connection.
pub fn spawn_connection_manager_task(
broker_loop_prefix: String,
timeline: Arc<Timeline>,
wal_connect_timeout: Duration,
lagging_wal_timeout: Duration,
max_lsn_wal_lag: NonZeroU64,
auth_token: Option<Arc<String>>,
) {
let mut etcd_client = get_etcd_client().clone();
let mut broker_client = get_broker_client().clone();
let tenant_id = timeline.tenant_id;
let timeline_id = timeline.timeline_id;
@@ -65,7 +60,7 @@ pub fn spawn_connection_manager_task(
&format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
false,
async move {
info!("WAL receiver broker started, connecting to etcd");
info!("WAL receiver manager started, connecting to broker");
let mut walreceiver_state = WalreceiverState::new(
timeline,
wal_connect_timeout,
@@ -81,8 +76,7 @@ pub fn spawn_connection_manager_task(
return Ok(());
},
loop_step_result = connection_manager_loop_step(
&broker_loop_prefix,
&mut etcd_client,
&mut broker_client,
&mut walreceiver_state,
) => match loop_step_result {
ControlFlow::Continue(()) => continue,
@@ -103,10 +97,9 @@ pub fn spawn_connection_manager_task(
/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
/// Based on the updates, desides whether to start, keep or stop a WAL receiver task.
/// If etcd subscription is cancelled, exits.
/// If storage broker subscription is cancelled, exits.
async fn connection_manager_loop_step(
broker_prefix: &str,
etcd_client: &mut Client,
broker_client: &mut BrokerClientChannel,
walreceiver_state: &mut WalreceiverState,
) -> ControlFlow<(), ()> {
let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates();
@@ -124,13 +117,11 @@ async fn connection_manager_loop_step(
timeline_id: walreceiver_state.timeline.timeline_id,
};
// XXX: We never explicitly cancel etcd task, instead establishing one and never letting it go,
// running the entire loop step as much as possible to an end.
// The task removal happens implicitly on drop, both aborting the etcd subscription task and dropping the receiver channel end,
// forcing the etcd subscription to exit either way.
let mut broker_subscription =
subscribe_for_timeline_updates(etcd_client, broker_prefix, id).await;
info!("Subscribed for etcd timeline changes, waiting for new etcd data");
// Subscribe to the broker updates. Stream shares underlying TCP connection
// with other streams on this client (other connection managers). When
// object goes out of scope, stream finishes in drop() automatically.
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await;
info!("Subscribed for broker timeline updates");
loop {
let time_until_next_retry = walreceiver_state.time_until_next_retry();
@@ -145,12 +136,6 @@ async fn connection_manager_loop_step(
// - this might change the current desired connection
// - timeline state changes to something that does not allow walreceiver to run concurrently
select! {
broker_connection_result = &mut broker_subscription.watcher_handle => {
info!("Broker connection was closed from the other side, ending current broker loop step");
cleanup_broker_connection(broker_connection_result, walreceiver_state);
return ControlFlow::Continue(());
},
Some(wal_connection_update) = async {
match walreceiver_state.wal_connection.as_mut() {
Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await),
@@ -160,21 +145,17 @@ async fn connection_manager_loop_step(
let wal_connection = walreceiver_state.wal_connection.as_mut()
.expect("Should have a connection, as checked by the corresponding select! guard");
match wal_connection_update {
TaskEvent::Update(c) => {
match c {
TaskStateUpdate::Init | TaskStateUpdate::Started => {},
TaskStateUpdate::Progress(status) => {
if status.has_processed_wal {
// We have advanced last_record_lsn by processing the WAL received
// from this safekeeper. This is good enough to clean unsuccessful
// retries history and allow reconnecting to this safekeeper without
// sleeping for a long time.
walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id);
}
wal_connection.status = status.to_owned();
}
TaskEvent::Update(TaskStateUpdate::Init | TaskStateUpdate::Started) => {},
TaskEvent::Update(TaskStateUpdate::Progress(new_status)) => {
if new_status.has_processed_wal {
// We have advanced last_record_lsn by processing the WAL received
// from this safekeeper. This is good enough to clean unsuccessful
// retries history and allow reconnecting to this safekeeper without
// sleeping for a long time.
walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id);
}
},
wal_connection.status = new_status;
}
TaskEvent::End(walreceiver_task_result) => {
match walreceiver_task_result {
Ok(()) => debug!("WAL receiving task finished"),
@@ -185,22 +166,16 @@ async fn connection_manager_loop_step(
}
},
// Got a new update from etcd
broker_update = broker_subscription.value_updates.recv() => {
// Got a new update from the broker
broker_update = broker_subscription.message() => {
match broker_update {
Some(broker_update) => walreceiver_state.register_timeline_update(broker_update),
None => {
info!("Broker sender end was dropped, ending current broker loop step");
// Ensure to cancel and wait for the broker subscription task end, to log its result.
// Broker sender end is in the broker subscription task and its drop means abnormal task completion.
// First, ensure that the task is stopped (abort can be done without errors on already stopped tasks and repeated multiple times).
broker_subscription.watcher_handle.abort();
// Then, wait for the task to finish and print its result. If the task was finished before abort (which we assume in this abnormal case),
// a proper error message will be printed, otherwise an abortion message is printed which is ok, since we're signalled to finish anyway.
cleanup_broker_connection(
(&mut broker_subscription.watcher_handle).await,
walreceiver_state,
);
Ok(Some(broker_update)) => walreceiver_state.register_timeline_update(broker_update),
Err(e) => {
error!("broker subscription failed: {e}");
return ControlFlow::Continue(());
}
Ok(None) => {
error!("broker subscription stream ended"); // can't happen
return ControlFlow::Continue(());
}
}
@@ -231,18 +206,18 @@ async fn connection_manager_loop_step(
}
},
_ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {}
}
// Fetch more etcd timeline updates, but limit ourselves since they may arrive quickly.
let mut max_events_to_poll = 100_u32;
while max_events_to_poll > 0 {
if let Ok(broker_update) = broker_subscription.value_updates.try_recv() {
walreceiver_state.register_timeline_update(broker_update);
max_events_to_poll -= 1;
} else {
break;
}
Some(()) = async {
match time_until_next_retry {
Some(sleep_time) => {
tokio::time::sleep(sleep_time).await;
Some(())
},
None => {
debug!("No candidates to retry, waiting indefinitely for the broker events");
None
}
}
} => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"),
}
if let Some(new_candidate) = walreceiver_state.next_connection_candidate() {
@@ -285,33 +260,11 @@ async fn wait_for_active_timeline(
}
}
fn cleanup_broker_connection(
broker_connection_result: Result<Result<(), etcd_broker::BrokerError>, tokio::task::JoinError>,
walreceiver_state: &mut WalreceiverState,
) {
match broker_connection_result {
Ok(Ok(())) => info!("Broker conneciton task finished, ending current broker loop step"),
Ok(Err(broker_error)) => warn!("Broker conneciton ended with error: {broker_error}"),
Err(abort_error) => {
if abort_error.is_panic() {
error!("Broker connection panicked: {abort_error}")
} else {
debug!("Broker connection aborted: {abort_error}")
}
}
}
walreceiver_state.wal_stream_candidates.clear();
}
/// Endlessly try to subscribe for broker updates for a given timeline.
/// If there are no safekeepers to maintain the lease, the timeline subscription will be unavailable in the broker and the operation will fail constantly.
/// This is ok, pageservers should anyway try subscribing (with some backoff) since it's the only way they can get the timeline WAL anyway.
async fn subscribe_for_timeline_updates(
etcd_client: &mut Client,
broker_prefix: &str,
broker_client: &mut BrokerClientChannel,
id: TenantTimelineId,
) -> BrokerSubscription<SkTimelineInfo> {
) -> Streaming<SafekeeperTimelineInfo> {
let mut attempt = 0;
loop {
exponential_backoff(
@@ -322,18 +275,21 @@ async fn subscribe_for_timeline_updates(
.await;
attempt += 1;
match etcd_broker::subscribe_for_json_values(
etcd_client,
SubscriptionKey::sk_timeline_info(broker_prefix.to_owned(), id),
)
.instrument(info_span!("etcd_subscription"))
.await
{
Ok(new_subscription) => {
return new_subscription;
// subscribe to the specific timeline
let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId {
tenant_id: id.tenant_id.as_ref().to_owned(),
timeline_id: id.timeline_id.as_ref().to_owned(),
});
let request = SubscribeSafekeeperInfoRequest {
subscription_key: Some(key),
};
match broker_client.subscribe_safekeeper_info(request).await {
Ok(resp) => {
return resp.into_inner();
}
Err(e) => {
warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in etcd: {e:#}");
warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}");
continue;
}
}
@@ -360,8 +316,8 @@ struct WalreceiverState {
wal_connection: Option<WalConnection>,
/// Info about retries and unsuccessful attempts to connect to safekeepers.
wal_connection_retries: HashMap<NodeId, RetryInfo>,
/// Data about all timelines, available for connection, fetched from etcd, grouped by their corresponding safekeeper node id.
wal_stream_candidates: HashMap<NodeId, EtcdSkTimeline>,
/// Data about all timelines, available for connection, fetched from storage broker, grouped by their corresponding safekeeper node id.
wal_stream_candidates: HashMap<NodeId, BrokerSkTimeline>,
auth_token: Option<Arc<String>>,
}
@@ -395,13 +351,11 @@ struct RetryInfo {
retry_duration_seconds: f64,
}
/// Data about the timeline to connect to, received from etcd.
/// Data about the timeline to connect to, received from the broker.
#[derive(Debug)]
struct EtcdSkTimeline {
timeline: SkTimelineInfo,
/// Etcd generation, the bigger it is, the more up to date the timeline data is.
etcd_version: i64,
/// Time at which the data was fetched from etcd last time, to track the stale data.
struct BrokerSkTimeline {
timeline: SafekeeperTimelineInfo,
/// Time at which the data was fetched from the broker last time, to track the stale data.
latest_update: NaiveDateTime,
}
@@ -453,7 +407,7 @@ impl WalreceiverState {
.await
.context("walreceiver connection handling failure")
}
.instrument(info_span!("walreceiver_connection", id = %id))
.instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id))
});
let now = Utc::now().naive_utc();
@@ -533,36 +487,28 @@ impl WalreceiverState {
.values()
.filter_map(|retry| retry.next_retry_at)
.filter(|next_retry_at| next_retry_at > &now)
.min();
.min()?;
next_retry_at.and_then(|next_retry_at| (next_retry_at - now).to_std().ok())
(next_retry_at - now).to_std().ok()
}
/// Adds another etcd timeline into the state, if its more recent than the one already added there for the same key.
fn register_timeline_update(&mut self, timeline_update: BrokerUpdate<SkTimelineInfo>) {
match self
.wal_stream_candidates
.entry(timeline_update.key.node_id)
{
hash_map::Entry::Occupied(mut o) => {
let existing_value = o.get_mut();
if existing_value.etcd_version < timeline_update.etcd_version {
existing_value.etcd_version = timeline_update.etcd_version;
existing_value.timeline = timeline_update.value;
existing_value.latest_update = Utc::now().naive_utc();
}
}
hash_map::Entry::Vacant(v) => {
v.insert(EtcdSkTimeline {
timeline: timeline_update.value,
etcd_version: timeline_update.etcd_version,
latest_update: Utc::now().naive_utc(),
});
}
/// Adds another broker timeline into the state, if its more recent than the one already added there for the same key.
fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) {
let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
let old_entry = self.wal_stream_candidates.insert(
new_safekeeper_id,
BrokerSkTimeline {
timeline: timeline_update,
latest_update: Utc::now().naive_utc(),
},
);
if old_entry.is_none() {
info!("New SK node was added: {new_safekeeper_id}");
}
}
/// Cleans up stale etcd records and checks the rest for the new connection candidate.
/// Cleans up stale broker records and checks the rest for the new connection candidate.
/// Returns a new candidate, if the current state is absent or somewhat lagging, `None` otherwise.
/// The current rules for approving new candidates:
/// * pick a candidate different from the connected safekeeper with biggest `commit_lsn` and lowest failed connection attemps
@@ -585,7 +531,7 @@ impl WalreceiverState {
Some(existing_wal_connection) => {
let connected_sk_node = existing_wal_connection.sk_id;
let (new_sk_id, new_safekeeper_etcd_data, new_wal_source_connconf) =
let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) =
self.select_connection_candidate(Some(connected_sk_node))?;
let now = Utc::now().naive_utc();
@@ -614,7 +560,7 @@ impl WalreceiverState {
}
if let Some(current_commit_lsn) = existing_wal_connection.status.commit_lsn {
let new_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0));
let new_commit_lsn = Lsn(new_safekeeper_broker_data.commit_lsn);
// Check if the new candidate has much more WAL than the current one.
match new_commit_lsn.0.checked_sub(current_commit_lsn.0) {
Some(new_sk_lsn_advantage) => {
@@ -644,7 +590,7 @@ impl WalreceiverState {
.status
.commit_lsn
.unwrap_or(current_lsn);
let candidate_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0));
let candidate_commit_lsn = Lsn(new_safekeeper_broker_data.commit_lsn);
// Keep discovered_new_wal only if connected safekeeper has not caught up yet.
let mut discovered_new_wal = existing_wal_connection
@@ -727,7 +673,7 @@ impl WalreceiverState {
None
}
/// Selects the best possible candidate, based on the data collected from etcd updates about the safekeepers.
/// Selects the best possible candidate, based on the data collected from the broker updates about the safekeepers.
/// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another.
///
/// The candidate that is chosen:
@@ -736,7 +682,7 @@ impl WalreceiverState {
fn select_connection_candidate(
&self,
node_to_omit: Option<NodeId>,
) -> Option<(NodeId, &SkTimelineInfo, PgConnectionConfig)> {
) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
self.applicable_connection_candidates()
.filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit)
.max_by_key(|(_, info, _)| info.commit_lsn)
@@ -746,12 +692,12 @@ impl WalreceiverState {
/// Some safekeepers are filtered by the retry cooldown.
fn applicable_connection_candidates(
&self,
) -> impl Iterator<Item = (NodeId, &SkTimelineInfo, PgConnectionConfig)> {
) -> impl Iterator<Item = (NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
let now = Utc::now().naive_utc();
self.wal_stream_candidates
.iter()
.filter(|(_, info)| info.timeline.commit_lsn.is_some())
.filter(|(_, info)| Lsn(info.timeline.commit_lsn) != Lsn::INVALID)
.filter(move |(sk_id, _)| {
let next_retry_at = self
.wal_connection_retries
@@ -761,12 +707,14 @@ impl WalreceiverState {
});
next_retry_at.is_none() || next_retry_at.unwrap() <= now
})
.filter_map(|(sk_id, etcd_info)| {
let info = &etcd_info.timeline;
}).filter_map(|(sk_id, broker_info)| {
let info = &broker_info.timeline;
if info.safekeeper_connstr.is_empty() {
return None; // no connection string, ignore sk
}
match wal_stream_connection_config(
self.id,
info.safekeeper_connstr.as_deref()?,
info.safekeeper_connstr.as_ref(),
match &self.auth_token {
None => None,
Some(x) => Some(x),
@@ -781,15 +729,16 @@ impl WalreceiverState {
})
}
/// Remove candidates which haven't sent etcd updates for a while.
/// Remove candidates which haven't sent broker updates for a while.
fn cleanup_old_candidates(&mut self) {
let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len());
let lagging_wal_timeout = self.lagging_wal_timeout;
self.wal_stream_candidates.retain(|node_id, etcd_info| {
if let Ok(time_since_latest_etcd_update) =
(Utc::now().naive_utc() - etcd_info.latest_update).to_std()
self.wal_stream_candidates.retain(|node_id, broker_info| {
if let Ok(time_since_latest_broker_update) =
(Utc::now().naive_utc() - broker_info.latest_update).to_std()
{
let should_retain = time_since_latest_etcd_update < self.lagging_wal_timeout;
let should_retain = time_since_latest_broker_update < lagging_wal_timeout;
if !should_retain {
node_ids_to_remove.push(*node_id);
}
@@ -799,8 +748,11 @@ impl WalreceiverState {
}
});
for node_id in node_ids_to_remove {
self.wal_connection_retries.remove(&node_id);
if !node_ids_to_remove.is_empty() {
for node_id in node_ids_to_remove {
info!("Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections");
self.wal_connection_retries.remove(&node_id);
}
}
}
@@ -853,7 +805,7 @@ fn wal_stream_connection_config(
auth_token: Option<&str>,
) -> anyhow::Result<PgConnectionConfig> {
let (host, port) =
parse_host_port(&listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
let port = port.unwrap_or(5432);
Ok(PgConnectionConfig::new_host_port(host, port)
.extend_options([
@@ -870,6 +822,28 @@ mod tests {
use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
use url::Host;
fn dummy_broker_sk_timeline(
commit_lsn: u64,
safekeeper_connstr: &str,
latest_update: NaiveDateTime,
) -> BrokerSkTimeline {
BrokerSkTimeline {
timeline: SafekeeperTimelineInfo {
safekeeper_id: 0,
tenant_timeline_id: None,
last_log_term: 0,
flush_lsn: 0,
commit_lsn,
backup_lsn: 0,
remote_consistent_lsn: 0,
peer_horizon_lsn: 0,
local_start_lsn: 0,
safekeeper_connstr: safekeeper_connstr.to_owned(),
},
latest_update,
}
}
#[tokio::test]
async fn no_connection_no_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("no_connection_no_candidate")?;
@@ -881,74 +855,16 @@ mod tests {
state.wal_connection = None;
state.wal_stream_candidates = HashMap::from([
(
NodeId(0),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(Lsn(1)),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: None,
},
etcd_version: 0,
latest_update: now,
},
),
(
NodeId(1),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: None,
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some("no_commit_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
},
),
(
NodeId(2),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: None,
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some("no_commit_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
},
),
(NodeId(0), dummy_broker_sk_timeline(1, "", now)),
(NodeId(1), dummy_broker_sk_timeline(0, "no_commit_lsn", now)),
(NodeId(2), dummy_broker_sk_timeline(0, "no_commit_lsn", now)),
(
NodeId(3),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: None,
},
etcd_version: 0,
latest_update: delay_over_threshold,
},
dummy_broker_sk_timeline(
1 + state.max_lsn_wal_lag.get(),
"delay_over_threshold",
delay_over_threshold,
),
),
]);
@@ -983,10 +899,10 @@ mod tests {
state.wal_connection = Some(WalConnection {
started_at: now,
sk_id: connected_sk_id,
status: connection_status.clone(),
status: connection_status,
connection_task: TaskHandle::spawn(move |sender, _| async move {
sender
.send(TaskStateUpdate::Progress(connection_status.clone()))
.send(TaskStateUpdate::Progress(connection_status))
.ok();
Ok(())
}),
@@ -995,57 +911,23 @@ mod tests {
state.wal_stream_candidates = HashMap::from([
(
connected_sk_id,
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(Lsn(current_lsn + state.max_lsn_wal_lag.get() * 2)),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
},
etcd_version: 0,
latest_update: now,
},
dummy_broker_sk_timeline(
current_lsn + state.max_lsn_wal_lag.get() * 2,
DUMMY_SAFEKEEPER_HOST,
now,
),
),
(
NodeId(1),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(Lsn(current_lsn)),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some("not_advanced_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
},
dummy_broker_sk_timeline(current_lsn, "not_advanced_lsn", now),
),
(
NodeId(2),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(Lsn(current_lsn + state.max_lsn_wal_lag.get() / 2)),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some("not_enough_advanced_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
},
dummy_broker_sk_timeline(
current_lsn + state.max_lsn_wal_lag.get() / 2,
"not_enough_advanced_lsn",
now,
),
),
]);
@@ -1067,21 +949,7 @@ mod tests {
state.wal_connection = None;
state.wal_stream_candidates = HashMap::from([(
NodeId(0),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
},
etcd_version: 0,
latest_update: now,
},
dummy_broker_sk_timeline(1 + state.max_lsn_wal_lag.get(), DUMMY_SAFEKEEPER_HOST, now),
)]);
let only_candidate = state
@@ -1102,57 +970,15 @@ mod tests {
state.wal_stream_candidates = HashMap::from([
(
NodeId(0),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(Lsn(selected_lsn - 100)),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some("smaller_commit_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
},
dummy_broker_sk_timeline(selected_lsn - 100, "smaller_commit_lsn", now),
),
(
NodeId(1),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(Lsn(selected_lsn)),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
},
etcd_version: 0,
latest_update: now,
},
dummy_broker_sk_timeline(selected_lsn, DUMMY_SAFEKEEPER_HOST, now),
),
(
NodeId(2),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(Lsn(selected_lsn + 100)),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: None,
},
etcd_version: 0,
latest_update: now,
},
dummy_broker_sk_timeline(selected_lsn + 100, "", now),
),
]);
let biggest_wal_candidate = state.next_connection_candidate().expect(
@@ -1186,39 +1012,11 @@ mod tests {
state.wal_stream_candidates = HashMap::from([
(
NodeId(0),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(bigger_lsn),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
},
etcd_version: 0,
latest_update: now,
},
dummy_broker_sk_timeline(bigger_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
),
(
NodeId(1),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(current_lsn),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
},
etcd_version: 0,
latest_update: now,
},
dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
),
]);
state.wal_connection_retries = HashMap::from([(
@@ -1263,10 +1061,10 @@ mod tests {
state.wal_connection = Some(WalConnection {
started_at: now,
sk_id: connected_sk_id,
status: connection_status.clone(),
status: connection_status,
connection_task: TaskHandle::spawn(move |sender, _| async move {
sender
.send(TaskStateUpdate::Progress(connection_status.clone()))
.send(TaskStateUpdate::Progress(connection_status))
.ok();
Ok(())
}),
@@ -1275,39 +1073,11 @@ mod tests {
state.wal_stream_candidates = HashMap::from([
(
connected_sk_id,
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(current_lsn),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
},
etcd_version: 0,
latest_update: now,
},
dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
),
(
NodeId(1),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(new_lsn),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some("advanced_by_lsn_safekeeper".to_string()),
},
etcd_version: 0,
latest_update: now,
},
dummy_broker_sk_timeline(new_lsn.0, "advanced_by_lsn_safekeeper", now),
),
]);
@@ -1356,10 +1126,10 @@ mod tests {
state.wal_connection = Some(WalConnection {
started_at: now,
sk_id: NodeId(1),
status: connection_status.clone(),
status: connection_status,
connection_task: TaskHandle::spawn(move |sender, _| async move {
sender
.send(TaskStateUpdate::Progress(connection_status.clone()))
.send(TaskStateUpdate::Progress(connection_status))
.ok();
Ok(())
}),
@@ -1367,21 +1137,7 @@ mod tests {
});
state.wal_stream_candidates = HashMap::from([(
NodeId(0),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(current_lsn),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
},
etcd_version: 0,
latest_update: now,
},
dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
)]);
let over_threshcurrent_candidate = state.next_connection_candidate().expect(
@@ -1441,21 +1197,7 @@ mod tests {
});
state.wal_stream_candidates = HashMap::from([(
NodeId(0),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(new_lsn),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
},
etcd_version: 0,
latest_update: now,
},
dummy_broker_sk_timeline(new_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
)]);
let over_threshcurrent_candidate = state.next_connection_candidate().expect(

View File

@@ -20,7 +20,9 @@ use tokio::{pin, select, sync::watch, time};
use tokio_postgres::{replication::ReplicationStream, Client};
use tracing::{debug, error, info, trace, warn};
use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate};
use crate::{
metrics::LIVE_CONNECTIONS_COUNT, tenant::with_ondemand_download, walreceiver::TaskStateUpdate,
};
use crate::{
task_mgr,
task_mgr::TaskKind,
@@ -35,7 +37,7 @@ use pq_proto::ReplicationFeedback;
use utils::lsn::Lsn;
/// Status of the connection.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Copy)]
pub struct WalConnectionStatus {
/// If we were able to initiate a postgres connection, this means that safekeeper process is at least running.
pub is_connected: bool,
@@ -83,7 +85,7 @@ pub async fn handle_walreceiver_connection(
streaming_lsn: None,
commit_lsn: None,
};
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}");
return Ok(());
}
@@ -135,7 +137,7 @@ pub async fn handle_walreceiver_connection(
connection_status.latest_connection_update = Utc::now().naive_utc();
connection_status.latest_wal_update = Utc::now().naive_utc();
connection_status.commit_lsn = Some(end_of_wal);
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}");
return Ok(());
}
@@ -173,7 +175,8 @@ pub async fn handle_walreceiver_connection(
let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint)?;
let mut walingest =
with_ondemand_download(|| WalIngest::new(timeline.as_ref(), startpoint)).await?;
while let Some(replication_message) = {
select! {
@@ -184,7 +187,20 @@ pub async fn handle_walreceiver_connection(
replication_message = physical_stream.next() => replication_message,
}
} {
let replication_message = replication_message?;
let replication_message = match replication_message {
Ok(message) => message,
Err(replication_error) => {
if replication_error.is_closed() {
info!("Replication stream got closed");
return Ok(());
} else {
return Err(
anyhow::Error::new(replication_error).context("replication stream error")
);
}
}
};
let now = Utc::now().naive_utc();
let last_rec_lsn_before_msg = last_rec_lsn;
@@ -207,7 +223,7 @@ pub async fn handle_walreceiver_connection(
}
&_ => {}
};
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
warn!("Wal connection event listener dropped, aborting the connection: {e}");
return Ok(());
}
@@ -235,9 +251,16 @@ pub async fn handle_walreceiver_connection(
// at risk of hitting a deadlock.
ensure!(lsn.is_aligned());
walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded)
.context("could not ingest record at {lsn}")?;
with_ondemand_download(|| {
walingest.ingest_record(
recdata.clone(),
lsn,
&mut modification,
&mut decoded,
)
})
.await
.with_context(|| format!("could not ingest record at {lsn}"))?;
fail_point!("walreceiver-after-ingest");
@@ -273,8 +296,7 @@ pub async fn handle_walreceiver_connection(
if !connection_status.has_processed_wal && last_rec_lsn > last_rec_lsn_before_msg {
// We have successfully processed at least one WAL record.
connection_status.has_processed_wal = true;
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone()))
{
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
warn!("Wal connection event listener dropped, aborting the connection: {e}");
return Ok(());
}
@@ -313,10 +335,11 @@ pub async fn handle_walreceiver_connection(
// Send the replication feedback message.
// Regular standby_status_update fields are put into this message.
let (timeline_logical_size, _) = timeline
.get_current_logical_size()
.context("Status update creation failed to get current logical size")?;
let status_update = ReplicationFeedback {
current_timeline_size: timeline
.get_current_logical_size()
.context("Status update creation failed to get current logical size")?,
current_timeline_size: timeline_logical_size,
ps_writelsn: write_lsn,
ps_flushlsn: flush_lsn,
ps_applylsn: apply_lsn,

View File

@@ -1,6 +1,7 @@
//!
//! Functions for parsing WAL records.
//!
use anyhow::Result;
use bytes::{Buf, Bytes};
use postgres_ffi::pg_constants;

View File

@@ -84,7 +84,7 @@ pub trait WalRedoManager: Send + Sync {
&self,
key: Key,
lsn: Lsn,
base_img: Option<Bytes>,
base_img: Option<(Lsn, Bytes)>,
records: Vec<(Lsn, NeonWalRecord)>,
pg_version: u32,
) -> Result<Bytes, WalRedoError>;
@@ -147,7 +147,7 @@ impl WalRedoManager for PostgresRedoManager {
&self,
key: Key,
lsn: Lsn,
base_img: Option<Bytes>,
base_img: Option<(Lsn, Bytes)>,
records: Vec<(Lsn, NeonWalRecord)>,
pg_version: u32,
) -> Result<Bytes, WalRedoError> {
@@ -156,7 +156,8 @@ impl WalRedoManager for PostgresRedoManager {
return Err(WalRedoError::InvalidRequest);
}
let mut img: Option<Bytes> = base_img;
let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID);
let mut img = base_img.map(|p| p.1);
let mut batch_neon = can_apply_in_neon(&records[0].1);
let mut batch_start = 0;
for i in 1..records.len() {
@@ -170,6 +171,7 @@ impl WalRedoManager for PostgresRedoManager {
key,
lsn,
img,
base_img_lsn,
&records[batch_start..i],
self.conf.wal_redo_timeout,
pg_version,
@@ -189,6 +191,7 @@ impl WalRedoManager for PostgresRedoManager {
key,
lsn,
img,
base_img_lsn,
&records[batch_start..],
self.conf.wal_redo_timeout,
pg_version,
@@ -223,11 +226,13 @@ impl PostgresRedoManager {
///
/// Process one request for WAL redo using wal-redo postgres
///
#[allow(clippy::too_many_arguments)]
fn apply_batch_postgres(
&self,
key: Key,
lsn: Lsn,
base_img: Option<Bytes>,
base_img_lsn: Lsn,
records: &[(Lsn, NeonWalRecord)],
wal_redo_timeout: Duration,
pg_version: u32,
@@ -282,9 +287,12 @@ impl PostgresRedoManager {
// next request will launch a new one.
if result.is_err() {
error!(
"error applying {} WAL records ({} bytes) to reconstruct page image at LSN {}",
"error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}",
records.len(),
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
records.last().map(|p| p.0).unwrap_or(Lsn(0)),
nbytes,
base_img_lsn,
lsn
);
let process = process_guard.take().unwrap();
@@ -401,7 +409,7 @@ impl PostgresRedoManager {
key
);
for &xid in xids {
let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -451,7 +459,7 @@ impl PostgresRedoManager {
key
);
for &xid in xids {
let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -639,7 +647,7 @@ impl PostgresRedoProcess {
info!("running initdb in {}", datadir.display());
let initdb = Command::new(pg_bin_dir_path.join("initdb"))
.args(&["-D", &datadir.to_string_lossy()])
.args(["-D", &datadir.to_string_lossy()])
.arg("-N")
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
@@ -922,8 +930,7 @@ impl NoLeakChild {
match child.wait() {
Ok(exit_status) => {
// log at error level since .kill() is something we only do on errors ATM
error!(exit_status = %exit_status, "wait successful");
info!(exit_status = %exit_status, "wait successful");
}
Err(e) => {
error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");