mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-28 18:40:38 +00:00
Merge branch 'main' into immutable_bst_layer_map
This commit is contained in:
@@ -22,7 +22,8 @@ use std::time::SystemTime;
|
||||
use tar::{Builder, EntryType, Header};
|
||||
use tracing::*;
|
||||
|
||||
use crate::tenant::Timeline;
|
||||
use crate::task_mgr;
|
||||
use crate::tenant::{with_ondemand_download, PageReconstructResult, Timeline};
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
|
||||
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
|
||||
@@ -130,7 +131,7 @@ where
|
||||
|
||||
// Create pgdata subdirs structure
|
||||
for dir in PGDATA_SUBDIRS.iter() {
|
||||
let header = new_tar_header_dir(*dir)?;
|
||||
let header = new_tar_header_dir(dir)?;
|
||||
self.ar.append(&header, &mut io::empty())?;
|
||||
}
|
||||
|
||||
@@ -152,23 +153,29 @@ where
|
||||
SlruKind::MultiXactOffsets,
|
||||
SlruKind::MultiXactMembers,
|
||||
] {
|
||||
for segno in self.timeline.list_slru_segments(kind, self.lsn)? {
|
||||
for segno in
|
||||
with_ondemand_download_sync(|| self.timeline.list_slru_segments(kind, self.lsn))?
|
||||
{
|
||||
self.add_slru_segment(kind, segno)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Create tablespace directories
|
||||
for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? {
|
||||
for ((spcnode, dbnode), has_relmap_file) in
|
||||
with_ondemand_download_sync(|| self.timeline.list_dbdirs(self.lsn))?
|
||||
{
|
||||
self.add_dbdir(spcnode, dbnode, has_relmap_file)?;
|
||||
|
||||
// Gather and send relational files in each database if full backup is requested.
|
||||
if self.full_backup {
|
||||
for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn)? {
|
||||
for rel in with_ondemand_download_sync(|| {
|
||||
self.timeline.list_rels(spcnode, dbnode, self.lsn)
|
||||
})? {
|
||||
self.add_rel(rel)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
for xid in self.timeline.list_twophase_files(self.lsn)? {
|
||||
for xid in with_ondemand_download_sync(|| self.timeline.list_twophase_files(self.lsn))? {
|
||||
self.add_twophase_file(xid)?;
|
||||
}
|
||||
|
||||
@@ -185,7 +192,8 @@ where
|
||||
}
|
||||
|
||||
fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
|
||||
let nblocks = self.timeline.get_rel_size(tag, self.lsn, false)?;
|
||||
let nblocks =
|
||||
with_ondemand_download_sync(|| self.timeline.get_rel_size(tag, self.lsn, false))?;
|
||||
|
||||
// Function that adds relation segment data to archive
|
||||
let mut add_file = |segment_index, data: &Vec<u8>| -> anyhow::Result<()> {
|
||||
@@ -208,7 +216,8 @@ where
|
||||
for blknum in blocks {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_rel_page_at_lsn(tag, blknum, self.lsn, false)?;
|
||||
.get_rel_page_at_lsn(tag, blknum, self.lsn, false)
|
||||
.no_ondemand_download()?;
|
||||
segment_data.extend_from_slice(&img[..]);
|
||||
}
|
||||
|
||||
@@ -222,13 +231,16 @@ where
|
||||
// Generate SLRU segment files from repository.
|
||||
//
|
||||
fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
|
||||
let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?;
|
||||
let nblocks = with_ondemand_download_sync(|| {
|
||||
self.timeline.get_slru_segment_size(slru, segno, self.lsn)
|
||||
})?;
|
||||
|
||||
let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
|
||||
for blknum in 0..nblocks {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?;
|
||||
let img = with_ondemand_download_sync(|| {
|
||||
self.timeline
|
||||
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn)
|
||||
})?;
|
||||
|
||||
if slru == SlruKind::Clog {
|
||||
ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
|
||||
@@ -260,7 +272,9 @@ where
|
||||
has_relmap_file: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let relmap_img = if has_relmap_file {
|
||||
let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?;
|
||||
let img = with_ondemand_download_sync(|| {
|
||||
self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)
|
||||
})?;
|
||||
ensure!(img.len() == 512);
|
||||
Some(img)
|
||||
} else {
|
||||
@@ -295,7 +309,8 @@ where
|
||||
if !has_relmap_file
|
||||
&& self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, self.lsn)?
|
||||
.list_rels(spcnode, dbnode, self.lsn)
|
||||
.no_ondemand_download()?
|
||||
.is_empty()
|
||||
{
|
||||
return Ok(());
|
||||
@@ -327,7 +342,7 @@ where
|
||||
// Extract twophase state files
|
||||
//
|
||||
fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
|
||||
let img = self.timeline.get_twophase_file(xid, self.lsn)?;
|
||||
let img = with_ondemand_download_sync(|| self.timeline.get_twophase_file(xid, self.lsn))?;
|
||||
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(&img[..]);
|
||||
@@ -361,14 +376,12 @@ where
|
||||
zenith_signal.as_bytes(),
|
||||
)?;
|
||||
|
||||
let checkpoint_bytes = self
|
||||
.timeline
|
||||
.get_checkpoint(self.lsn)
|
||||
.context("failed to get checkpoint bytes")?;
|
||||
let pg_control_bytes = self
|
||||
.timeline
|
||||
.get_control_file(self.lsn)
|
||||
.context("failed get control bytes")?;
|
||||
let checkpoint_bytes =
|
||||
with_ondemand_download_sync(|| self.timeline.get_checkpoint(self.lsn))
|
||||
.context("failed to get checkpoint bytes")?;
|
||||
let pg_control_bytes =
|
||||
with_ondemand_download_sync(|| self.timeline.get_control_file(self.lsn))
|
||||
.context("failed get control bytes")?;
|
||||
|
||||
let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control(
|
||||
&pg_control_bytes,
|
||||
@@ -490,3 +503,11 @@ where
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn with_ondemand_download_sync<F, T>(f: F) -> anyhow::Result<T>
|
||||
where
|
||||
F: Send + Fn() -> PageReconstructResult<T>,
|
||||
T: Send,
|
||||
{
|
||||
task_mgr::COMPUTE_REQUEST_RUNTIME.block_on(with_ondemand_download(f))
|
||||
}
|
||||
|
||||
@@ -11,8 +11,8 @@
|
||||
//!
|
||||
//! Example use:
|
||||
//! ```
|
||||
//! $ cd test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE
|
||||
//! $ ls | grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
|
||||
//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
|
||||
//! $ grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
|
||||
//! $ firefox out.svg
|
||||
//! ```
|
||||
//!
|
||||
@@ -25,6 +25,8 @@ use anyhow::Result;
|
||||
use pageserver::repository::Key;
|
||||
use std::cmp::Ordering;
|
||||
use std::io::{self, BufRead};
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::{
|
||||
collections::{BTreeMap, BTreeSet},
|
||||
ops::Range,
|
||||
@@ -65,7 +67,11 @@ fn main() -> Result<()> {
|
||||
let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
|
||||
let stdin = io::stdin();
|
||||
for line in stdin.lock().lines() {
|
||||
let range = parse_filename(&line.unwrap());
|
||||
let line = line.unwrap();
|
||||
let line = PathBuf::from_str(&line).unwrap();
|
||||
let filename = line.file_name().unwrap();
|
||||
let filename = filename.to_str().unwrap();
|
||||
let range = parse_filename(filename);
|
||||
ranges.push(range);
|
||||
}
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr};
|
||||
use anyhow::{anyhow, Context};
|
||||
use clap::{Arg, ArgAction, Command};
|
||||
use fail::FailScenario;
|
||||
use nix::unistd::Pid;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tracing::*;
|
||||
|
||||
use metrics::set_build_info_metric;
|
||||
@@ -18,14 +18,15 @@ use pageserver::{
|
||||
task_mgr::{
|
||||
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
|
||||
},
|
||||
tenant_mgr, virtual_file,
|
||||
tenant::mgr,
|
||||
virtual_file,
|
||||
};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
lock_file, logging,
|
||||
logging,
|
||||
postgres_backend::AuthType,
|
||||
project_git_version,
|
||||
sentry_init::{init_sentry, release_name},
|
||||
signals::{self, Signal},
|
||||
tcp_listener,
|
||||
};
|
||||
@@ -85,6 +86,9 @@ fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
};
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.id.to_string())]);
|
||||
|
||||
let tenants_path = conf.tenants_path();
|
||||
if !tenants_path.exists() {
|
||||
utils::crashsafe::create_dir_all(conf.tenants_path()).with_context(|| {
|
||||
@@ -124,7 +128,7 @@ fn initialize_config(
|
||||
);
|
||||
}
|
||||
// Supplement the CLI arguments with the config file
|
||||
let cfg_file_contents = std::fs::read_to_string(&cfg_file_path).with_context(|| {
|
||||
let cfg_file_contents = std::fs::read_to_string(cfg_file_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to read pageserver config at '{}'",
|
||||
cfg_file_path.display()
|
||||
@@ -178,7 +182,7 @@ fn initialize_config(
|
||||
if update_config {
|
||||
info!("Writing pageserver config to '{}'", cfg_file_path.display());
|
||||
|
||||
std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| {
|
||||
std::fs::write(cfg_file_path, toml.to_string()).with_context(|| {
|
||||
format!(
|
||||
"Failed to write pageserver config to '{}'",
|
||||
cfg_file_path.display()
|
||||
@@ -198,8 +202,12 @@ fn initialize_config(
|
||||
}
|
||||
|
||||
fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
// Initialize logging
|
||||
logging::init(conf.log_format)?;
|
||||
|
||||
// Print version to the log, and expose it as a prometheus metric too.
|
||||
info!("version: {}", version());
|
||||
set_build_info_metric(GIT_VERSION);
|
||||
|
||||
// If any failpoints were set from FAILPOINTS environment variable,
|
||||
// print them to the log for debugging purposes
|
||||
@@ -215,53 +223,37 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
)
|
||||
}
|
||||
|
||||
// Create and lock PID file. This ensures that there cannot be more than one
|
||||
// pageserver process running at the same time.
|
||||
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
|
||||
let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
|
||||
lock_file::LockCreationResult::Created {
|
||||
new_lock_contents,
|
||||
file,
|
||||
} => {
|
||||
info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}");
|
||||
file
|
||||
}
|
||||
lock_file::LockCreationResult::AlreadyLocked {
|
||||
existing_lock_contents,
|
||||
} => anyhow::bail!(
|
||||
"Could not lock pid file; pageserver is already running in {:?} with PID {}",
|
||||
conf.workdir,
|
||||
existing_lock_contents
|
||||
),
|
||||
lock_file::LockCreationResult::CreationFailed(e) => {
|
||||
return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
|
||||
}
|
||||
};
|
||||
// ensure that the lock file is held even if the main thread of the process is panics
|
||||
// we need to release the lock file only when the current process is gone
|
||||
let _ = Box::leak(Box::new(lock_file));
|
||||
let lock_file =
|
||||
utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
|
||||
info!("Claimed pid file at {lock_file_path:?}");
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
// Ensure that the lock file is held even if the main thread of the process panics.
|
||||
// We need to release the lock file only when the process exits.
|
||||
std::mem::forget(lock_file);
|
||||
|
||||
// bind sockets before daemonizing so we report errors early and do not return until we are listening
|
||||
info!(
|
||||
"Starting pageserver http handler on {}",
|
||||
conf.listen_http_addr
|
||||
);
|
||||
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone())?;
|
||||
// Bind the HTTP and libpq ports early, so that if they are in use by some other
|
||||
// process, we error out early.
|
||||
let http_addr = &conf.listen_http_addr;
|
||||
info!("Starting pageserver http handler on {http_addr}");
|
||||
let http_listener = tcp_listener::bind(http_addr)?;
|
||||
|
||||
info!(
|
||||
"Starting pageserver pg protocol handler on {}",
|
||||
conf.listen_pg_addr
|
||||
);
|
||||
let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
|
||||
let pg_addr = &conf.listen_pg_addr;
|
||||
info!("Starting pageserver pg protocol handler on {pg_addr}");
|
||||
let pageserver_listener = tcp_listener::bind(pg_addr)?;
|
||||
|
||||
// Install signal handlers
|
||||
let signals = signals::install_shutdown_handlers()?;
|
||||
|
||||
// start profiler (if enabled)
|
||||
// Start profiler (if enabled)
|
||||
let profiler_guard = profiling::init_profiler(conf);
|
||||
|
||||
WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_etcd_client(conf))?;
|
||||
// Launch broker client
|
||||
WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?;
|
||||
|
||||
// initialize authentication for incoming connections
|
||||
// Initialize authentication for incoming connections
|
||||
let auth = match &conf.auth_type {
|
||||
AuthType::Trust | AuthType::MD5 => None,
|
||||
AuthType::NeonJWT => {
|
||||
@@ -272,46 +264,54 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
};
|
||||
info!("Using auth: {:#?}", conf.auth_type);
|
||||
|
||||
match var("ZENITH_AUTH_TOKEN") {
|
||||
Ok(v) => {
|
||||
// TODO: remove ZENITH_AUTH_TOKEN once it's not used anywhere in development/staging/prod configuration.
|
||||
match (var("ZENITH_AUTH_TOKEN"), var("NEON_AUTH_TOKEN")) {
|
||||
(old, Ok(v)) => {
|
||||
info!("Loaded JWT token for authentication with Safekeeper");
|
||||
if let Ok(v_old) = old {
|
||||
warn!(
|
||||
"JWT token for Safekeeper is specified twice, ZENITH_AUTH_TOKEN is deprecated"
|
||||
);
|
||||
if v_old != v {
|
||||
warn!("JWT token for Safekeeper has two different values, choosing NEON_AUTH_TOKEN");
|
||||
}
|
||||
}
|
||||
pageserver::config::SAFEKEEPER_AUTH_TOKEN
|
||||
.set(Arc::new(v))
|
||||
.map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
|
||||
}
|
||||
Err(VarError::NotPresent) => {
|
||||
(Ok(v), _) => {
|
||||
info!("Loaded JWT token for authentication with Safekeeper");
|
||||
warn!("Please update pageserver configuration: the JWT token should be NEON_AUTH_TOKEN, not ZENITH_AUTH_TOKEN");
|
||||
pageserver::config::SAFEKEEPER_AUTH_TOKEN
|
||||
.set(Arc::new(v))
|
||||
.map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
|
||||
}
|
||||
(_, Err(VarError::NotPresent)) => {
|
||||
info!("No JWT token for authentication with Safekeeper detected");
|
||||
}
|
||||
Err(e) => {
|
||||
(_, Err(e)) => {
|
||||
return Err(e).with_context(|| {
|
||||
"Failed to either load to detect non-present ZENITH_AUTH_TOKEN environment variable"
|
||||
"Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable"
|
||||
})
|
||||
}
|
||||
};
|
||||
|
||||
let remote_storage = conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.map(|storage_config| {
|
||||
GenericRemoteStorage::from_config(conf.workdir.clone(), storage_config)
|
||||
})
|
||||
.transpose()
|
||||
.context("Failed to init generic remote storage")?;
|
||||
{
|
||||
let _rt_guard = BACKGROUND_RUNTIME.enter();
|
||||
tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())?
|
||||
};
|
||||
// Set up remote storage client
|
||||
let remote_storage = create_remote_storage_client(conf)?;
|
||||
|
||||
// Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME.
|
||||
// bind before launching separate thread so the error reported before startup exits
|
||||
// Scan the local 'tenants/' directory and start loading the tenants
|
||||
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?;
|
||||
|
||||
// Create a Service from the router above to handle incoming requests.
|
||||
// Start up the service to handle HTTP mgmt API request. We created the
|
||||
// listener earlier already.
|
||||
{
|
||||
let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
|
||||
|
||||
let router = http::make_router(conf, auth.clone(), remote_storage)?;
|
||||
let service =
|
||||
utils::http::RouterService::new(router.build().map_err(|err| anyhow!(err))?).unwrap();
|
||||
let router = http::make_router(conf, auth.clone(), remote_storage)?
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let service = utils::http::RouterService::new(router).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?
|
||||
.serve(service)
|
||||
.with_graceful_shutdown(task_mgr::shutdown_watcher());
|
||||
@@ -328,10 +328,31 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
|
||||
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
|
||||
task_mgr::spawn(
|
||||
MGMT_REQUEST_RUNTIME.handle(),
|
||||
TaskKind::MetricsCollection,
|
||||
None,
|
||||
None,
|
||||
"consumption metrics collection",
|
||||
true,
|
||||
async move {
|
||||
pageserver::consumption_metrics::collect_metrics(
|
||||
metric_collection_endpoint,
|
||||
conf.metric_collection_interval,
|
||||
conf.id,
|
||||
)
|
||||
.instrument(info_span!("metrics_collection"))
|
||||
.await?;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Spawn a task to listen for libpq connections. It will spawn further tasks
|
||||
// for each connection.
|
||||
// for each connection. We created the listener earlier already.
|
||||
task_mgr::spawn(
|
||||
COMPUTE_REQUEST_RUNTIME.handle(),
|
||||
TaskKind::LibpqEndpointListener,
|
||||
@@ -344,8 +365,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
},
|
||||
);
|
||||
|
||||
set_build_info_metric(GIT_VERSION);
|
||||
|
||||
// All started up! Now just sit and wait for shutdown signal.
|
||||
signals.handle(|signal| match signal {
|
||||
Signal::Quit => {
|
||||
@@ -369,6 +388,36 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
})
|
||||
}
|
||||
|
||||
fn create_remote_storage_client(
|
||||
conf: &'static PageServerConf,
|
||||
) -> anyhow::Result<Option<GenericRemoteStorage>> {
|
||||
let config = if let Some(config) = &conf.remote_storage_config {
|
||||
config
|
||||
} else {
|
||||
// No remote storage configured.
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
// Create the client
|
||||
let mut remote_storage = GenericRemoteStorage::from_config(config)?;
|
||||
|
||||
// If `test_remote_failures` is non-zero, wrap the client with a
|
||||
// wrapper that simulates failures.
|
||||
if conf.test_remote_failures > 0 {
|
||||
if !cfg!(feature = "testing") {
|
||||
anyhow::bail!("test_remote_failures option is not available because pageserver was compiled without the 'testing' feature");
|
||||
}
|
||||
info!(
|
||||
"Simulating remote failures for first {} attempts of each op",
|
||||
conf.test_remote_failures
|
||||
);
|
||||
remote_storage =
|
||||
GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
|
||||
}
|
||||
|
||||
Ok(Some(remote_storage))
|
||||
}
|
||||
|
||||
fn cli() -> Command {
|
||||
Command::new("Neon page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
|
||||
@@ -60,7 +60,7 @@ fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
|
||||
let control_file = ControlFileData::decode(&std::fs::read(&control_file_path)?)?;
|
||||
let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
|
||||
println!("{control_file:?}");
|
||||
let control_file_initdb = Lsn(control_file.checkPoint);
|
||||
println!(
|
||||
@@ -79,7 +79,7 @@ fn print_layerfile(path: &Path) -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> {
|
||||
let metadata_bytes = std::fs::read(&path)?;
|
||||
let metadata_bytes = std::fs::read(path)?;
|
||||
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
|
||||
println!("Current metadata:\n{meta:?}");
|
||||
let mut update_meta = false;
|
||||
@@ -110,7 +110,7 @@ fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), an
|
||||
|
||||
if update_meta {
|
||||
let metadata_bytes = meta.to_bytes()?;
|
||||
std::fs::write(&path, &metadata_bytes)?;
|
||||
std::fs::write(path, metadata_bytes)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -5,12 +5,14 @@
|
||||
//! See also `settings.md` for better description on every parameter.
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||
use std::env;
|
||||
use storage_broker::Uri;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::ConnectionId;
|
||||
|
||||
use once_cell::sync::OnceCell;
|
||||
use reqwest::Url;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::str::FromStr;
|
||||
@@ -18,25 +20,29 @@ use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use toml_edit;
|
||||
use toml_edit::{Document, Item};
|
||||
use url::Url;
|
||||
|
||||
use utils::{
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
logging::LogFormat,
|
||||
postgres_backend::AuthType,
|
||||
};
|
||||
|
||||
use crate::tenant::config::TenantConf;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
|
||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||
use crate::{METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX};
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
};
|
||||
|
||||
pub mod defaults {
|
||||
use crate::tenant_config::defaults::*;
|
||||
use crate::tenant::config::defaults::*;
|
||||
use const_format::formatcp;
|
||||
|
||||
pub use pageserver_api::{
|
||||
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
|
||||
DEFAULT_PG_LISTEN_PORT,
|
||||
};
|
||||
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
|
||||
|
||||
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
|
||||
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
|
||||
@@ -51,13 +57,14 @@ pub mod defaults {
|
||||
pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
|
||||
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
|
||||
|
||||
pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
|
||||
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
pub const DEFAULT_CONFIG_FILE: &str = formatcp!(
|
||||
r###"
|
||||
# Initial configuration file created by 'pageserver --init'
|
||||
|
||||
#listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}'
|
||||
#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}'
|
||||
|
||||
@@ -69,10 +76,14 @@ pub mod defaults {
|
||||
# initial superuser role name to use when creating a new tenant
|
||||
#initial_superuser_name = '{DEFAULT_SUPERUSER}'
|
||||
|
||||
#broker_endpoint = '{BROKER_DEFAULT_ENDPOINT}'
|
||||
|
||||
#log_format = '{DEFAULT_LOG_FORMAT}'
|
||||
|
||||
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
|
||||
|
||||
#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
|
||||
|
||||
# [tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -130,24 +141,26 @@ pub struct PageServerConf {
|
||||
pub profiling: ProfilingConfig,
|
||||
pub default_tenant_conf: TenantConf,
|
||||
|
||||
/// A prefix to add in etcd brokers before every key.
|
||||
/// Can be used for isolating different pageserver groups within the same etcd cluster.
|
||||
pub broker_etcd_prefix: String,
|
||||
|
||||
/// Etcd broker endpoints to connect to.
|
||||
pub broker_endpoints: Vec<Url>,
|
||||
/// Storage broker endpoints to connect to.
|
||||
pub broker_endpoint: Uri,
|
||||
pub broker_keepalive_interval: Duration,
|
||||
|
||||
pub log_format: LogFormat,
|
||||
|
||||
/// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
|
||||
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
|
||||
|
||||
// How often to collect metrics and send them to the metrics endpoint.
|
||||
pub metric_collection_interval: Duration,
|
||||
pub metric_collection_endpoint: Option<Url>,
|
||||
|
||||
pub test_remote_failures: u64,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
/// and/or serialized at a whim, while the token is secret. Currently this token is the
|
||||
/// same for accessing all tenants/timelines, but may become per-tenant/per-timeline in
|
||||
/// the future, more tokens and auth may arrive for etcd and/or its rewrite (see
|
||||
/// https://github.com/neondatabase/neon/issues/2394), completely changing the logic.
|
||||
/// the future, more tokens and auth may arrive for storage broker, completely changing the logic.
|
||||
/// Hence, we resort to a global variable for now instead of passing the token from the
|
||||
/// startup code to the connection code through a dozen layers.
|
||||
pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();
|
||||
@@ -214,12 +227,17 @@ struct PageServerConfigBuilder {
|
||||
id: BuilderValue<NodeId>,
|
||||
|
||||
profiling: BuilderValue<ProfilingConfig>,
|
||||
broker_etcd_prefix: BuilderValue<String>,
|
||||
broker_endpoints: BuilderValue<Vec<Url>>,
|
||||
broker_endpoint: BuilderValue<Uri>,
|
||||
broker_keepalive_interval: BuilderValue<Duration>,
|
||||
|
||||
log_format: BuilderValue<LogFormat>,
|
||||
|
||||
concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
|
||||
|
||||
metric_collection_interval: BuilderValue<Duration>,
|
||||
metric_collection_endpoint: BuilderValue<Option<Url>>,
|
||||
|
||||
test_remote_failures: BuilderValue<u64>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -245,11 +263,23 @@ impl Default for PageServerConfigBuilder {
|
||||
remote_storage_config: Set(None),
|
||||
id: NotSet,
|
||||
profiling: Set(ProfilingConfig::Disabled),
|
||||
broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()),
|
||||
broker_endpoints: Set(Vec::new()),
|
||||
broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
|
||||
.parse()
|
||||
.expect("failed to parse default broker endpoint")),
|
||||
broker_keepalive_interval: Set(humantime::parse_duration(
|
||||
storage_broker::DEFAULT_KEEPALIVE_INTERVAL,
|
||||
)
|
||||
.expect("cannot parse default keepalive interval")),
|
||||
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
|
||||
|
||||
concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
|
||||
metric_collection_interval: Set(humantime::parse_duration(
|
||||
DEFAULT_METRIC_COLLECTION_INTERVAL,
|
||||
)
|
||||
.expect("cannot parse default metric collection interval")),
|
||||
metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
|
||||
|
||||
test_remote_failures: Set(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -306,12 +336,12 @@ impl PageServerConfigBuilder {
|
||||
self.remote_storage_config = BuilderValue::Set(remote_storage_config)
|
||||
}
|
||||
|
||||
pub fn broker_endpoints(&mut self, broker_endpoints: Vec<Url>) {
|
||||
self.broker_endpoints = BuilderValue::Set(broker_endpoints)
|
||||
pub fn broker_endpoint(&mut self, broker_endpoint: Uri) {
|
||||
self.broker_endpoint = BuilderValue::Set(broker_endpoint)
|
||||
}
|
||||
|
||||
pub fn broker_etcd_prefix(&mut self, broker_etcd_prefix: String) {
|
||||
self.broker_etcd_prefix = BuilderValue::Set(broker_etcd_prefix)
|
||||
pub fn broker_keepalive_interval(&mut self, broker_keepalive_interval: Duration) {
|
||||
self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval)
|
||||
}
|
||||
|
||||
pub fn id(&mut self, node_id: NodeId) {
|
||||
@@ -330,11 +360,19 @@ impl PageServerConfigBuilder {
|
||||
self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let broker_endpoints = self
|
||||
.broker_endpoints
|
||||
.ok_or(anyhow!("No broker endpoints provided"))?;
|
||||
pub fn metric_collection_interval(&mut self, metric_collection_interval: Duration) {
|
||||
self.metric_collection_interval = BuilderValue::Set(metric_collection_interval)
|
||||
}
|
||||
|
||||
pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option<Url>) {
|
||||
self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
|
||||
}
|
||||
|
||||
pub fn test_remote_failures(&mut self, fail_first: u64) {
|
||||
self.test_remote_failures = BuilderValue::Set(fail_first);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
Ok(PageServerConf {
|
||||
listen_pg_addr: self
|
||||
.listen_pg_addr
|
||||
@@ -370,16 +408,27 @@ impl PageServerConfigBuilder {
|
||||
profiling: self.profiling.ok_or(anyhow!("missing profiling"))?,
|
||||
// TenantConf is handled separately
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoints,
|
||||
broker_etcd_prefix: self
|
||||
.broker_etcd_prefix
|
||||
.ok_or(anyhow!("missing broker_etcd_prefix"))?,
|
||||
broker_endpoint: self
|
||||
.broker_endpoint
|
||||
.ok_or(anyhow!("No broker endpoints provided"))?,
|
||||
broker_keepalive_interval: self
|
||||
.broker_keepalive_interval
|
||||
.ok_or(anyhow!("No broker keepalive interval provided"))?,
|
||||
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
|
||||
concurrent_tenant_size_logical_size_queries: self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.ok_or(anyhow!(
|
||||
"missing concurrent_tenant_size_logical_size_queries"
|
||||
))?,
|
||||
metric_collection_interval: self
|
||||
.metric_collection_interval
|
||||
.ok_or(anyhow!("missing metric_collection_interval"))?,
|
||||
metric_collection_endpoint: self
|
||||
.metric_collection_endpoint
|
||||
.ok_or(anyhow!("missing metric_collection_endpoint"))?,
|
||||
test_remote_failures: self
|
||||
.test_remote_failures
|
||||
.ok_or(anyhow!("missing test_remote_failuers"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -402,6 +451,10 @@ impl PageServerConf {
|
||||
.join(TENANT_ATTACHING_MARKER_FILENAME)
|
||||
}
|
||||
|
||||
pub fn tenant_ignore_mark_file_path(&self, tenant_id: TenantId) -> PathBuf {
|
||||
self.tenant_path(&tenant_id).join(IGNORED_TENANT_FILE_NAME)
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain tenant's tenantconf file should be located.
|
||||
pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf {
|
||||
@@ -450,6 +503,28 @@ impl PageServerConf {
|
||||
.join(METADATA_FILE_NAME)
|
||||
}
|
||||
|
||||
/// Files on the remote storage are stored with paths, relative to the workdir.
|
||||
/// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path.
|
||||
///
|
||||
/// Errors if the path provided does not start from pageserver's workdir.
|
||||
pub fn remote_path(&self, local_path: &Path) -> anyhow::Result<RemotePath> {
|
||||
local_path
|
||||
.strip_prefix(&self.workdir)
|
||||
.context("Failed to strip workdir prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to resolve remote part of path {:?} for base {:?}",
|
||||
local_path, self.workdir
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/// Turns storage remote path of a file into its local path.
|
||||
pub fn local_path(&self, remote_path: &RemotePath) -> PathBuf {
|
||||
remote_path.with_base(&self.workdir)
|
||||
}
|
||||
|
||||
//
|
||||
// Postgres distribution paths
|
||||
//
|
||||
@@ -486,7 +561,7 @@ impl PageServerConf {
|
||||
let mut builder = PageServerConfigBuilder::default();
|
||||
builder.workdir(workdir.to_owned());
|
||||
|
||||
let mut t_conf: TenantConfOpt = Default::default();
|
||||
let mut t_conf = TenantConfOpt::default();
|
||||
|
||||
for (key, item) in toml.iter() {
|
||||
match key {
|
||||
@@ -507,24 +582,15 @@ impl PageServerConf {
|
||||
)),
|
||||
"auth_type" => builder.auth_type(parse_toml_from_str(key, item)?),
|
||||
"remote_storage" => {
|
||||
builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item)?))
|
||||
builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
|
||||
}
|
||||
"tenant_config" => {
|
||||
t_conf = Self::parse_toml_tenant_conf(item)?;
|
||||
}
|
||||
"id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
|
||||
"profiling" => builder.profiling(parse_toml_from_str(key, item)?),
|
||||
"broker_etcd_prefix" => builder.broker_etcd_prefix(parse_toml_string(key, item)?),
|
||||
"broker_endpoints" => builder.broker_endpoints(
|
||||
parse_toml_array(key, item)?
|
||||
.into_iter()
|
||||
.map(|endpoint_str| {
|
||||
endpoint_str.parse::<Url>().with_context(|| {
|
||||
format!("Array item {endpoint_str} for key {key} is not a valid url endpoint")
|
||||
})
|
||||
})
|
||||
.collect::<anyhow::Result<_>>()?,
|
||||
),
|
||||
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
|
||||
"broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
|
||||
"log_format" => builder.log_format(
|
||||
LogFormat::from_config(&parse_toml_string(key, item)?)?
|
||||
),
|
||||
@@ -534,6 +600,13 @@ impl PageServerConf {
|
||||
let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
|
||||
ConfigurableSemaphore::new(permits)
|
||||
}),
|
||||
"metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
|
||||
"metric_collection_endpoint" => {
|
||||
let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
|
||||
builder.metric_collection_endpoint(Some(endpoint));
|
||||
},
|
||||
|
||||
"test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -617,6 +690,12 @@ impl PageServerConf {
|
||||
if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") {
|
||||
t_conf.max_lsn_wal_lag = Some(parse_toml_from_str("max_lsn_wal_lag", max_lsn_wal_lag)?);
|
||||
}
|
||||
if let Some(trace_read_requests) = item.get("trace_read_requests") {
|
||||
t_conf.trace_read_requests =
|
||||
Some(trace_read_requests.as_bool().with_context(|| {
|
||||
"configure option trace_read_requests is not a bool".to_string()
|
||||
})?);
|
||||
}
|
||||
|
||||
Ok(t_conf)
|
||||
}
|
||||
@@ -644,11 +723,14 @@ impl PageServerConf {
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
profiling: ProfilingConfig::Disabled,
|
||||
default_tenant_conf: TenantConf::dummy_conf(),
|
||||
broker_endpoints: Vec::new(),
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
|
||||
broker_keepalive_interval: Duration::from_secs(5000),
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
metric_collection_interval: Duration::from_secs(60),
|
||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||
test_remote_failures: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -698,22 +780,6 @@ where
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result<Vec<String>> {
|
||||
let array = item
|
||||
.as_array()
|
||||
.with_context(|| format!("configure option {name} is not an array"))?;
|
||||
|
||||
array
|
||||
.iter()
|
||||
.map(|value| {
|
||||
value
|
||||
.as_str()
|
||||
.map(str::to_string)
|
||||
.with_context(|| format!("Array item {value:?} for key {name} is not a string"))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Configurable semaphore permits setting.
|
||||
///
|
||||
/// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty
|
||||
@@ -795,6 +861,8 @@ max_file_descriptors = 333
|
||||
initial_superuser_name = 'zzzz'
|
||||
id = 10
|
||||
|
||||
metric_collection_interval = '222 s'
|
||||
metric_collection_endpoint = 'http://localhost:80/metrics'
|
||||
log_format = 'json'
|
||||
|
||||
"#;
|
||||
@@ -803,10 +871,10 @@ log_format = 'json'
|
||||
fn parse_defaults() -> anyhow::Result<()> {
|
||||
let tempdir = tempdir()?;
|
||||
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
|
||||
let broker_endpoint = "http://127.0.0.1:7777";
|
||||
let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
|
||||
// we have to create dummy values to overcome the validation errors
|
||||
let config_string = format!(
|
||||
"pg_distrib_dir='{}'\nid=10\nbroker_endpoints = ['{broker_endpoint}']",
|
||||
"pg_distrib_dir='{}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
|
||||
pg_distrib_dir.display()
|
||||
);
|
||||
let toml = config_string.parse()?;
|
||||
@@ -832,12 +900,17 @@ log_format = 'json'
|
||||
remote_storage_config: None,
|
||||
profiling: ProfilingConfig::Disabled,
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoints: vec![broker_endpoint
|
||||
.parse()
|
||||
.expect("Failed to parse a valid broker endpoint URL")],
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
|
||||
broker_keepalive_interval: humantime::parse_duration(
|
||||
storage_broker::DEFAULT_KEEPALIVE_INTERVAL
|
||||
)?,
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
metric_collection_interval: humantime::parse_duration(
|
||||
defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
|
||||
)?,
|
||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||
test_remote_failures: 0,
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -849,10 +922,10 @@ log_format = 'json'
|
||||
fn parse_basic_config() -> anyhow::Result<()> {
|
||||
let tempdir = tempdir()?;
|
||||
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
|
||||
let broker_endpoint = "http://127.0.0.1:7777";
|
||||
let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
|
||||
|
||||
let config_string = format!(
|
||||
"{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoints = ['{broker_endpoint}']",
|
||||
"{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoint = '{broker_endpoint}'",
|
||||
pg_distrib_dir.display()
|
||||
);
|
||||
let toml = config_string.parse()?;
|
||||
@@ -878,12 +951,13 @@ log_format = 'json'
|
||||
remote_storage_config: None,
|
||||
profiling: ProfilingConfig::Disabled,
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoints: vec![broker_endpoint
|
||||
.parse()
|
||||
.expect("Failed to parse a valid broker endpoint URL")],
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(),
|
||||
broker_keepalive_interval: Duration::from_secs(5),
|
||||
log_format: LogFormat::Json,
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
metric_collection_interval: Duration::from_secs(222),
|
||||
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
|
||||
test_remote_failures: 0,
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
@@ -915,7 +989,7 @@ local_path = '{}'"#,
|
||||
let config_string = format!(
|
||||
r#"{ALL_BASE_VALUES_TOML}
|
||||
pg_distrib_dir='{}'
|
||||
broker_endpoints = ['{broker_endpoint}']
|
||||
broker_endpoint = '{broker_endpoint}'
|
||||
|
||||
{remote_storage_config_str}"#,
|
||||
pg_distrib_dir.display(),
|
||||
@@ -982,7 +1056,7 @@ concurrency_limit = {s3_concurrency_limit}"#
|
||||
let config_string = format!(
|
||||
r#"{ALL_BASE_VALUES_TOML}
|
||||
pg_distrib_dir='{}'
|
||||
broker_endpoints = ['{broker_endpoint}']
|
||||
broker_endpoint = '{broker_endpoint}'
|
||||
|
||||
{remote_storage_config_str}"#,
|
||||
pg_distrib_dir.display(),
|
||||
@@ -1016,6 +1090,35 @@ broker_endpoints = ['{broker_endpoint}']
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_tenant_config() -> anyhow::Result<()> {
|
||||
let tempdir = tempdir()?;
|
||||
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
|
||||
|
||||
let broker_endpoint = "http://127.0.0.1:7777";
|
||||
let trace_read_requests = true;
|
||||
|
||||
let config_string = format!(
|
||||
r#"{ALL_BASE_VALUES_TOML}
|
||||
pg_distrib_dir='{}'
|
||||
broker_endpoint = '{broker_endpoint}'
|
||||
|
||||
[tenant_config]
|
||||
trace_read_requests = {trace_read_requests}"#,
|
||||
pg_distrib_dir.display(),
|
||||
);
|
||||
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
|
||||
assert_eq!(
|
||||
conf.default_tenant_conf.trace_read_requests, trace_read_requests,
|
||||
"Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants",
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn prepare_fs(tempdir: &TempDir) -> anyhow::Result<(PathBuf, PathBuf)> {
|
||||
let tempdir_path = tempdir.path();
|
||||
|
||||
|
||||
324
pageserver/src/consumption_metrics.rs
Normal file
324
pageserver/src/consumption_metrics.rs
Normal file
@@ -0,0 +1,324 @@
|
||||
//!
|
||||
//! Periodically collect consumption metrics for all active tenants
|
||||
//! and push them to a HTTP endpoint.
|
||||
//! Cache metrics to send only the updated ones.
|
||||
//!
|
||||
|
||||
use anyhow;
|
||||
use tracing::*;
|
||||
use utils::id::NodeId;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use crate::task_mgr;
|
||||
use crate::tenant::mgr;
|
||||
use pageserver_api::models::TenantState;
|
||||
use utils::id::TenantId;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use rand::Rng;
|
||||
use reqwest::Url;
|
||||
|
||||
/// ConsumptionMetric struct that defines the format for one metric entry
|
||||
/// i.e.
|
||||
///
|
||||
/// ```json
|
||||
/// {
|
||||
/// "metric": "remote_storage_size",
|
||||
/// "type": "absolute",
|
||||
/// "tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
|
||||
/// "timeline_id": "a03ebb4f5922a1c56ff7485cc8854143",
|
||||
/// "time": "2022-12-28T11:07:19.317310284Z",
|
||||
/// "idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
|
||||
/// "value": 12345454,
|
||||
/// }
|
||||
/// ```
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
|
||||
pub struct ConsumptionMetric {
|
||||
pub metric: ConsumptionMetricKind,
|
||||
#[serde(rename = "type")]
|
||||
pub metric_type: &'static str,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
pub time: DateTime<Utc>,
|
||||
pub idempotency_key: String,
|
||||
pub value: u64,
|
||||
}
|
||||
|
||||
impl ConsumptionMetric {
|
||||
pub fn new_absolute<R: Rng + ?Sized>(
|
||||
metric: ConsumptionMetricKind,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
value: u64,
|
||||
node_id: NodeId,
|
||||
rng: &mut R,
|
||||
) -> Self {
|
||||
Self {
|
||||
metric,
|
||||
metric_type: "absolute",
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
time: Utc::now(),
|
||||
// key that allows metric collector to distinguish unique events
|
||||
idempotency_key: format!("{}-{}-{:04}", Utc::now(), node_id, rng.gen_range(0..=9999)),
|
||||
value,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ConsumptionMetricKind {
|
||||
/// Amount of WAL produced , by a timeline, i.e. last_record_lsn
|
||||
/// This is an absolute, per-timeline metric.
|
||||
WrittenSize,
|
||||
/// Size of all tenant branches including WAL
|
||||
/// This is an absolute, per-tenant metric.
|
||||
/// This is the same metric that tenant/tenant_id/size endpoint returns.
|
||||
SyntheticStorageSize,
|
||||
/// Size of all the layer files in the tenant's directory on disk on the pageserver.
|
||||
/// This is an absolute, per-tenant metric.
|
||||
/// See also prometheus metric RESIDENT_PHYSICAL_SIZE.
|
||||
ResidentSize,
|
||||
/// Size of the remote storage (S3) directory.
|
||||
/// This is an absolute, per-tenant metric.
|
||||
RemoteStorageSize,
|
||||
/// Logical size of the data in the timeline
|
||||
/// This is an absolute, per-timeline metric
|
||||
TimelineLogicalSize,
|
||||
}
|
||||
|
||||
impl FromStr for ConsumptionMetricKind {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"written_size" => Ok(Self::WrittenSize),
|
||||
"synthetic_storage_size" => Ok(Self::SyntheticStorageSize),
|
||||
"resident_size" => Ok(Self::ResidentSize),
|
||||
"remote_storage_size" => Ok(Self::RemoteStorageSize),
|
||||
"timeline_logical_size" => Ok(Self::TimelineLogicalSize),
|
||||
_ => anyhow::bail!("invalid value \"{s}\" for metric type"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for ConsumptionMetricKind {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.write_str(match self {
|
||||
ConsumptionMetricKind::WrittenSize => "written_size",
|
||||
ConsumptionMetricKind::SyntheticStorageSize => "synthetic_storage_size",
|
||||
ConsumptionMetricKind::ResidentSize => "resident_size",
|
||||
ConsumptionMetricKind::RemoteStorageSize => "remote_storage_size",
|
||||
ConsumptionMetricKind::TimelineLogicalSize => "timeline_logical_size",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct ConsumptionMetricsKey {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
metric: ConsumptionMetricKind,
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
struct EventChunk<'a> {
|
||||
events: &'a [ConsumptionMetric],
|
||||
}
|
||||
|
||||
/// Main thread that serves metrics collection
|
||||
pub async fn collect_metrics(
|
||||
metric_collection_endpoint: &Url,
|
||||
metric_collection_interval: Duration,
|
||||
node_id: NodeId,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut ticker = tokio::time::interval(metric_collection_interval);
|
||||
|
||||
info!("starting collect_metrics");
|
||||
|
||||
// define client here to reuse it for all requests
|
||||
let client = reqwest::Client::new();
|
||||
let mut cached_metrics: HashMap<ConsumptionMetricsKey, u64> = HashMap::new();
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
info!("collect_metrics received cancellation request");
|
||||
return Ok(());
|
||||
},
|
||||
_ = ticker.tick() => {
|
||||
collect_metrics_task(&client, &mut cached_metrics, metric_collection_endpoint, node_id).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// One iteration of metrics collection
|
||||
///
|
||||
/// Gather per-tenant and per-timeline metrics and send them to the `metric_collection_endpoint`.
|
||||
/// Cache metrics to avoid sending the same metrics multiple times.
|
||||
pub async fn collect_metrics_task(
|
||||
client: &reqwest::Client,
|
||||
cached_metrics: &mut HashMap<ConsumptionMetricsKey, u64>,
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
node_id: NodeId,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut current_metrics: Vec<(ConsumptionMetricsKey, u64)> = Vec::new();
|
||||
trace!(
|
||||
"starting collect_metrics_task. metric_collection_endpoint: {}",
|
||||
metric_collection_endpoint
|
||||
);
|
||||
|
||||
// get list of tenants
|
||||
let tenants = mgr::list_tenants().await;
|
||||
|
||||
// iterate through list of Active tenants and collect metrics
|
||||
for (tenant_id, tenant_state) in tenants {
|
||||
if tenant_state != TenantState::Active {
|
||||
continue;
|
||||
}
|
||||
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
|
||||
let mut tenant_resident_size = 0;
|
||||
|
||||
// iterate through list of timelines in tenant
|
||||
for timeline in tenant.list_timelines().iter() {
|
||||
// collect per-timeline metrics only for active timelines
|
||||
if timeline.is_active() {
|
||||
let timeline_written_size = u64::from(timeline.get_last_record_lsn());
|
||||
|
||||
current_metrics.push((
|
||||
ConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline.timeline_id),
|
||||
metric: ConsumptionMetricKind::WrittenSize,
|
||||
},
|
||||
timeline_written_size,
|
||||
));
|
||||
|
||||
let (timeline_logical_size, is_exact) = timeline.get_current_logical_size()?;
|
||||
// Only send timeline logical size when it is fully calculated.
|
||||
if is_exact {
|
||||
current_metrics.push((
|
||||
ConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline.timeline_id),
|
||||
metric: ConsumptionMetricKind::TimelineLogicalSize,
|
||||
},
|
||||
timeline_logical_size,
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
let timeline_resident_size = timeline.get_resident_physical_size();
|
||||
tenant_resident_size += timeline_resident_size;
|
||||
}
|
||||
|
||||
let tenant_remote_size = tenant.get_remote_size().await?;
|
||||
debug!(
|
||||
"collected current metrics for tenant: {}: state={:?} resident_size={} remote_size={}",
|
||||
tenant_id, tenant_state, tenant_resident_size, tenant_remote_size
|
||||
);
|
||||
|
||||
current_metrics.push((
|
||||
ConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: ConsumptionMetricKind::ResidentSize,
|
||||
},
|
||||
tenant_resident_size,
|
||||
));
|
||||
|
||||
current_metrics.push((
|
||||
ConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: ConsumptionMetricKind::RemoteStorageSize,
|
||||
},
|
||||
tenant_remote_size,
|
||||
));
|
||||
|
||||
// TODO add SyntheticStorageSize metric
|
||||
}
|
||||
|
||||
// Filter metrics
|
||||
current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
|
||||
Some(val) => val != curr_val,
|
||||
None => true,
|
||||
});
|
||||
|
||||
if current_metrics.is_empty() {
|
||||
trace!("no new metrics to send");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Send metrics.
|
||||
// Split into chunks of 1000 metrics to avoid exceeding the max request size
|
||||
const CHUNK_SIZE: usize = 1000;
|
||||
let chunks = current_metrics.chunks(CHUNK_SIZE);
|
||||
|
||||
let mut chunk_to_send: Vec<ConsumptionMetric> = Vec::with_capacity(1000);
|
||||
|
||||
for chunk in chunks {
|
||||
chunk_to_send.clear();
|
||||
|
||||
// this code block is needed to convince compiler
|
||||
// that rng is not reused aroung await point
|
||||
{
|
||||
// enrich metrics with timestamp and metric_kind before sending
|
||||
let mut rng = rand::thread_rng();
|
||||
chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| {
|
||||
ConsumptionMetric::new_absolute(
|
||||
curr_key.metric,
|
||||
curr_key.tenant_id,
|
||||
curr_key.timeline_id,
|
||||
*curr_val,
|
||||
node_id,
|
||||
&mut rng,
|
||||
)
|
||||
}));
|
||||
}
|
||||
|
||||
let chunk_json = serde_json::value::to_raw_value(&EventChunk {
|
||||
events: &chunk_to_send,
|
||||
})
|
||||
.expect("ConsumptionMetric should not fail serialization");
|
||||
|
||||
let res = client
|
||||
.post(metric_collection_endpoint.clone())
|
||||
.json(&chunk_json)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Ok(res) => {
|
||||
if res.status().is_success() {
|
||||
// update cached metrics after they were sent successfully
|
||||
for (curr_key, curr_val) in chunk.iter() {
|
||||
cached_metrics.insert(curr_key.clone(), *curr_val);
|
||||
}
|
||||
} else {
|
||||
error!("metrics endpoint refused the sent metrics: {:?}", res);
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
error!("failed to send metrics: {:?}", err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -77,16 +77,6 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
- name: include-non-incremental-logical-size
|
||||
in: query
|
||||
schema:
|
||||
type: string
|
||||
description: Controls calculation of current_logical_size_non_incremental
|
||||
- name: include-non-incremental-physical-size
|
||||
in: query
|
||||
schema:
|
||||
type: string
|
||||
description: Controls calculation of current_physical_size_non_incremental
|
||||
get:
|
||||
description: Get timelines for tenant
|
||||
responses:
|
||||
@@ -139,17 +129,6 @@ paths:
|
||||
format: hex
|
||||
get:
|
||||
description: Get info about the timeline
|
||||
parameters:
|
||||
- name: include-non-incremental-logical-size
|
||||
in: query
|
||||
schema:
|
||||
type: string
|
||||
description: Controls calculation of current_logical_size_non_incremental
|
||||
- name: include-non-incremental-physical-size
|
||||
in: query
|
||||
schema:
|
||||
type: string
|
||||
description: Controls calculation of current_physical_size_non_incremental
|
||||
responses:
|
||||
"200":
|
||||
description: TimelineInfo
|
||||
@@ -274,6 +253,7 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
|
||||
post:
|
||||
description: Schedules attach operation to happen in the background for given tenant
|
||||
responses:
|
||||
@@ -325,7 +305,9 @@ paths:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: Detach local tenant
|
||||
description: |
|
||||
Remove tenant data (including all corresponding timelines) from pageserver's memory and file system.
|
||||
Files on the remote storage are not affected.
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant detached
|
||||
@@ -354,6 +336,92 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/ignore:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: |
|
||||
Remove tenant data (including all corresponding timelines) from pageserver's memory.
|
||||
Files on local disk and remote storage are not affected.
|
||||
|
||||
Future pageserver restarts won't load the data back until `load` is called on such tenant.
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant ignored
|
||||
"400":
|
||||
description: Error when no tenant id found in path parameters
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/load:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: |
|
||||
Schedules an operation that attempts to load a tenant from the local disk and
|
||||
synchronise it with the remote storage (if enabled), repeating pageserver's restart logic for tenant load.
|
||||
If the tenant was ignored before, removes the ignore mark and continues with load scheduling.
|
||||
|
||||
Errors if the tenant is absent on disk, already present in memory or fails to schedule its load.
|
||||
Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness.
|
||||
responses:
|
||||
"202":
|
||||
description: Tenant scheduled to load successfully
|
||||
"400":
|
||||
description: Error when no tenant id found in path parameters
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/size:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -659,7 +727,6 @@ components:
|
||||
- tenant_id
|
||||
- last_record_lsn
|
||||
- disk_consistent_lsn
|
||||
- awaits_download
|
||||
- state
|
||||
- latest_gc_cutoff_lsn
|
||||
properties:
|
||||
@@ -691,10 +758,6 @@ components:
|
||||
type: integer
|
||||
current_physical_size:
|
||||
type: integer
|
||||
current_logical_size_non_incremental:
|
||||
type: integer
|
||||
current_physical_size_non_incremental:
|
||||
type: integer
|
||||
wal_source_connstr:
|
||||
type: string
|
||||
last_received_msg_lsn:
|
||||
@@ -702,44 +765,11 @@ components:
|
||||
format: hex
|
||||
last_received_msg_ts:
|
||||
type: integer
|
||||
awaits_download:
|
||||
type: boolean
|
||||
state:
|
||||
type: string
|
||||
latest_gc_cutoff_lsn:
|
||||
type: string
|
||||
format: hex
|
||||
|
||||
# These 'local' and 'remote' fields just duplicate some of the fields
|
||||
# above. They are kept for backwards-compatibility. They can be removed,
|
||||
# when the control plane has been updated to look at the above fields
|
||||
# directly.
|
||||
local:
|
||||
$ref: "#/components/schemas/LocalTimelineInfo"
|
||||
remote:
|
||||
$ref: "#/components/schemas/RemoteTimelineInfo"
|
||||
|
||||
LocalTimelineInfo:
|
||||
type: object
|
||||
properties:
|
||||
ancestor_timeline_id:
|
||||
type: string
|
||||
format: hex
|
||||
ancestor_lsn:
|
||||
type: string
|
||||
format: hex
|
||||
current_logical_size:
|
||||
type: integer
|
||||
current_physical_size:
|
||||
type: integer
|
||||
RemoteTimelineInfo:
|
||||
type: object
|
||||
required:
|
||||
- remote_consistent_lsn
|
||||
properties:
|
||||
remote_consistent_lsn:
|
||||
type: string
|
||||
format: hex
|
||||
Error:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -3,19 +3,18 @@ use std::sync::Arc;
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use pageserver_api::models::TenantState;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::task::JoinError;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
use super::models::{
|
||||
LocalTimelineInfo, RemoteTimelineInfo, StatusResponse, TenantConfigRequest,
|
||||
TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo,
|
||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
||||
TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::{config::PageServerConf, tenant_mgr};
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{with_ondemand_download, Timeline};
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
http::{
|
||||
@@ -32,8 +31,6 @@ use utils::{
|
||||
// Imports only used for testing APIs
|
||||
#[cfg(feature = "testing")]
|
||||
use super::models::{ConfigureFailpointsRequest, TimelineGcRequest};
|
||||
#[cfg(feature = "testing")]
|
||||
use crate::CheckpointConfig;
|
||||
|
||||
struct State {
|
||||
conf: &'static PageServerConf,
|
||||
@@ -81,28 +78,28 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
|
||||
}
|
||||
|
||||
// Helper function to construct a TimelineInfo struct for a timeline
|
||||
fn build_timeline_info(
|
||||
tenant_state: TenantState,
|
||||
async fn build_timeline_info(
|
||||
timeline: &Arc<Timeline>,
|
||||
include_non_incremental_logical_size: bool,
|
||||
include_non_incremental_physical_size: bool,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
let mut info = build_timeline_info_common(tenant_state, timeline)?;
|
||||
let mut info = build_timeline_info_common(timeline)?;
|
||||
if include_non_incremental_logical_size {
|
||||
info.current_logical_size_non_incremental =
|
||||
Some(timeline.get_current_logical_size_non_incremental(info.last_record_lsn)?);
|
||||
}
|
||||
if include_non_incremental_physical_size {
|
||||
info.current_physical_size_non_incremental =
|
||||
Some(timeline.get_physical_size_non_incremental()?)
|
||||
// XXX we should be using spawn_ondemand_logical_size_calculation here.
|
||||
// Otherwise, if someone deletes the timeline / detaches the tenant while
|
||||
// we're executing this function, we will outlive the timeline on-disk state.
|
||||
info.current_logical_size_non_incremental = Some(
|
||||
timeline
|
||||
.get_current_logical_size_non_incremental(
|
||||
info.last_record_lsn,
|
||||
CancellationToken::new(),
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
}
|
||||
Ok(info)
|
||||
}
|
||||
|
||||
fn build_timeline_info_common(
|
||||
tenant_state: TenantState,
|
||||
timeline: &Arc<Timeline>,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<TimelineInfo> {
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
|
||||
let guard = timeline.last_received_wal.lock().unwrap();
|
||||
@@ -123,13 +120,13 @@ fn build_timeline_info_common(
|
||||
lsn @ Lsn(_) => Some(lsn),
|
||||
};
|
||||
let current_logical_size = match timeline.get_current_logical_size() {
|
||||
Ok(size) => Some(size),
|
||||
Ok((size, _)) => Some(size),
|
||||
Err(err) => {
|
||||
error!("Timeline info creation failed to get current logical size: {err:?}");
|
||||
None
|
||||
}
|
||||
};
|
||||
let current_physical_size = Some(timeline.get_physical_size());
|
||||
let current_physical_size = Some(timeline.layer_size_sum().approximate_is_ok());
|
||||
let state = timeline.current_state();
|
||||
let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
|
||||
|
||||
@@ -146,29 +143,13 @@ fn build_timeline_info_common(
|
||||
current_logical_size,
|
||||
current_physical_size,
|
||||
current_logical_size_non_incremental: None,
|
||||
current_physical_size_non_incremental: None,
|
||||
timeline_dir_layer_file_size_sum: None,
|
||||
wal_source_connstr,
|
||||
last_received_msg_lsn,
|
||||
last_received_msg_ts,
|
||||
pg_version: timeline.pg_version,
|
||||
|
||||
state,
|
||||
|
||||
// XXX bring back tracking of downloads per timeline, or, introduce
|
||||
// an 'Attaching' state for the timeline and get rid of this field.
|
||||
awaits_download: tenant_state == TenantState::Attaching,
|
||||
|
||||
// Duplicate some fields in 'local' and 'remote' fields, for backwards-compatility
|
||||
// with the control plane.
|
||||
local: LocalTimelineInfo {
|
||||
ancestor_timeline_id,
|
||||
ancestor_lsn,
|
||||
current_logical_size,
|
||||
current_physical_size,
|
||||
},
|
||||
remote: RemoteTimelineInfo {
|
||||
remote_consistent_lsn: Some(remote_consistent_lsn),
|
||||
},
|
||||
};
|
||||
Ok(info)
|
||||
}
|
||||
@@ -189,7 +170,9 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
.new_timeline_id
|
||||
.unwrap_or_else(TimelineId::generate);
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
match tenant.create_timeline(
|
||||
new_timeline_id,
|
||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||
@@ -200,7 +183,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
.await {
|
||||
Ok(Some(new_timeline)) => {
|
||||
// Created. Construct a TimelineInfo for it.
|
||||
let timeline_info = build_timeline_info_common(tenant.current_state(), &new_timeline)
|
||||
let timeline_info = build_timeline_info_common(&new_timeline)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::CREATED, timeline_info)
|
||||
}
|
||||
@@ -213,30 +196,30 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let include_non_incremental_logical_size =
|
||||
query_param_present(&request, "include-non-incremental-logical-size");
|
||||
let include_non_incremental_physical_size =
|
||||
query_param_present(&request, "include-non-incremental-physical-size");
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let _entered = info_span!("timeline_list", tenant = %tenant_id).entered();
|
||||
let response_data = async {
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let timelines = tenant.list_timelines();
|
||||
|
||||
let (tenant_state, timelines) = {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
(tenant.current_state(), tenant.list_timelines())
|
||||
};
|
||||
let mut response_data = Vec::with_capacity(timelines.len());
|
||||
for timeline in timelines {
|
||||
let timeline_info =
|
||||
build_timeline_info(&timeline, include_non_incremental_logical_size)
|
||||
.await
|
||||
.context(
|
||||
"Failed to convert tenant timeline {timeline_id} into the local one: {e:?}",
|
||||
)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let mut response_data = Vec::with_capacity(timelines.len());
|
||||
for timeline in timelines {
|
||||
let timeline_info = build_timeline_info(
|
||||
tenant_state,
|
||||
&timeline,
|
||||
include_non_incremental_logical_size,
|
||||
include_non_incremental_physical_size,
|
||||
)
|
||||
.context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
response_data.push(timeline_info);
|
||||
response_data.push(timeline_info);
|
||||
}
|
||||
Ok(response_data)
|
||||
}
|
||||
.instrument(info_span!("timeline_list", tenant = %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, response_data)
|
||||
}
|
||||
@@ -276,31 +259,21 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let include_non_incremental_logical_size =
|
||||
query_param_present(&request, "include-non-incremental-logical-size");
|
||||
let include_non_incremental_physical_size =
|
||||
query_param_present(&request, "include-non-incremental-physical-size");
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let timeline_info = async {
|
||||
let (tenant_state, timeline) = tokio::task::spawn_blocking(move || {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
Ok((
|
||||
tenant.current_state(),
|
||||
tenant.get_timeline(timeline_id, false),
|
||||
))
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
|
||||
let timeline = timeline.map_err(ApiError::NotFound)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
|
||||
let timeline_info = build_timeline_info(
|
||||
tenant_state,
|
||||
&timeline,
|
||||
include_non_incremental_logical_size,
|
||||
include_non_incremental_physical_size,
|
||||
)
|
||||
.context("Failed to get local timeline info: {e:#}")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
let timeline_info = build_timeline_info(&timeline, include_non_incremental_logical_size)
|
||||
.await
|
||||
.context("Failed to get local timeline info: {e:#}")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
Ok::<_, ApiError>(timeline_info)
|
||||
}
|
||||
@@ -321,13 +294,15 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
|
||||
|
||||
let timeline = tenant_mgr::get_tenant(tenant_id, true)
|
||||
let timeline = mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let result = match timeline
|
||||
.find_lsn_for_timestamp(timestamp_pg)
|
||||
.map_err(ApiError::InternalServerError)?
|
||||
{
|
||||
let result = with_ondemand_download(|| timeline.find_lsn_for_timestamp(timestamp_pg))
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let result = match result {
|
||||
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
|
||||
LsnForTimestamp::Future(_lsn) => "future".into(),
|
||||
LsnForTimestamp::Past(_lsn) => "past".into(),
|
||||
@@ -347,13 +322,13 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
|
||||
if let Some(remote_storage) = &state.remote_storage {
|
||||
// FIXME: distinguish between "Tenant already exists" and other errors
|
||||
tenant_mgr::attach_tenant(state.conf, tenant_id, remote_storage)
|
||||
mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone())
|
||||
.instrument(info_span!("tenant_attach", tenant = %tenant_id))
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
} else {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"attach_tenant is possible because pageserver was configured without remote storage"
|
||||
"attach_tenant is not possible because pageserver was configured without remote storage"
|
||||
)));
|
||||
}
|
||||
|
||||
@@ -365,7 +340,7 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
tenant_mgr::delete_timeline(tenant_id, timeline_id)
|
||||
mgr::delete_timeline(tenant_id, timeline_id)
|
||||
.instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await
|
||||
// FIXME: Errors from `delete_timeline` can occur for a number of reasons, incuding both
|
||||
@@ -382,7 +357,7 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
tenant_mgr::detach_tenant(conf, tenant_id)
|
||||
mgr::detach_tenant(conf, tenant_id)
|
||||
.instrument(info_span!("tenant_detach", tenant = %tenant_id))
|
||||
.await
|
||||
// FIXME: Errors from `detach_tenant` can be caused by both both user and internal errors.
|
||||
@@ -392,23 +367,49 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone())
|
||||
.instrument(info_span!("load", tenant = %tenant_id))
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
mgr::ignore_tenant(conf, tenant_id)
|
||||
.instrument(info_span!("ignore_tenant", tenant = %tenant_id))
|
||||
.await
|
||||
// FIXME: Errors from `ignore_tenant` can be caused by both both user and internal errors.
|
||||
// Replace this with better handling once the error type permits it.
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_list").entered();
|
||||
tenant_mgr::list_tenants()
|
||||
.iter()
|
||||
.map(|(id, state)| TenantInfo {
|
||||
id: *id,
|
||||
state: *state,
|
||||
current_physical_size: None,
|
||||
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
|
||||
})
|
||||
.collect::<Vec<TenantInfo>>()
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
|
||||
let response_data = mgr::list_tenants()
|
||||
.instrument(info_span!("tenant_list"))
|
||||
.await
|
||||
.iter()
|
||||
.map(|(id, state)| TenantInfo {
|
||||
id: *id,
|
||||
state: *state,
|
||||
current_physical_size: None,
|
||||
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
|
||||
})
|
||||
.collect::<Vec<TenantInfo>>();
|
||||
|
||||
json_response(StatusCode::OK, response_data)
|
||||
}
|
||||
@@ -417,28 +418,25 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant_info = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_status_handler", tenant = %tenant_id).entered();
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false)?;
|
||||
let tenant_info = async {
|
||||
let tenant = mgr::get_tenant(tenant_id, false).await?;
|
||||
|
||||
// Calculate total physical size of all timelines
|
||||
let mut current_physical_size = 0;
|
||||
for timeline in tenant.list_timelines().iter() {
|
||||
current_physical_size += timeline.get_physical_size();
|
||||
current_physical_size += timeline.layer_size_sum().approximate_is_ok();
|
||||
}
|
||||
|
||||
let state = tenant.current_state();
|
||||
let tenant_info = TenantInfo {
|
||||
Ok(TenantInfo {
|
||||
id: tenant_id,
|
||||
state,
|
||||
current_physical_size: Some(current_physical_size),
|
||||
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
|
||||
};
|
||||
|
||||
Ok::<_, anyhow::Error>(tenant_info)
|
||||
})
|
||||
})
|
||||
}
|
||||
.instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, tenant_info)
|
||||
@@ -448,7 +446,9 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::InternalServerError)?;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
// this can be long operation, it currently is not backed by any request coalescing or similar
|
||||
let inputs = tenant
|
||||
@@ -565,22 +565,19 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
.map(TenantId::from)
|
||||
.unwrap_or_else(TenantId::generate);
|
||||
|
||||
let new_tenant = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_create", tenant = ?target_tenant_id).entered();
|
||||
let state = get_state(&request);
|
||||
let state = get_state(&request);
|
||||
|
||||
tenant_mgr::create_tenant(
|
||||
state.conf,
|
||||
tenant_conf,
|
||||
target_tenant_id,
|
||||
state.remote_storage.clone(),
|
||||
)
|
||||
// FIXME: `create_tenant` can fail from both user and internal errors. Replace this
|
||||
// with better error handling once the type permits it
|
||||
.map_err(ApiError::InternalServerError)
|
||||
})
|
||||
let new_tenant = mgr::create_tenant(
|
||||
state.conf,
|
||||
tenant_conf,
|
||||
target_tenant_id,
|
||||
state.remote_storage.clone(),
|
||||
)
|
||||
.instrument(info_span!("tenant_create", tenant = ?target_tenant_id))
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
// FIXME: `create_tenant` can fail from both user and internal errors. Replace this
|
||||
// with better error handling once the type permits it
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
Ok(match new_tenant {
|
||||
Some(tenant) => {
|
||||
@@ -671,17 +668,13 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
);
|
||||
}
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_config", tenant = ?tenant_id).entered();
|
||||
|
||||
let state = get_state(&request);
|
||||
tenant_mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
|
||||
// FIXME: `update_tenant_config` can fail because of both user and internal errors.
|
||||
// Replace this `map_err` with better error handling once the type permits it
|
||||
.map_err(ApiError::InternalServerError)
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
let state = get_state(&request);
|
||||
mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
|
||||
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
|
||||
.await
|
||||
// FIXME: `update_tenant_config` can fail because of both user and internal errors.
|
||||
// Replace this `map_err` with better error handling once the type permits it
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -728,7 +721,7 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
|
||||
|
||||
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
||||
|
||||
let wait_task_done = tenant_mgr::immediate_gc(tenant_id, timeline_id, gc_req)?;
|
||||
let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req).await?;
|
||||
let gc_result = wait_task_done
|
||||
.await
|
||||
.context("wait for gc task")
|
||||
@@ -745,7 +738,9 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
@@ -764,18 +759,63 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
timeline
|
||||
.checkpoint(CheckpointConfig::Forced)
|
||||
.freeze_and_flush()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
timeline
|
||||
.compact()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn timeline_download_remote_layers_handler_post(
|
||||
request: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
match timeline.spawn_download_all_remote_layers().await {
|
||||
Ok(st) => json_response(StatusCode::ACCEPTED, st),
|
||||
Err(st) => json_response(StatusCode::CONFLICT, st),
|
||||
}
|
||||
}
|
||||
|
||||
async fn timeline_download_remote_layers_handler_get(
|
||||
request: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let info = timeline
|
||||
.get_download_all_remote_layers_task_info()
|
||||
.context("task never started since last pageserver process start")
|
||||
.map_err(ApiError::NotFound)?;
|
||||
json_response(StatusCode::OK, info)
|
||||
}
|
||||
|
||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(
|
||||
StatusCode::NOT_FOUND,
|
||||
@@ -838,6 +878,8 @@ pub fn make_router(
|
||||
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
|
||||
.post("/v1/tenant/:tenant_id/attach", tenant_attach_handler)
|
||||
.post("/v1/tenant/:tenant_id/detach", tenant_detach_handler)
|
||||
.post("/v1/tenant/:tenant_id/load", tenant_load_handler)
|
||||
.post("/v1/tenant/:tenant_id/ignore", tenant_ignore_handler)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id",
|
||||
timeline_detail_handler,
|
||||
@@ -858,6 +900,14 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
|
||||
testing_api!("run timeline checkpoint", timeline_checkpoint_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|
||||
timeline_download_remote_layers_handler_post,
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|
||||
timeline_download_remote_layers_handler_get,
|
||||
)
|
||||
.delete(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id",
|
||||
timeline_delete_handler,
|
||||
|
||||
@@ -187,13 +187,13 @@ fn import_slru<Reader: Read>(
|
||||
path: &Path,
|
||||
mut reader: Reader,
|
||||
len: usize,
|
||||
) -> Result<()> {
|
||||
trace!("importing slru file {}", path.display());
|
||||
) -> anyhow::Result<()> {
|
||||
info!("importing slru file {path:?}");
|
||||
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
let filename = &path
|
||||
.file_name()
|
||||
.expect("missing slru filename")
|
||||
.with_context(|| format!("missing slru filename for path {path:?}"))?
|
||||
.to_string_lossy();
|
||||
let segno = u32::from_str_radix(filename, 16)?;
|
||||
|
||||
@@ -237,14 +237,19 @@ fn import_slru<Reader: Read>(
|
||||
|
||||
/// Scan PostgreSQL WAL files in given directory and load all records between
|
||||
/// 'startpoint' and 'endpoint' into the repository.
|
||||
fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> {
|
||||
fn import_wal(
|
||||
walpath: &Path,
|
||||
tline: &Timeline,
|
||||
startpoint: Lsn,
|
||||
endpoint: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version);
|
||||
|
||||
let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE);
|
||||
let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = startpoint;
|
||||
|
||||
let mut walingest = WalIngest::new(tline, startpoint)?;
|
||||
let mut walingest = WalIngest::new(tline, startpoint).no_ondemand_download()?;
|
||||
|
||||
while last_lsn <= endpoint {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
@@ -267,7 +272,7 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn)
|
||||
}
|
||||
|
||||
let nread = file.read_to_end(&mut buf)?;
|
||||
if nread != WAL_SEGMENT_SIZE - offset as usize {
|
||||
if nread != WAL_SEGMENT_SIZE - offset {
|
||||
// Maybe allow this for .partial files?
|
||||
error!("read only {} bytes from WAL file", nread);
|
||||
}
|
||||
@@ -279,7 +284,9 @@ fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn)
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
while last_lsn <= endpoint {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?;
|
||||
walingest
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded)
|
||||
.no_ondemand_download()?;
|
||||
last_lsn = lsn;
|
||||
|
||||
nrecords += 1;
|
||||
@@ -360,7 +367,7 @@ pub fn import_wal_from_tar<Reader: Read>(
|
||||
let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE);
|
||||
let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = start_lsn;
|
||||
let mut walingest = WalIngest::new(tline, start_lsn)?;
|
||||
let mut walingest = WalIngest::new(tline, start_lsn).no_ondemand_download()?;
|
||||
|
||||
// Ingest wal until end_lsn
|
||||
info!("importing wal until {}", end_lsn);
|
||||
@@ -405,7 +412,9 @@ pub fn import_wal_from_tar<Reader: Read>(
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
while last_lsn <= end_lsn {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
walingest.ingest_record(recdata, lsn, &mut modification, &mut decoded)?;
|
||||
walingest
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded)
|
||||
.no_ondemand_download()?;
|
||||
last_lsn = lsn;
|
||||
|
||||
debug!("imported record at {} (end {})", lsn, end_lsn);
|
||||
@@ -440,16 +449,22 @@ fn import_file<Reader: Read>(
|
||||
reader: Reader,
|
||||
len: usize,
|
||||
) -> Result<Option<ControlFileData>> {
|
||||
let file_name = match file_path.file_name() {
|
||||
Some(name) => name.to_string_lossy(),
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
if file_name.starts_with('.') {
|
||||
// tar archives on macOs, created without COPYFILE_DISABLE=1 env var
|
||||
// will contain "fork files", skip them.
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
if file_path.starts_with("global") {
|
||||
let spcnode = postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
|
||||
let dbnode = 0;
|
||||
|
||||
match file_path
|
||||
.file_name()
|
||||
.expect("missing filename")
|
||||
.to_string_lossy()
|
||||
.as_ref()
|
||||
{
|
||||
match file_name.as_ref() {
|
||||
"pg_control" => {
|
||||
let bytes = read_all_bytes(reader)?;
|
||||
|
||||
@@ -485,12 +500,7 @@ fn import_file<Reader: Read>(
|
||||
.to_string_lossy()
|
||||
.parse()?;
|
||||
|
||||
match file_path
|
||||
.file_name()
|
||||
.expect("missing base filename")
|
||||
.to_string_lossy()
|
||||
.as_ref()
|
||||
{
|
||||
match file_name.as_ref() {
|
||||
"pg_filenode.map" => {
|
||||
let bytes = read_all_bytes(reader)?;
|
||||
modification.put_relmap_file(spcnode, dbnode, bytes)?;
|
||||
@@ -520,11 +530,7 @@ fn import_file<Reader: Read>(
|
||||
import_slru(modification, slru, file_path, reader, len)?;
|
||||
debug!("imported multixact members slru");
|
||||
} else if file_path.starts_with("pg_twophase") {
|
||||
let file_name = &file_path
|
||||
.file_name()
|
||||
.expect("missing twophase filename")
|
||||
.to_string_lossy();
|
||||
let xid = u32::from_str_radix(file_name, 16)?;
|
||||
let xid = u32::from_str_radix(file_name.as_ref(), 16)?;
|
||||
|
||||
let bytes = read_all_bytes(reader)?;
|
||||
modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?;
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
mod auth;
|
||||
pub mod basebackup;
|
||||
pub mod config;
|
||||
pub mod consumption_metrics;
|
||||
pub mod http;
|
||||
pub mod import_datadir;
|
||||
pub mod keyspace;
|
||||
@@ -10,13 +11,8 @@ pub mod page_service;
|
||||
pub mod pgdatadir_mapping;
|
||||
pub mod profiling;
|
||||
pub mod repository;
|
||||
pub mod storage_sync2;
|
||||
pub use storage_sync2 as storage_sync;
|
||||
pub mod task_mgr;
|
||||
pub mod tenant;
|
||||
pub mod tenant_config;
|
||||
pub mod tenant_mgr;
|
||||
pub mod tenant_tasks;
|
||||
pub mod trace;
|
||||
pub mod virtual_file;
|
||||
pub mod walingest;
|
||||
@@ -26,9 +22,8 @@ pub mod walredo;
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use tracing::info;
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
use tracing::info;
|
||||
|
||||
/// Current storage format version
|
||||
///
|
||||
@@ -47,15 +42,6 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
|
||||
|
||||
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
/// Config for the Repository checkpointer
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum CheckpointConfig {
|
||||
// Flush all in-memory data
|
||||
Flush,
|
||||
// Flush all in-memory data and reconstruct all page images
|
||||
Forced,
|
||||
}
|
||||
|
||||
pub async fn shutdown_pageserver(exit_code: i32) {
|
||||
// Shut down the libpq endpoint task. This prevents new connections from
|
||||
// being accepted.
|
||||
@@ -66,7 +52,7 @@ pub async fn shutdown_pageserver(exit_code: i32) {
|
||||
|
||||
// Shut down all the tenants. This flushes everything to disk and kills
|
||||
// the checkpoint and GC tasks.
|
||||
tenant_mgr::shutdown_all_tenants().await;
|
||||
tenant::mgr::shutdown_all_tenants().await;
|
||||
|
||||
// Stop syncing with remote storage.
|
||||
//
|
||||
@@ -99,7 +85,7 @@ async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
|
||||
}
|
||||
}
|
||||
|
||||
fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
|
||||
pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
|
||||
if n == 0 {
|
||||
0.0
|
||||
} else {
|
||||
@@ -125,6 +111,13 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
|
||||
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
|
||||
|
||||
/// A marker file to prevent pageserver from loading a certain tenant on restart.
|
||||
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
|
||||
/// `ignore` management API command, that expects the ignored tenant to be properly loaded
|
||||
/// into pageserver's memory before being ignored.
|
||||
/// Full path: `tenants/<tenant_id>/___ignored_tenant`.
|
||||
pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant";
|
||||
|
||||
pub fn is_temporary(path: &Path) -> bool {
|
||||
match path.file_name() {
|
||||
Some(name) => name.to_string_lossy().ends_with(TEMP_FILE_SUFFIX),
|
||||
|
||||
@@ -84,13 +84,20 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Metrics for determining timeline's physical size.
|
||||
// A layered timeline's physical is defined as the total size of
|
||||
// (delta/image) layer files on disk.
|
||||
static CURRENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_current_physical_size",
|
||||
"Current physical size grouped by timeline",
|
||||
"pageserver_resident_physical_size",
|
||||
"The size of the layer files present in the pageserver's filesystem.",
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_remote_physical_size",
|
||||
"The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
|
||||
// Corollary: If any files are missing from the index part, they won't be included here.
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
@@ -136,8 +143,9 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
||||
1.0, // 1 sec
|
||||
];
|
||||
|
||||
const STORAGE_IO_TIME_OPERATIONS: &[&str] =
|
||||
&["open", "close", "read", "write", "seek", "fsync", "gc"];
|
||||
const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
|
||||
"open", "close", "read", "write", "seek", "fsync", "gc", "metadata",
|
||||
];
|
||||
|
||||
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
|
||||
|
||||
@@ -201,7 +209,7 @@ pub static NUM_ONDISK_LAYERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
|
||||
// remote storage metrics
|
||||
|
||||
pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_remote_upload_queue_unfinished_tasks",
|
||||
"Number of tasks in the upload queue that are not finished yet.",
|
||||
@@ -210,14 +218,14 @@ pub static REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS: Lazy<IntGaugeVec> = Lazy::new(|
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum RemoteOpKind {
|
||||
Upload,
|
||||
Download,
|
||||
Delete,
|
||||
}
|
||||
impl RemoteOpKind {
|
||||
pub fn as_str(&self) -> &str {
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Upload => "upload",
|
||||
Self::Download => "download",
|
||||
@@ -226,13 +234,13 @@ impl RemoteOpKind {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
|
||||
pub enum RemoteOpFileKind {
|
||||
Layer,
|
||||
Index,
|
||||
}
|
||||
impl RemoteOpFileKind {
|
||||
pub fn as_str(&self) -> &str {
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Layer => "layer",
|
||||
Self::Index => "index",
|
||||
@@ -365,7 +373,7 @@ pub struct TimelineMetrics {
|
||||
pub load_layer_map_histo: Histogram,
|
||||
pub last_record_gauge: IntGauge,
|
||||
pub wait_lsn_time_histo: Histogram,
|
||||
pub current_physical_size_gauge: UIntGauge,
|
||||
pub resident_physical_size_gauge: UIntGauge,
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
pub current_logical_size_gauge: UIntGauge,
|
||||
pub num_persistent_files_created: IntCounter,
|
||||
@@ -406,7 +414,7 @@ impl TimelineMetrics {
|
||||
let wait_lsn_time_histo = WAIT_LSN_TIME
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE
|
||||
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
|
||||
@@ -432,7 +440,7 @@ impl TimelineMetrics {
|
||||
load_layer_map_histo,
|
||||
last_record_gauge,
|
||||
wait_lsn_time_histo,
|
||||
current_physical_size_gauge,
|
||||
resident_physical_size_gauge,
|
||||
current_logical_size_gauge,
|
||||
num_persistent_files_created,
|
||||
persistent_bytes_written,
|
||||
@@ -448,7 +456,7 @@ impl Drop for TimelineMetrics {
|
||||
let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = CURRENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
|
||||
@@ -491,10 +499,114 @@ pub fn remove_tenant_metrics(tenant_id: &TenantId) {
|
||||
|
||||
use futures::Future;
|
||||
use pin_project_lite::pin_project;
|
||||
use std::collections::HashMap;
|
||||
use std::pin::Pin;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::task::{Context, Poll};
|
||||
use std::time::Instant;
|
||||
|
||||
pub struct RemoteTimelineClientMetrics {
|
||||
tenant_id: String,
|
||||
timeline_id: String,
|
||||
remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
|
||||
remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
|
||||
unfinished_tasks: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
|
||||
}
|
||||
|
||||
impl RemoteTimelineClientMetrics {
|
||||
pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
|
||||
RemoteTimelineClientMetrics {
|
||||
tenant_id: tenant_id.to_string(),
|
||||
timeline_id: timeline_id.to_string(),
|
||||
remote_operation_time: Mutex::new(HashMap::default()),
|
||||
unfinished_tasks: Mutex::new(HashMap::default()),
|
||||
remote_physical_size_gauge: Mutex::new(None),
|
||||
}
|
||||
}
|
||||
pub fn remote_physical_size_gauge(&self) -> UIntGauge {
|
||||
let mut guard = self.remote_physical_size_gauge.lock().unwrap();
|
||||
guard
|
||||
.get_or_insert_with(|| {
|
||||
REMOTE_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
])
|
||||
.unwrap()
|
||||
})
|
||||
.clone()
|
||||
}
|
||||
pub fn remote_operation_time(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
status: &'static str,
|
||||
) -> Histogram {
|
||||
// XXX would be nice to have an upgradable RwLock
|
||||
let mut guard = self.remote_operation_time.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str(), status);
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_OPERATION_TIME
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
key.0,
|
||||
key.1,
|
||||
key.2,
|
||||
])
|
||||
.unwrap()
|
||||
});
|
||||
metric.clone()
|
||||
}
|
||||
pub fn unfinished_tasks(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> IntGauge {
|
||||
// XXX would be nice to have an upgradable RwLock
|
||||
let mut guard = self.unfinished_tasks.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
key.0,
|
||||
key.1,
|
||||
])
|
||||
.unwrap()
|
||||
});
|
||||
metric.clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for RemoteTimelineClientMetrics {
|
||||
fn drop(&mut self) {
|
||||
let RemoteTimelineClientMetrics {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
remote_physical_size_gauge,
|
||||
remote_operation_time,
|
||||
unfinished_tasks,
|
||||
} = self;
|
||||
for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
|
||||
}
|
||||
for ((a, b), _) in unfinished_tasks.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS.remove_label_values(&[
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
a,
|
||||
b,
|
||||
]);
|
||||
}
|
||||
{
|
||||
let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
|
||||
let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper future that measures the time spent by a remote storage operation,
|
||||
/// and records the time and success/failure as a prometheus metric.
|
||||
pub trait MeasureRemoteOp: Sized {
|
||||
@@ -504,6 +616,7 @@ pub trait MeasureRemoteOp: Sized {
|
||||
timeline_id: TimelineId,
|
||||
file_kind: RemoteOpFileKind,
|
||||
op: RemoteOpKind,
|
||||
metrics: Arc<RemoteTimelineClientMetrics>,
|
||||
) -> MeasuredRemoteOp<Self> {
|
||||
let start = Instant::now();
|
||||
MeasuredRemoteOp {
|
||||
@@ -513,6 +626,7 @@ pub trait MeasureRemoteOp: Sized {
|
||||
file_kind,
|
||||
op,
|
||||
start,
|
||||
metrics,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -529,6 +643,7 @@ pin_project! {
|
||||
file_kind: RemoteOpFileKind,
|
||||
op: RemoteOpKind,
|
||||
start: Instant,
|
||||
metrics: Arc<RemoteTimelineClientMetrics>,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -541,15 +656,8 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
|
||||
if let Poll::Ready(ref res) = poll_result {
|
||||
let duration = this.start.elapsed();
|
||||
let status = if res.is_ok() { &"success" } else { &"failure" };
|
||||
REMOTE_OPERATION_TIME
|
||||
.get_metric_with_label_values(&[
|
||||
&this.tenant_id.to_string(),
|
||||
&this.timeline_id.to_string(),
|
||||
this.file_kind.as_str(),
|
||||
this.op.as_str(),
|
||||
status,
|
||||
])
|
||||
.unwrap()
|
||||
this.metrics
|
||||
.remote_operation_time(this.file_kind, this.op, status)
|
||||
.observe(duration.as_secs_f64());
|
||||
}
|
||||
poll_result
|
||||
|
||||
@@ -48,10 +48,9 @@ use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
|
||||
use crate::profiling::profpoint_start;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::mgr;
|
||||
use crate::tenant::{Tenant, Timeline};
|
||||
use crate::tenant_mgr;
|
||||
use crate::trace::Tracer;
|
||||
use crate::CheckpointConfig;
|
||||
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
@@ -445,9 +444,7 @@ impl PageServerHandler {
|
||||
pgb.flush().await?;
|
||||
let mut copyin_stream = Box::pin(copyin_stream(pgb));
|
||||
let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
|
||||
tokio::task::block_in_place(|| {
|
||||
import_wal_from_tar(&*timeline, reader, start_lsn, end_lsn)
|
||||
})?;
|
||||
tokio::task::block_in_place(|| import_wal_from_tar(&timeline, reader, start_lsn, end_lsn))?;
|
||||
info!("wal import complete");
|
||||
|
||||
// Drain the rest of the Copy data
|
||||
@@ -466,7 +463,7 @@ impl PageServerHandler {
|
||||
// We only want to persist the data, and it doesn't matter if it's in the
|
||||
// shape of deltas or images.
|
||||
info!("flushing layers");
|
||||
timeline.checkpoint(CheckpointConfig::Flush).await?;
|
||||
timeline.freeze_and_flush().await?;
|
||||
|
||||
info!("done");
|
||||
Ok(())
|
||||
@@ -542,7 +539,10 @@ impl PageServerHandler {
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
|
||||
.await?;
|
||||
|
||||
let exists = timeline.get_rel_exists(req.rel, lsn, req.latest)?;
|
||||
let exists = crate::tenant::with_ondemand_download(|| {
|
||||
timeline.get_rel_exists(req.rel, lsn, req.latest)
|
||||
})
|
||||
.await?;
|
||||
|
||||
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
|
||||
exists,
|
||||
@@ -559,7 +559,10 @@ impl PageServerHandler {
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
|
||||
.await?;
|
||||
|
||||
let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest)?;
|
||||
let n_blocks = crate::tenant::with_ondemand_download(|| {
|
||||
timeline.get_rel_size(req.rel, lsn, req.latest)
|
||||
})
|
||||
.await?;
|
||||
|
||||
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
|
||||
n_blocks,
|
||||
@@ -576,9 +579,10 @@ impl PageServerHandler {
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
|
||||
.await?;
|
||||
|
||||
let total_blocks =
|
||||
timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)?;
|
||||
|
||||
let total_blocks = crate::tenant::with_ondemand_download(|| {
|
||||
timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)
|
||||
})
|
||||
.await?;
|
||||
let db_size = total_blocks as i64 * BLCKSZ as i64;
|
||||
|
||||
Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
|
||||
@@ -604,11 +608,14 @@ impl PageServerHandler {
|
||||
}
|
||||
*/
|
||||
|
||||
// FIXME: this profiling now happens at different place than it used to. The
|
||||
// current profiling is based on a thread-local variable, so it doesn't work
|
||||
// across awaits
|
||||
let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
|
||||
let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)?;
|
||||
let page = crate::tenant::with_ondemand_download(|| {
|
||||
// FIXME: this profiling now happens at different place than it used to. The
|
||||
// current profiling is based on a thread-local variable, so it doesn't work
|
||||
// across awaits
|
||||
let _profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
|
||||
timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)
|
||||
})
|
||||
.await?;
|
||||
|
||||
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
|
||||
page,
|
||||
@@ -649,7 +656,7 @@ impl PageServerHandler {
|
||||
tokio::task::block_in_place(|| {
|
||||
let basebackup =
|
||||
basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?;
|
||||
tracing::Span::current().record("lsn", &basebackup.lsn.to_string().as_str());
|
||||
tracing::Span::current().record("lsn", basebackup.lsn.to_string().as_str());
|
||||
basebackup.send_tarball()
|
||||
})?;
|
||||
pgb.write_message(&BeMessage::CopyDone)?;
|
||||
@@ -941,7 +948,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
/// ensures that queries don't fail immediately after pageserver startup, because
|
||||
/// all tenants are still loading.
|
||||
async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenant>> {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false)?;
|
||||
let tenant = mgr::get_tenant(tenant_id, false).await?;
|
||||
match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await {
|
||||
Ok(wait_result) => wait_result
|
||||
// no .context(), the error message is good enough and some tests depend on it
|
||||
|
||||
@@ -6,11 +6,12 @@
|
||||
//! walingest.rs handles a few things like implicit relation creation and extension.
|
||||
//! Clarify that)
|
||||
//!
|
||||
use super::tenant::PageReconstructResult;
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::repository::*;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::tenant::{with_ondemand_download, Timeline};
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use crate::{repository::*, try_no_ondemand_download};
|
||||
use anyhow::Context;
|
||||
use bytes::{Buf, Bytes};
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
@@ -19,6 +20,7 @@ use postgres_ffi::{Oid, TimestampTz, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::{hash_map, HashMap, HashSet};
|
||||
use std::ops::Range;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, trace, warn};
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
@@ -33,6 +35,14 @@ pub enum LsnForTimestamp {
|
||||
NoData(Lsn),
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum CalculateLogicalSizeError {
|
||||
#[error("cancelled")]
|
||||
Cancelled,
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
///
|
||||
/// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
|
||||
/// and other special kinds of files, in a versioned key-value store. The
|
||||
@@ -88,16 +98,18 @@ impl Timeline {
|
||||
blknum: BlockNumber,
|
||||
lsn: Lsn,
|
||||
latest: bool,
|
||||
) -> Result<Bytes> {
|
||||
ensure!(tag.relnode != 0, "invalid relnode");
|
||||
) -> PageReconstructResult<Bytes> {
|
||||
if tag.relnode == 0 {
|
||||
return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
|
||||
}
|
||||
|
||||
let nblocks = self.get_rel_size(tag, lsn, latest)?;
|
||||
let nblocks = try_no_ondemand_download!(self.get_rel_size(tag, lsn, latest));
|
||||
if blknum >= nblocks {
|
||||
debug!(
|
||||
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
|
||||
tag, blknum, lsn, nblocks
|
||||
);
|
||||
return Ok(ZERO_PAGE.clone());
|
||||
return PageReconstructResult::Success(ZERO_PAGE.clone());
|
||||
}
|
||||
|
||||
let key = rel_block_to_key(tag, blknum);
|
||||
@@ -105,38 +117,51 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// Get size of a database in blocks
|
||||
pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn, latest: bool) -> Result<usize> {
|
||||
pub fn get_db_size(
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
lsn: Lsn,
|
||||
latest: bool,
|
||||
) -> PageReconstructResult<usize> {
|
||||
let mut total_blocks = 0;
|
||||
|
||||
let rels = self.list_rels(spcnode, dbnode, lsn)?;
|
||||
let rels = try_no_ondemand_download!(self.list_rels(spcnode, dbnode, lsn));
|
||||
|
||||
for rel in rels {
|
||||
let n_blocks = self.get_rel_size(rel, lsn, latest)?;
|
||||
let n_blocks = try_no_ondemand_download!(self.get_rel_size(rel, lsn, latest));
|
||||
total_blocks += n_blocks as usize;
|
||||
}
|
||||
Ok(total_blocks)
|
||||
PageReconstructResult::Success(total_blocks)
|
||||
}
|
||||
|
||||
/// Get size of a relation file
|
||||
pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn, latest: bool) -> Result<BlockNumber> {
|
||||
ensure!(tag.relnode != 0, "invalid relnode");
|
||||
pub fn get_rel_size(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
lsn: Lsn,
|
||||
latest: bool,
|
||||
) -> PageReconstructResult<BlockNumber> {
|
||||
if tag.relnode == 0 {
|
||||
return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
|
||||
}
|
||||
|
||||
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
|
||||
return Ok(nblocks);
|
||||
return PageReconstructResult::Success(nblocks);
|
||||
}
|
||||
|
||||
if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
|
||||
&& !self.get_rel_exists(tag, lsn, latest)?
|
||||
&& !try_no_ondemand_download!(self.get_rel_exists(tag, lsn, latest))
|
||||
{
|
||||
// FIXME: Postgres sometimes calls smgrcreate() to create
|
||||
// FSM, and smgrnblocks() on it immediately afterwards,
|
||||
// without extending it. Tolerate that by claiming that
|
||||
// any non-existent FSM fork has size 0.
|
||||
return Ok(0);
|
||||
return PageReconstructResult::Success(0);
|
||||
}
|
||||
|
||||
let key = rel_size_to_key(tag);
|
||||
let mut buf = self.get(key, lsn)?;
|
||||
let mut buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
let nblocks = buf.get_u32_le();
|
||||
|
||||
if latest {
|
||||
@@ -149,43 +174,62 @@ impl Timeline {
|
||||
// associated with most recent value of LSN.
|
||||
self.update_cached_rel_size(tag, lsn, nblocks);
|
||||
}
|
||||
Ok(nblocks)
|
||||
PageReconstructResult::Success(nblocks)
|
||||
}
|
||||
|
||||
/// Does relation exist?
|
||||
pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn, _latest: bool) -> Result<bool> {
|
||||
ensure!(tag.relnode != 0, "invalid relnode");
|
||||
pub fn get_rel_exists(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
lsn: Lsn,
|
||||
_latest: bool,
|
||||
) -> PageReconstructResult<bool> {
|
||||
if tag.relnode == 0 {
|
||||
return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
|
||||
}
|
||||
|
||||
// first try to lookup relation in cache
|
||||
if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
|
||||
return Ok(true);
|
||||
return PageReconstructResult::Success(true);
|
||||
}
|
||||
// fetch directory listing
|
||||
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
|
||||
let buf = self.get(key, lsn)?;
|
||||
let dir = RelDirectory::des(&buf)?;
|
||||
let buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
|
||||
let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some();
|
||||
|
||||
Ok(exists)
|
||||
match RelDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some();
|
||||
PageReconstructResult::Success(exists)
|
||||
}
|
||||
Err(e) => PageReconstructResult::from(e),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a list of all existing relations in given tablespace and database.
|
||||
pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<HashSet<RelTag>> {
|
||||
pub fn list_rels(
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
lsn: Lsn,
|
||||
) -> PageReconstructResult<HashSet<RelTag>> {
|
||||
// fetch directory listing
|
||||
let key = rel_dir_to_key(spcnode, dbnode);
|
||||
let buf = self.get(key, lsn)?;
|
||||
let dir = RelDirectory::des(&buf)?;
|
||||
let buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
|
||||
let rels: HashSet<RelTag> =
|
||||
HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode: *relnode,
|
||||
forknum: *forknum,
|
||||
}));
|
||||
match RelDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
let rels: HashSet<RelTag> =
|
||||
HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode: *relnode,
|
||||
forknum: *forknum,
|
||||
}));
|
||||
|
||||
Ok(rels)
|
||||
PageReconstructResult::Success(rels)
|
||||
}
|
||||
Err(e) => PageReconstructResult::from(e),
|
||||
}
|
||||
}
|
||||
|
||||
/// Look up given SLRU page version.
|
||||
@@ -195,7 +239,7 @@ impl Timeline {
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
lsn: Lsn,
|
||||
) -> Result<Bytes> {
|
||||
) -> PageReconstructResult<Bytes> {
|
||||
let key = slru_block_to_key(kind, segno, blknum);
|
||||
self.get(key, lsn)
|
||||
}
|
||||
@@ -206,21 +250,30 @@ impl Timeline {
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
lsn: Lsn,
|
||||
) -> Result<BlockNumber> {
|
||||
) -> PageReconstructResult<BlockNumber> {
|
||||
let key = slru_segment_size_to_key(kind, segno);
|
||||
let mut buf = self.get(key, lsn)?;
|
||||
Ok(buf.get_u32_le())
|
||||
let mut buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
PageReconstructResult::Success(buf.get_u32_le())
|
||||
}
|
||||
|
||||
/// Get size of an SLRU segment
|
||||
pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result<bool> {
|
||||
pub fn get_slru_segment_exists(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
lsn: Lsn,
|
||||
) -> PageReconstructResult<bool> {
|
||||
// fetch directory listing
|
||||
let key = slru_dir_to_key(kind);
|
||||
let buf = self.get(key, lsn)?;
|
||||
let dir = SlruSegmentDirectory::des(&buf)?;
|
||||
let buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
|
||||
let exists = dir.segments.get(&segno).is_some();
|
||||
Ok(exists)
|
||||
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
let exists = dir.segments.get(&segno).is_some();
|
||||
PageReconstructResult::Success(exists)
|
||||
}
|
||||
Err(e) => PageReconstructResult::from(e),
|
||||
}
|
||||
}
|
||||
|
||||
/// Locate LSN, such that all transactions that committed before
|
||||
@@ -230,7 +283,10 @@ impl Timeline {
|
||||
/// so it's not well defined which LSN you get if there were multiple commits
|
||||
/// "in flight" at that point in time.
|
||||
///
|
||||
pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result<LsnForTimestamp> {
|
||||
pub fn find_lsn_for_timestamp(
|
||||
&self,
|
||||
search_timestamp: TimestampTz,
|
||||
) -> PageReconstructResult<LsnForTimestamp> {
|
||||
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
|
||||
let min_lsn = *gc_cutoff_lsn_guard;
|
||||
let max_lsn = self.get_last_record_lsn();
|
||||
@@ -246,12 +302,12 @@ impl Timeline {
|
||||
// cannot overflow, high and low are both smaller than u64::MAX / 2
|
||||
let mid = (high + low) / 2;
|
||||
|
||||
let cmp = self.is_latest_commit_timestamp_ge_than(
|
||||
let cmp = try_no_ondemand_download!(self.is_latest_commit_timestamp_ge_than(
|
||||
search_timestamp,
|
||||
Lsn(mid * 8),
|
||||
&mut found_smaller,
|
||||
&mut found_larger,
|
||||
)?;
|
||||
));
|
||||
|
||||
if cmp {
|
||||
high = mid;
|
||||
@@ -263,15 +319,15 @@ impl Timeline {
|
||||
(false, false) => {
|
||||
// This can happen if no commit records have been processed yet, e.g.
|
||||
// just after importing a cluster.
|
||||
Ok(LsnForTimestamp::NoData(max_lsn))
|
||||
PageReconstructResult::Success(LsnForTimestamp::NoData(max_lsn))
|
||||
}
|
||||
(true, false) => {
|
||||
// Didn't find any commit timestamps larger than the request
|
||||
Ok(LsnForTimestamp::Future(max_lsn))
|
||||
PageReconstructResult::Success(LsnForTimestamp::Future(max_lsn))
|
||||
}
|
||||
(false, true) => {
|
||||
// Didn't find any commit timestamps smaller than the request
|
||||
Ok(LsnForTimestamp::Past(max_lsn))
|
||||
PageReconstructResult::Success(LsnForTimestamp::Past(max_lsn))
|
||||
}
|
||||
(true, true) => {
|
||||
// low is the LSN of the first commit record *after* the search_timestamp,
|
||||
@@ -281,7 +337,7 @@ impl Timeline {
|
||||
// Otherwise, if you restore to the returned LSN, the database will
|
||||
// include physical changes from later commits that will be marked
|
||||
// as aborted, and will need to be vacuumed away.
|
||||
Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
|
||||
PageReconstructResult::Success(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -299,12 +355,20 @@ impl Timeline {
|
||||
probe_lsn: Lsn,
|
||||
found_smaller: &mut bool,
|
||||
found_larger: &mut bool,
|
||||
) -> Result<bool> {
|
||||
for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn)? {
|
||||
let nblocks = self.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)?;
|
||||
) -> PageReconstructResult<bool> {
|
||||
for segno in try_no_ondemand_download!(self.list_slru_segments(SlruKind::Clog, probe_lsn)) {
|
||||
let nblocks = try_no_ondemand_download!(self.get_slru_segment_size(
|
||||
SlruKind::Clog,
|
||||
segno,
|
||||
probe_lsn
|
||||
));
|
||||
for blknum in (0..nblocks).rev() {
|
||||
let clog_page =
|
||||
self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?;
|
||||
let clog_page = try_no_ondemand_download!(self.get_slru_page_at_lsn(
|
||||
SlruKind::Clog,
|
||||
segno,
|
||||
blknum,
|
||||
probe_lsn
|
||||
));
|
||||
|
||||
if clog_page.len() == BLCKSZ as usize + 8 {
|
||||
let mut timestamp_bytes = [0u8; 8];
|
||||
@@ -313,61 +377,75 @@ impl Timeline {
|
||||
|
||||
if timestamp >= search_timestamp {
|
||||
*found_larger = true;
|
||||
return Ok(true);
|
||||
return PageReconstructResult::Success(true);
|
||||
} else {
|
||||
*found_smaller = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(false)
|
||||
PageReconstructResult::Success(false)
|
||||
}
|
||||
|
||||
/// Get a list of SLRU segments
|
||||
pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result<HashSet<u32>> {
|
||||
pub fn list_slru_segments(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
lsn: Lsn,
|
||||
) -> PageReconstructResult<HashSet<u32>> {
|
||||
// fetch directory entry
|
||||
let key = slru_dir_to_key(kind);
|
||||
|
||||
let buf = self.get(key, lsn)?;
|
||||
let dir = SlruSegmentDirectory::des(&buf)?;
|
||||
|
||||
Ok(dir.segments)
|
||||
let buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => PageReconstructResult::Success(dir.segments),
|
||||
Err(e) => PageReconstructResult::from(e),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<Bytes> {
|
||||
pub fn get_relmap_file(
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
lsn: Lsn,
|
||||
) -> PageReconstructResult<Bytes> {
|
||||
let key = relmap_file_key(spcnode, dbnode);
|
||||
|
||||
let buf = self.get(key, lsn)?;
|
||||
Ok(buf)
|
||||
let buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
PageReconstructResult::Success(buf)
|
||||
}
|
||||
|
||||
pub fn list_dbdirs(&self, lsn: Lsn) -> Result<HashMap<(Oid, Oid), bool>> {
|
||||
pub fn list_dbdirs(&self, lsn: Lsn) -> PageReconstructResult<HashMap<(Oid, Oid), bool>> {
|
||||
// fetch directory entry
|
||||
let buf = self.get(DBDIR_KEY, lsn)?;
|
||||
let dir = DbDirectory::des(&buf)?;
|
||||
let buf = try_no_ondemand_download!(self.get(DBDIR_KEY, lsn));
|
||||
|
||||
Ok(dir.dbdirs)
|
||||
match DbDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => PageReconstructResult::Success(dir.dbdirs),
|
||||
Err(e) => PageReconstructResult::from(e),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result<Bytes> {
|
||||
pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> PageReconstructResult<Bytes> {
|
||||
let key = twophase_file_key(xid);
|
||||
let buf = self.get(key, lsn)?;
|
||||
Ok(buf)
|
||||
let buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
PageReconstructResult::Success(buf)
|
||||
}
|
||||
|
||||
pub fn list_twophase_files(&self, lsn: Lsn) -> Result<HashSet<TransactionId>> {
|
||||
pub fn list_twophase_files(&self, lsn: Lsn) -> PageReconstructResult<HashSet<TransactionId>> {
|
||||
// fetch directory entry
|
||||
let buf = self.get(TWOPHASEDIR_KEY, lsn)?;
|
||||
let dir = TwoPhaseDirectory::des(&buf)?;
|
||||
let buf = try_no_ondemand_download!(self.get(TWOPHASEDIR_KEY, lsn));
|
||||
|
||||
Ok(dir.xids)
|
||||
match TwoPhaseDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => PageReconstructResult::Success(dir.xids),
|
||||
Err(e) => PageReconstructResult::from(e),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_control_file(&self, lsn: Lsn) -> Result<Bytes> {
|
||||
pub fn get_control_file(&self, lsn: Lsn) -> PageReconstructResult<Bytes> {
|
||||
self.get(CONTROLFILE_KEY, lsn)
|
||||
}
|
||||
|
||||
pub fn get_checkpoint(&self, lsn: Lsn) -> Result<Bytes> {
|
||||
pub fn get_checkpoint(&self, lsn: Lsn) -> PageReconstructResult<Bytes> {
|
||||
self.get(CHECKPOINT_KEY, lsn)
|
||||
}
|
||||
|
||||
@@ -376,16 +454,26 @@ impl Timeline {
|
||||
///
|
||||
/// Only relation blocks are counted currently. That excludes metadata,
|
||||
/// SLRUs, twophase files etc.
|
||||
pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<u64> {
|
||||
pub async fn get_current_logical_size_non_incremental(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<u64, CalculateLogicalSizeError> {
|
||||
// Fetch list of database dirs and iterate them
|
||||
let buf = self.get(DBDIR_KEY, lsn)?;
|
||||
let dbdir = DbDirectory::des(&buf)?;
|
||||
let buf = self.get_download(DBDIR_KEY, lsn).await?;
|
||||
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
|
||||
|
||||
let mut total_size: u64 = 0;
|
||||
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
|
||||
for rel in self.list_rels(*spcnode, *dbnode, lsn)? {
|
||||
for rel in
|
||||
crate::tenant::with_ondemand_download(|| self.list_rels(*spcnode, *dbnode, lsn))
|
||||
.await?
|
||||
{
|
||||
if cancel.is_cancelled() {
|
||||
return Err(CalculateLogicalSizeError::Cancelled);
|
||||
}
|
||||
let relsize_key = rel_size_to_key(rel);
|
||||
let mut buf = self.get(relsize_key, lsn)?;
|
||||
let mut buf = self.get_download(relsize_key, lsn).await?;
|
||||
let relsize = buf.get_u32_le();
|
||||
|
||||
total_size += relsize as u64;
|
||||
@@ -398,7 +486,7 @@ impl Timeline {
|
||||
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
|
||||
/// Anything that's not listed maybe removed from the underlying storage (from
|
||||
/// that LSN forwards).
|
||||
pub fn collect_keyspace(&self, lsn: Lsn) -> Result<KeySpace> {
|
||||
pub async fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result<KeySpace> {
|
||||
// Iterate through key ranges, greedily packing them into partitions
|
||||
let mut result = KeySpaceAccum::new();
|
||||
|
||||
@@ -406,8 +494,8 @@ impl Timeline {
|
||||
result.add_key(DBDIR_KEY);
|
||||
|
||||
// Fetch list of database dirs and iterate them
|
||||
let buf = self.get(DBDIR_KEY, lsn)?;
|
||||
let dbdir = DbDirectory::des(&buf)?;
|
||||
let buf = self.get_download(DBDIR_KEY, lsn).await?;
|
||||
let dbdir = DbDirectory::des(&buf).context("deserialization failure")?;
|
||||
|
||||
let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
|
||||
dbs.sort_unstable();
|
||||
@@ -415,15 +503,15 @@ impl Timeline {
|
||||
result.add_key(relmap_file_key(spcnode, dbnode));
|
||||
result.add_key(rel_dir_to_key(spcnode, dbnode));
|
||||
|
||||
let mut rels: Vec<RelTag> = self
|
||||
.list_rels(spcnode, dbnode, lsn)?
|
||||
.iter()
|
||||
.cloned()
|
||||
.collect();
|
||||
let mut rels: Vec<RelTag> =
|
||||
with_ondemand_download(|| self.list_rels(spcnode, dbnode, lsn))
|
||||
.await?
|
||||
.into_iter()
|
||||
.collect();
|
||||
rels.sort_unstable();
|
||||
for rel in rels {
|
||||
let relsize_key = rel_size_to_key(rel);
|
||||
let mut buf = self.get(relsize_key, lsn)?;
|
||||
let mut buf = self.get_download(relsize_key, lsn).await?;
|
||||
let relsize = buf.get_u32_le();
|
||||
|
||||
result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize));
|
||||
@@ -439,13 +527,13 @@ impl Timeline {
|
||||
] {
|
||||
let slrudir_key = slru_dir_to_key(kind);
|
||||
result.add_key(slrudir_key);
|
||||
let buf = self.get(slrudir_key, lsn)?;
|
||||
let dir = SlruSegmentDirectory::des(&buf)?;
|
||||
let buf = self.get_download(slrudir_key, lsn).await?;
|
||||
let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?;
|
||||
let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
|
||||
segments.sort_unstable();
|
||||
for segno in segments {
|
||||
let segsize_key = slru_segment_size_to_key(kind, segno);
|
||||
let mut buf = self.get(segsize_key, lsn)?;
|
||||
let mut buf = self.get_download(segsize_key, lsn).await?;
|
||||
let segsize = buf.get_u32_le();
|
||||
|
||||
result.add_range(
|
||||
@@ -457,8 +545,8 @@ impl Timeline {
|
||||
|
||||
// Then pg_twophase
|
||||
result.add_key(TWOPHASEDIR_KEY);
|
||||
let buf = self.get(TWOPHASEDIR_KEY, lsn)?;
|
||||
let twophase_dir = TwoPhaseDirectory::des(&buf)?;
|
||||
let buf = self.get_download(TWOPHASEDIR_KEY, lsn).await?;
|
||||
let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?;
|
||||
let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
|
||||
xids.sort_unstable();
|
||||
for xid in xids {
|
||||
@@ -537,7 +625,7 @@ impl<'a> DatadirModification<'a> {
|
||||
///
|
||||
/// This inserts the directory metadata entries that are assumed to
|
||||
/// always exist.
|
||||
pub fn init_empty(&mut self) -> Result<()> {
|
||||
pub fn init_empty(&mut self) -> anyhow::Result<()> {
|
||||
let buf = DbDirectory::ser(&DbDirectory {
|
||||
dbdirs: HashMap::new(),
|
||||
})?;
|
||||
@@ -570,8 +658,8 @@ impl<'a> DatadirModification<'a> {
|
||||
rel: RelTag,
|
||||
blknum: BlockNumber,
|
||||
rec: NeonWalRecord,
|
||||
) -> Result<()> {
|
||||
ensure!(rel.relnode != 0, "invalid relnode");
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
|
||||
Ok(())
|
||||
}
|
||||
@@ -583,7 +671,7 @@ impl<'a> DatadirModification<'a> {
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
rec: NeonWalRecord,
|
||||
) -> Result<()> {
|
||||
) -> anyhow::Result<()> {
|
||||
self.put(
|
||||
slru_block_to_key(kind, segno, blknum),
|
||||
Value::WalRecord(rec),
|
||||
@@ -597,8 +685,8 @@ impl<'a> DatadirModification<'a> {
|
||||
rel: RelTag,
|
||||
blknum: BlockNumber,
|
||||
img: Bytes,
|
||||
) -> Result<()> {
|
||||
ensure!(rel.relnode != 0, "invalid relnode");
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
self.put(rel_block_to_key(rel, blknum), Value::Image(img));
|
||||
Ok(())
|
||||
}
|
||||
@@ -609,26 +697,26 @@ impl<'a> DatadirModification<'a> {
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
img: Bytes,
|
||||
) -> Result<()> {
|
||||
) -> anyhow::Result<()> {
|
||||
self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Store a relmapper file (pg_filenode.map) in the repository
|
||||
pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> Result<()> {
|
||||
pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> anyhow::Result<()> {
|
||||
// Add it to the directory (if it doesn't exist already)
|
||||
let buf = self.get(DBDIR_KEY)?;
|
||||
let buf = self.get(DBDIR_KEY).no_ondemand_download()?;
|
||||
let mut dbdir = DbDirectory::des(&buf)?;
|
||||
|
||||
let r = dbdir.dbdirs.insert((spcnode, dbnode), true);
|
||||
if r == None || r == Some(false) {
|
||||
if r.is_none() || r == Some(false) {
|
||||
// The dbdir entry didn't exist, or it contained a
|
||||
// 'false'. The 'insert' call already updated it with
|
||||
// 'true', now write the updated 'dbdirs' map back.
|
||||
let buf = DbDirectory::ser(&dbdir)?;
|
||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||
}
|
||||
if r == None {
|
||||
if r.is_none() {
|
||||
// Create RelDirectory
|
||||
let buf = RelDirectory::ser(&RelDirectory {
|
||||
rels: HashSet::new(),
|
||||
@@ -643,12 +731,12 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> Result<()> {
|
||||
pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> anyhow::Result<()> {
|
||||
// Add it to the directory entry
|
||||
let buf = self.get(TWOPHASEDIR_KEY)?;
|
||||
let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?;
|
||||
let mut dir = TwoPhaseDirectory::des(&buf)?;
|
||||
if !dir.xids.insert(xid) {
|
||||
bail!("twophase file for xid {} already exists", xid);
|
||||
anyhow::bail!("twophase file for xid {} already exists", xid);
|
||||
}
|
||||
self.put(
|
||||
TWOPHASEDIR_KEY,
|
||||
@@ -659,23 +747,26 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn put_control_file(&mut self, img: Bytes) -> Result<()> {
|
||||
pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
|
||||
self.put(CONTROLFILE_KEY, Value::Image(img));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn put_checkpoint(&mut self, img: Bytes) -> Result<()> {
|
||||
pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> {
|
||||
self.put(CHECKPOINT_KEY, Value::Image(img));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> {
|
||||
pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> {
|
||||
let req_lsn = self.tline.get_last_record_lsn();
|
||||
|
||||
let total_blocks = self.tline.get_db_size(spcnode, dbnode, req_lsn, true)?;
|
||||
let total_blocks = self
|
||||
.tline
|
||||
.get_db_size(spcnode, dbnode, req_lsn, true)
|
||||
.no_ondemand_download()?;
|
||||
|
||||
// Remove entry from dbdir
|
||||
let buf = self.get(DBDIR_KEY)?;
|
||||
let buf = self.get(DBDIR_KEY).no_ondemand_download()?;
|
||||
let mut dir = DbDirectory::des(&buf)?;
|
||||
if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
|
||||
let buf = DbDirectory::ser(&dir)?;
|
||||
@@ -698,11 +789,11 @@ impl<'a> DatadirModification<'a> {
|
||||
/// Create a relation fork.
|
||||
///
|
||||
/// 'nblocks' is the initial size.
|
||||
pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
|
||||
ensure!(rel.relnode != 0, "invalid relnode");
|
||||
pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
// It's possible that this is the first rel for this db in this
|
||||
// tablespace. Create the reldir entry for it if so.
|
||||
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY)?)?;
|
||||
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).no_ondemand_download()?)?;
|
||||
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
|
||||
let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
|
||||
// Didn't exist. Update dbdir
|
||||
@@ -714,12 +805,12 @@ impl<'a> DatadirModification<'a> {
|
||||
RelDirectory::default()
|
||||
} else {
|
||||
// reldir already exists, fetch it
|
||||
RelDirectory::des(&self.get(rel_dir_key)?)?
|
||||
RelDirectory::des(&self.get(rel_dir_key).no_ondemand_download()?)?
|
||||
};
|
||||
|
||||
// Add the new relation to the rel directory entry, and write it back
|
||||
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
|
||||
bail!("rel {} already exists", rel);
|
||||
anyhow::bail!("rel {rel} already exists");
|
||||
}
|
||||
self.put(
|
||||
rel_dir_key,
|
||||
@@ -742,13 +833,17 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
/// Truncate relation
|
||||
pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
|
||||
ensure!(rel.relnode != 0, "invalid relnode");
|
||||
pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
let last_lsn = self.tline.get_last_record_lsn();
|
||||
if self.tline.get_rel_exists(rel, last_lsn, true)? {
|
||||
if self
|
||||
.tline
|
||||
.get_rel_exists(rel, last_lsn, true)
|
||||
.no_ondemand_download()?
|
||||
{
|
||||
let size_key = rel_size_to_key(rel);
|
||||
// Fetch the old size first
|
||||
let old_size = self.get(size_key)?.get_u32_le();
|
||||
let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
|
||||
|
||||
// Update the entry with the new size.
|
||||
let buf = nblocks.to_le_bytes();
|
||||
@@ -768,12 +863,12 @@ impl<'a> DatadirModification<'a> {
|
||||
|
||||
/// Extend relation
|
||||
/// If new size is smaller, do nothing.
|
||||
pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
|
||||
ensure!(rel.relnode != 0, "invalid relnode");
|
||||
pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
|
||||
// Put size
|
||||
let size_key = rel_size_to_key(rel);
|
||||
let old_size = self.get(size_key)?.get_u32_le();
|
||||
let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
|
||||
|
||||
// only extend relation here. never decrease the size
|
||||
if nblocks > old_size {
|
||||
@@ -789,12 +884,12 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
/// Drop a relation.
|
||||
pub fn put_rel_drop(&mut self, rel: RelTag) -> Result<()> {
|
||||
ensure!(rel.relnode != 0, "invalid relnode");
|
||||
pub fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
|
||||
// Remove it from the directory entry
|
||||
let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
|
||||
let buf = self.get(dir_key)?;
|
||||
let buf = self.get(dir_key).no_ondemand_download()?;
|
||||
let mut dir = RelDirectory::des(&buf)?;
|
||||
|
||||
if dir.rels.remove(&(rel.relnode, rel.forknum)) {
|
||||
@@ -805,7 +900,7 @@ impl<'a> DatadirModification<'a> {
|
||||
|
||||
// update logical size
|
||||
let size_key = rel_size_to_key(rel);
|
||||
let old_size = self.get(size_key)?.get_u32_le();
|
||||
let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
|
||||
self.pending_nblocks -= old_size as i64;
|
||||
|
||||
// Remove enty from relation size cache
|
||||
@@ -822,14 +917,14 @@ impl<'a> DatadirModification<'a> {
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
nblocks: BlockNumber,
|
||||
) -> Result<()> {
|
||||
) -> anyhow::Result<()> {
|
||||
// Add it to the directory entry
|
||||
let dir_key = slru_dir_to_key(kind);
|
||||
let buf = self.get(dir_key)?;
|
||||
let buf = self.get(dir_key).no_ondemand_download()?;
|
||||
let mut dir = SlruSegmentDirectory::des(&buf)?;
|
||||
|
||||
if !dir.segments.insert(segno) {
|
||||
bail!("slru segment {:?}/{} already exists", kind, segno);
|
||||
anyhow::bail!("slru segment {kind:?}/{segno} already exists");
|
||||
}
|
||||
self.put(
|
||||
dir_key,
|
||||
@@ -852,7 +947,7 @@ impl<'a> DatadirModification<'a> {
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
nblocks: BlockNumber,
|
||||
) -> Result<()> {
|
||||
) -> anyhow::Result<()> {
|
||||
// Put size
|
||||
let size_key = slru_segment_size_to_key(kind, segno);
|
||||
let buf = nblocks.to_le_bytes();
|
||||
@@ -861,10 +956,10 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
/// This method is used for marking truncated SLRU files
|
||||
pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> Result<()> {
|
||||
pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> {
|
||||
// Remove it from the directory entry
|
||||
let dir_key = slru_dir_to_key(kind);
|
||||
let buf = self.get(dir_key)?;
|
||||
let buf = self.get(dir_key).no_ondemand_download()?;
|
||||
let mut dir = SlruSegmentDirectory::des(&buf)?;
|
||||
|
||||
if !dir.segments.remove(&segno) {
|
||||
@@ -882,15 +977,15 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
/// Drop a relmapper file (pg_filenode.map)
|
||||
pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<()> {
|
||||
pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> {
|
||||
// TODO
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This method is used for marking truncated SLRU files
|
||||
pub fn drop_twophase_file(&mut self, xid: TransactionId) -> Result<()> {
|
||||
pub fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
|
||||
// Remove it from the directory entry
|
||||
let buf = self.get(TWOPHASEDIR_KEY)?;
|
||||
let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?;
|
||||
let mut dir = TwoPhaseDirectory::des(&buf)?;
|
||||
|
||||
if !dir.xids.remove(&xid) {
|
||||
@@ -925,7 +1020,7 @@ impl<'a> DatadirModification<'a> {
|
||||
/// retains all the metadata, but data pages are flushed. That's again OK
|
||||
/// for bulk import, where you are just loading data pages and won't try to
|
||||
/// modify the same pages twice.
|
||||
pub fn flush(&mut self) -> Result<()> {
|
||||
pub fn flush(&mut self) -> anyhow::Result<()> {
|
||||
// Unless we have accumulated a decent amount of changes, it's not worth it
|
||||
// to scan through the pending_updates list.
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
@@ -936,7 +1031,7 @@ impl<'a> DatadirModification<'a> {
|
||||
let writer = self.tline.writer();
|
||||
|
||||
// Flush relation and SLRU data blocks, keep metadata.
|
||||
let mut result: Result<()> = Ok(());
|
||||
let mut result: anyhow::Result<()> = Ok(());
|
||||
self.pending_updates.retain(|&key, value| {
|
||||
if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
|
||||
result = writer.put(key, self.lsn, value);
|
||||
@@ -984,7 +1079,7 @@ impl<'a> DatadirModification<'a> {
|
||||
|
||||
// Internal helper functions to batch the modifications
|
||||
|
||||
fn get(&self, key: Key) -> Result<Bytes> {
|
||||
fn get(&self, key: Key) -> PageReconstructResult<Bytes> {
|
||||
// Have we already updated the same key? Read the pending updated
|
||||
// version in that case.
|
||||
//
|
||||
@@ -992,14 +1087,14 @@ impl<'a> DatadirModification<'a> {
|
||||
// value that has been removed, deletion only avoids leaking storage.
|
||||
if let Some(value) = self.pending_updates.get(&key) {
|
||||
if let Value::Image(img) = value {
|
||||
Ok(img.clone())
|
||||
PageReconstructResult::Success(img.clone())
|
||||
} else {
|
||||
// Currently, we never need to read back a WAL record that we
|
||||
// inserted in the same "transaction". All the metadata updates
|
||||
// work directly with Images, and we never need to read actual
|
||||
// data pages. We could handle this if we had to, by calling
|
||||
// the walredo manager, but let's keep it simple for now.
|
||||
bail!("unexpected pending WAL record");
|
||||
PageReconstructResult::from(anyhow::anyhow!("unexpected pending WAL record"))
|
||||
}
|
||||
} else {
|
||||
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
|
||||
@@ -1327,7 +1422,7 @@ fn twophase_key_range(xid: TransactionId) -> Range<Key> {
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: if overflowed { 1 } else { 0 },
|
||||
field5: u8::from(overflowed),
|
||||
field6: next_xid,
|
||||
}
|
||||
}
|
||||
@@ -1354,7 +1449,7 @@ const CHECKPOINT_KEY: Key = Key {
|
||||
// Reverse mappings for a few Keys.
|
||||
// These are needed by WAL redo manager.
|
||||
|
||||
pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> {
|
||||
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
Ok(match key.field1 {
|
||||
0x00 => (
|
||||
RelTag {
|
||||
@@ -1365,7 +1460,7 @@ pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> {
|
||||
},
|
||||
key.field6,
|
||||
),
|
||||
_ => bail!("unexpected value kind 0x{:02x}", key.field1),
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1384,21 +1479,21 @@ pub fn is_rel_vm_block_key(key: Key) -> bool {
|
||||
&& key.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
|
||||
pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
|
||||
Ok(match key.field1 {
|
||||
0x01 => {
|
||||
let kind = match key.field2 {
|
||||
0x00 => SlruKind::Clog,
|
||||
0x01 => SlruKind::MultiXactMembers,
|
||||
0x02 => SlruKind::MultiXactOffsets,
|
||||
_ => bail!("unrecognized slru kind 0x{:02x}", key.field2),
|
||||
_ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
|
||||
};
|
||||
let segno = key.field4;
|
||||
let blknum = key.field6;
|
||||
|
||||
(kind, segno, blknum)
|
||||
}
|
||||
_ => bail!("unexpected value kind 0x{:02x}", key.field1),
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1413,7 +1508,7 @@ pub fn create_test_timeline(
|
||||
tenant: &crate::tenant::Tenant,
|
||||
timeline_id: utils::id::TimelineId,
|
||||
pg_version: u32,
|
||||
) -> Result<std::sync::Arc<Timeline>> {
|
||||
) -> anyhow::Result<std::sync::Arc<Timeline>> {
|
||||
let tline = tenant
|
||||
.create_empty_timeline(timeline_id, Lsn(8), pg_version)?
|
||||
.initialize()?;
|
||||
|
||||
@@ -1,38 +0,0 @@
|
||||
//! Helper functions to delete files from remote storage with a RemoteStorage
|
||||
use anyhow::Context;
|
||||
use std::path::Path;
|
||||
use tracing::debug;
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
|
||||
pub(super) async fn delete_layer(
|
||||
storage: &GenericRemoteStorage,
|
||||
local_layer_path: &Path,
|
||||
) -> anyhow::Result<()> {
|
||||
fail::fail_point!("before-delete-layer", |_| {
|
||||
anyhow::bail!("failpoint before-delete-layer")
|
||||
});
|
||||
debug!(
|
||||
"Deleting layer from remote storage: {:?}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
|
||||
let storage_path = storage
|
||||
.remote_object_id(local_layer_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
local_layer_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// XXX: If the deletion fails because the object already didn't exist,
|
||||
// it would be good to just issue a warning but consider it success.
|
||||
// https://github.com/neondatabase/neon/issues/2934
|
||||
storage.delete(&storage_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to delete remote layer from storage at '{:?}'",
|
||||
storage_path
|
||||
)
|
||||
})
|
||||
}
|
||||
@@ -1,257 +0,0 @@
|
||||
//! Helper functions to download files from remote storage with a RemoteStorage
|
||||
use std::collections::HashSet;
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use tokio::fs;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::storage_sync::index::LayerFileMetadata;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::index::IndexPart;
|
||||
use super::RelativePath;
|
||||
|
||||
async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
|
||||
fs::File::open(path).await?.sync_all().await
|
||||
}
|
||||
|
||||
///
|
||||
/// If 'metadata' is given, we will validate that the downloaded file's size matches that
|
||||
/// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
|
||||
///
|
||||
/// Returns the size of the downloaded file.
|
||||
pub async fn download_layer_file<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
path: &'a RelativePath,
|
||||
layer_metadata: &'a LayerFileMetadata,
|
||||
) -> anyhow::Result<u64> {
|
||||
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
|
||||
|
||||
let local_path = path.to_local_path(&timeline_path);
|
||||
|
||||
let layer_storage_path = storage.remote_object_id(&local_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
local_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// Perform a rename inspired by durable_rename from file_utils.c.
|
||||
// The sequence:
|
||||
// write(tmp)
|
||||
// fsync(tmp)
|
||||
// rename(tmp, new)
|
||||
// fsync(new)
|
||||
// fsync(parent)
|
||||
// For more context about durable_rename check this email from postgres mailing list:
|
||||
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
|
||||
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
|
||||
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
|
||||
|
||||
// TODO: this doesn't use the cached fd for some reason?
|
||||
let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create a destination file for layer '{}'",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
let mut download = storage
|
||||
.download(&layer_storage_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open a download stream for layer with remote storage path '{layer_storage_path:?}'"
|
||||
)
|
||||
})?;
|
||||
let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to download layer with remote storage path '{layer_storage_path:?}' into file '{}'", temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
||||
// A file will not be closed immediately when it goes out of scope if there are any IO operations
|
||||
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
|
||||
// you should call flush before dropping it.
|
||||
//
|
||||
// From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
|
||||
// we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
|
||||
// But for additional safety lets check/wait for any pending operations.
|
||||
destination_file.flush().await.with_context(|| {
|
||||
format!(
|
||||
"failed to flush source file at {}",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
match layer_metadata.file_size() {
|
||||
Some(expected) if expected != bytes_amount => {
|
||||
anyhow::bail!(
|
||||
"According to layer file metadata should had downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
|
||||
temp_file_path.display()
|
||||
);
|
||||
}
|
||||
Some(_) | None => {
|
||||
// matches, or upgrading from an earlier IndexPart version
|
||||
}
|
||||
}
|
||||
|
||||
// not using sync_data because it can lose file size update
|
||||
destination_file.sync_all().await.with_context(|| {
|
||||
format!(
|
||||
"failed to fsync source file at {}",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
drop(destination_file);
|
||||
|
||||
fail::fail_point!("remote-storage-download-pre-rename", |_| {
|
||||
bail!("remote-storage-download-pre-rename failpoint triggered")
|
||||
});
|
||||
|
||||
fs::rename(&temp_file_path, &local_path).await?;
|
||||
|
||||
fsync_path(&local_path)
|
||||
.await
|
||||
.with_context(|| format!("Could not fsync layer file {}", local_path.display(),))?;
|
||||
|
||||
tracing::info!("download complete: {}", local_path.display());
|
||||
|
||||
Ok(bytes_amount)
|
||||
}
|
||||
|
||||
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
|
||||
|
||||
pub fn is_temp_download_file(path: &Path) -> bool {
|
||||
let extension = path.extension().map(|pname| {
|
||||
pname
|
||||
.to_str()
|
||||
.expect("paths passed to this function must be valid Rust strings")
|
||||
});
|
||||
match extension {
|
||||
Some(TEMP_DOWNLOAD_EXTENSION) => true,
|
||||
Some(_) => false,
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// List timelines of given tenant in remote storage
|
||||
pub async fn list_remote_timelines<'a>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<Vec<(TimelineId, IndexPart)>> {
|
||||
let tenant_path = conf.timelines_path(&tenant_id);
|
||||
let tenant_storage_path = storage.remote_object_id(&tenant_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get tenant storage path for local path '{}'",
|
||||
tenant_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let timelines = storage
|
||||
.list_prefixes(Some(&tenant_storage_path))
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to list tenant storage path {tenant_storage_path:?} to get remote timelines to download"
|
||||
)
|
||||
})?;
|
||||
|
||||
if timelines.is_empty() {
|
||||
anyhow::bail!("no timelines found on the remote storage")
|
||||
}
|
||||
|
||||
let mut timeline_ids = HashSet::new();
|
||||
let mut part_downloads = FuturesUnordered::new();
|
||||
|
||||
for timeline_remote_storage_key in timelines {
|
||||
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
|
||||
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
|
||||
})?;
|
||||
|
||||
let timeline_id: TimelineId = object_name.parse().with_context(|| {
|
||||
format!("failed to parse object name into timeline id '{object_name}'")
|
||||
})?;
|
||||
|
||||
// list_prefixes returns all files with the prefix. If we haven't seen this timeline ID
|
||||
// yet, launch a download task for it.
|
||||
if !timeline_ids.contains(&timeline_id) {
|
||||
timeline_ids.insert(timeline_id);
|
||||
let storage_clone = storage.clone();
|
||||
part_downloads.push(async move {
|
||||
(
|
||||
timeline_id,
|
||||
download_index_part(conf, &storage_clone, tenant_id, timeline_id).await,
|
||||
)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for all the download tasks to complete.
|
||||
let mut timeline_parts = Vec::new();
|
||||
while let Some((timeline_id, part_upload_result)) = part_downloads.next().await {
|
||||
let index_part = part_upload_result
|
||||
.with_context(|| format!("Failed to fetch index part for timeline {timeline_id}"))?;
|
||||
|
||||
debug!("Successfully fetched index part for timeline {timeline_id}");
|
||||
timeline_parts.push((timeline_id, index_part));
|
||||
}
|
||||
Ok(timeline_parts)
|
||||
}
|
||||
|
||||
pub async fn download_index_part(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
let index_part_path = conf
|
||||
.metadata_path(timeline_id, tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
let part_storage_path = storage
|
||||
.remote_object_id(&index_part_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the index part storage path for local path '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::BadInput)?;
|
||||
|
||||
let mut index_part_download = storage.download(&part_storage_path).await?;
|
||||
|
||||
let mut index_part_bytes = Vec::new();
|
||||
tokio::io::copy(
|
||||
&mut index_part_download.download_stream,
|
||||
&mut index_part_bytes,
|
||||
)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to download an index part into file '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize index part file into file '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok(index_part)
|
||||
}
|
||||
@@ -25,7 +25,6 @@
|
||||
//! the current task has been requested to shut down. You can use that with
|
||||
//! Tokio select!().
|
||||
//!
|
||||
//!
|
||||
//! TODO: This would be a good place to also handle panics in a somewhat sane way.
|
||||
//! Depending on what task panics, we might want to kill the whole server, or
|
||||
//! only a single tenant or timeline.
|
||||
@@ -36,6 +35,7 @@
|
||||
#![allow(clippy::declare_interior_mutable_const)]
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::future::Future;
|
||||
use std::panic::AssertUnwindSafe;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
@@ -43,9 +43,9 @@ use std::sync::{Arc, Mutex};
|
||||
|
||||
use futures::FutureExt;
|
||||
use tokio::runtime::Runtime;
|
||||
use tokio::sync::watch;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::task_local;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
@@ -71,7 +71,7 @@ use crate::shutdown_pageserver;
|
||||
//
|
||||
// WAL receiver runtime:
|
||||
// - used to handle WAL receiver connections.
|
||||
// - and to receiver updates from etcd
|
||||
// - and to receiver updates from storage_broker
|
||||
//
|
||||
// Background runtime
|
||||
// - layer flushing
|
||||
@@ -135,22 +135,28 @@ pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
.expect("Failed to create background op runtime")
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct PageserverTaskId(u64);
|
||||
|
||||
impl fmt::Display for PageserverTaskId {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
self.0.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
/// Each task that we track is associated with a "task ID". It's just an
|
||||
/// increasing number that we assign. Note that it is different from tokio::task::Id.
|
||||
static NEXT_TASK_ID: Lazy<AtomicU64> = Lazy::new(|| AtomicU64::new(1));
|
||||
static NEXT_TASK_ID: AtomicU64 = AtomicU64::new(1);
|
||||
|
||||
/// Global registry of tasks
|
||||
static TASKS: Lazy<Mutex<HashMap<u64, Arc<PageServerTask>>>> =
|
||||
Lazy::new(|| Mutex::new(HashMap::new()));
|
||||
|
||||
task_local! {
|
||||
// There is a Tokio watch channel for each task, which can be used to signal the
|
||||
// task that it needs to shut down. This task local variable holds the receiving
|
||||
// end of the channel. The sender is kept in the global registry, so that anyone
|
||||
// can send the signal to request task shutdown.
|
||||
static SHUTDOWN_RX: watch::Receiver<bool>;
|
||||
// This is a cancellation token which will be cancelled when a task needs to shut down. The
|
||||
// root token is kept in the global registry, so that anyone can send the signal to request
|
||||
// task shutdown.
|
||||
static SHUTDOWN_TOKEN: CancellationToken;
|
||||
|
||||
// Each task holds reference to its own PageServerTask here.
|
||||
static CURRENT_TASK: Arc<PageServerTask>;
|
||||
@@ -178,7 +184,7 @@ pub enum TaskKind {
|
||||
PageRequestHandler,
|
||||
|
||||
// Manages the WAL receiver connection for one timeline. It subscribes to
|
||||
// events from etcd, decides which safekeeper to connect to. It spawns a
|
||||
// events from storage_broker, decides which safekeeper to connect to. It spawns a
|
||||
// separate WalReceiverConnection task to handle each connection.
|
||||
WalReceiverManager,
|
||||
|
||||
@@ -200,11 +206,20 @@ pub enum TaskKind {
|
||||
// Task that uploads a file to remote storage
|
||||
RemoteUploadTask,
|
||||
|
||||
// Task that downloads a file from remote storage
|
||||
RemoteDownloadTask,
|
||||
|
||||
// task that handles the initial downloading of all tenants
|
||||
InitialLoad,
|
||||
|
||||
// task that handles attaching a tenant
|
||||
Attach,
|
||||
|
||||
// task that handhes metrics collection
|
||||
MetricsCollection,
|
||||
|
||||
// task that drives downloading layers
|
||||
DownloadAllRemoteLayers,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -226,8 +241,8 @@ struct PageServerTask {
|
||||
|
||||
name: String,
|
||||
|
||||
// To request task shutdown, send 'true' to the channel to notify the task.
|
||||
shutdown_tx: watch::Sender<bool>,
|
||||
// To request task shutdown, just cancel this token.
|
||||
cancel: CancellationToken,
|
||||
|
||||
mutable: Mutex<MutableTaskState>,
|
||||
}
|
||||
@@ -247,13 +262,13 @@ pub fn spawn<F>(
|
||||
where
|
||||
F: Future<Output = anyhow::Result<()>> + Send + 'static,
|
||||
{
|
||||
let (shutdown_tx, shutdown_rx) = watch::channel(false);
|
||||
let cancel = CancellationToken::new();
|
||||
let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed);
|
||||
let task = Arc::new(PageServerTask {
|
||||
task_id: PageserverTaskId(task_id),
|
||||
kind,
|
||||
name: name.to_string(),
|
||||
shutdown_tx,
|
||||
cancel: cancel.clone(),
|
||||
mutable: Mutex::new(MutableTaskState {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
@@ -271,7 +286,7 @@ where
|
||||
task_name,
|
||||
task_id,
|
||||
task_cloned,
|
||||
shutdown_rx,
|
||||
cancel,
|
||||
shutdown_process_on_error,
|
||||
future,
|
||||
));
|
||||
@@ -288,7 +303,7 @@ async fn task_wrapper<F>(
|
||||
task_name: String,
|
||||
task_id: u64,
|
||||
task: Arc<PageServerTask>,
|
||||
shutdown_rx: watch::Receiver<bool>,
|
||||
shutdown_token: CancellationToken,
|
||||
shutdown_process_on_error: bool,
|
||||
future: F,
|
||||
) where
|
||||
@@ -296,9 +311,9 @@ async fn task_wrapper<F>(
|
||||
{
|
||||
debug!("Starting task '{}'", task_name);
|
||||
|
||||
let result = SHUTDOWN_RX
|
||||
let result = SHUTDOWN_TOKEN
|
||||
.scope(
|
||||
shutdown_rx,
|
||||
shutdown_token,
|
||||
CURRENT_TASK.scope(task, {
|
||||
// We use AssertUnwindSafe here so that the payload function
|
||||
// doesn't need to be UnwindSafe. We don't do anything after the
|
||||
@@ -408,7 +423,7 @@ pub async fn shutdown_tasks(
|
||||
&& (tenant_id.is_none() || task_mut.tenant_id == tenant_id)
|
||||
&& (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
|
||||
{
|
||||
let _ = task.shutdown_tx.send_replace(true);
|
||||
task.cancel.cancel();
|
||||
victim_tasks.push(Arc::clone(task));
|
||||
}
|
||||
}
|
||||
@@ -436,24 +451,35 @@ pub fn current_task_kind() -> Option<TaskKind> {
|
||||
CURRENT_TASK.try_with(|ct| ct.kind).ok()
|
||||
}
|
||||
|
||||
pub fn current_task_id() -> Option<PageserverTaskId> {
|
||||
CURRENT_TASK.try_with(|ct| ct.task_id).ok()
|
||||
}
|
||||
|
||||
/// A Future that can be used to check if the current task has been requested to
|
||||
/// shut down.
|
||||
pub async fn shutdown_watcher() {
|
||||
let mut shutdown_rx = SHUTDOWN_RX
|
||||
.try_with(|rx| rx.clone())
|
||||
let token = SHUTDOWN_TOKEN
|
||||
.try_with(|t| t.clone())
|
||||
.expect("shutdown_requested() called in an unexpected task or thread");
|
||||
|
||||
while !*shutdown_rx.borrow() {
|
||||
if shutdown_rx.changed().await.is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
token.cancelled().await;
|
||||
}
|
||||
|
||||
/// Clone the current task's cancellation token, which can be moved across tasks.
|
||||
///
|
||||
/// When the task which is currently executing is shutdown, the cancellation token will be
|
||||
/// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or
|
||||
/// `tokio::task::JoinSet::spawn`.
|
||||
pub fn shutdown_token() -> CancellationToken {
|
||||
SHUTDOWN_TOKEN
|
||||
.try_with(|t| t.clone())
|
||||
.expect("shutdown_token() called in an unexpected task or thread")
|
||||
}
|
||||
|
||||
/// Has the current task been requested to shut down?
|
||||
pub fn is_shutdown_requested() -> bool {
|
||||
if let Ok(shutdown_rx) = SHUTDOWN_RX.try_with(|rx| rx.clone()) {
|
||||
*shutdown_rx.borrow()
|
||||
if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) {
|
||||
cancel.is_cancelled()
|
||||
} else {
|
||||
if !cfg!(test) {
|
||||
warn!("is_shutdown_requested() called in an unexpected task or thread");
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -5,7 +5,6 @@
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::{ReadBufResult, PAGE_SZ};
|
||||
use bytes::Bytes;
|
||||
use once_cell::sync::Lazy;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
@@ -61,7 +60,7 @@ where
|
||||
///
|
||||
/// ```no_run
|
||||
/// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
|
||||
/// # let reader: FileBlockReader<std::fs::File> = todo!();
|
||||
/// # let reader: FileBlockReader<std::fs::File> = unimplemented!("stub");
|
||||
/// let cursor = reader.block_cursor();
|
||||
/// let buf = cursor.read_blk(1);
|
||||
/// // do stuff with 'buf'
|
||||
@@ -117,7 +116,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
static NEXT_ID: Lazy<AtomicU64> = Lazy::new(|| AtomicU64::new(1));
|
||||
static NEXT_ID: AtomicU64 = AtomicU64::new(1);
|
||||
|
||||
/// An adapter for reading a (virtual) file using the page cache.
|
||||
///
|
||||
|
||||
@@ -185,14 +185,16 @@ impl TenantConfOpt {
|
||||
if let Some(max_lsn_wal_lag) = other.max_lsn_wal_lag {
|
||||
self.max_lsn_wal_lag = Some(max_lsn_wal_lag);
|
||||
}
|
||||
if let Some(trace_read_requests) = other.trace_read_requests {
|
||||
self.trace_read_requests = Some(trace_read_requests);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TenantConf {
|
||||
pub fn default() -> TenantConf {
|
||||
impl Default for TenantConf {
|
||||
fn default() -> Self {
|
||||
use defaults::*;
|
||||
|
||||
TenantConf {
|
||||
Self {
|
||||
checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
|
||||
checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
|
||||
.expect("cannot parse default checkpoint timeout"),
|
||||
@@ -217,29 +219,4 @@ impl TenantConf {
|
||||
trace_read_requests: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn dummy_conf() -> Self {
|
||||
TenantConf {
|
||||
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
|
||||
checkpoint_timeout: Duration::from_secs(600),
|
||||
compaction_target_size: 4 * 1024 * 1024,
|
||||
compaction_period: Duration::from_secs(10),
|
||||
compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD,
|
||||
gc_horizon: defaults::DEFAULT_GC_HORIZON,
|
||||
gc_period: Duration::from_secs(10),
|
||||
image_creation_threshold: defaults::DEFAULT_IMAGE_CREATION_THRESHOLD,
|
||||
pitr_interval: Duration::from_secs(60 * 60),
|
||||
walreceiver_connect_timeout: humantime::parse_duration(
|
||||
defaults::DEFAULT_WALRECEIVER_CONNECT_TIMEOUT,
|
||||
)
|
||||
.unwrap(),
|
||||
lagging_wal_timeout: humantime::parse_duration(
|
||||
defaults::DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT,
|
||||
)
|
||||
.unwrap(),
|
||||
max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
|
||||
.unwrap(),
|
||||
trace_read_requests: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -139,7 +139,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
|
||||
off += keys_len as u64;
|
||||
|
||||
let values_off = off as usize;
|
||||
let values_len = num_children as usize * VALUE_SZ as usize;
|
||||
let values_len = num_children as usize * VALUE_SZ;
|
||||
//off += values_len as u64;
|
||||
|
||||
let prefix = &buf[prefix_off..prefix_off + prefix_len as usize];
|
||||
@@ -177,7 +177,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
|
||||
while low < high {
|
||||
let mid = low + size / 2;
|
||||
|
||||
let key_off = mid as usize * self.suffix_len as usize;
|
||||
let key_off = mid * self.suffix_len as usize;
|
||||
let suffix = &self.keys[key_off..key_off + self.suffix_len as usize];
|
||||
// Does this match?
|
||||
keybuf[self.prefix_len as usize..].copy_from_slice(suffix);
|
||||
@@ -328,7 +328,7 @@ where
|
||||
while idx < node.num_children as usize {
|
||||
let suffix = &node.keys[key_off..key_off + suffix_len];
|
||||
keybuf[prefix_len..].copy_from_slice(suffix);
|
||||
let value = node.value(idx as usize);
|
||||
let value = node.value(idx);
|
||||
#[allow(clippy::collapsible_if)]
|
||||
if node.level == 0 {
|
||||
// leaf
|
||||
@@ -368,7 +368,7 @@ where
|
||||
key_off -= suffix_len;
|
||||
let suffix = &node.keys[key_off..key_off + suffix_len];
|
||||
keybuf[prefix_len..].copy_from_slice(suffix);
|
||||
let value = node.value(idx as usize);
|
||||
let value = node.value(idx);
|
||||
#[allow(clippy::collapsible_if)]
|
||||
if node.level == 0 {
|
||||
// leaf
|
||||
@@ -629,7 +629,7 @@ impl<const L: usize> BuildNode<L> {
|
||||
self.keys.extend(&key[self.prefix.len()..]);
|
||||
self.values.extend(value.0);
|
||||
|
||||
assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
|
||||
assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
|
||||
assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
|
||||
|
||||
self.size += self.suffix_len + VALUE_SZ;
|
||||
@@ -674,7 +674,7 @@ impl<const L: usize> BuildNode<L> {
|
||||
self.size -= prefix_len * self.num_children as usize;
|
||||
self.size += prefix_len;
|
||||
|
||||
assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
|
||||
assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
|
||||
assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
|
||||
|
||||
true
|
||||
@@ -684,7 +684,7 @@ impl<const L: usize> BuildNode<L> {
|
||||
/// Serialize the node to on-disk format.
|
||||
///
|
||||
fn pack(&self) -> Bytes {
|
||||
assert!(self.keys.len() == self.num_children as usize * self.suffix_len as usize);
|
||||
assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
|
||||
assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
|
||||
assert!(self.num_children > 0);
|
||||
|
||||
@@ -940,7 +940,7 @@ mod tests {
|
||||
let t = -(f64::ln(u));
|
||||
let key_int = (t * 1000000.0) as u128;
|
||||
|
||||
all_data.insert(key_int as u128, idx as u64);
|
||||
all_data.insert(key_int, idx as u64);
|
||||
}
|
||||
|
||||
// Build a tree from it
|
||||
|
||||
@@ -91,7 +91,7 @@ impl EphemeralFile {
|
||||
break;
|
||||
}
|
||||
|
||||
off += n as usize;
|
||||
off += n;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
|
||||
use crate::metrics::NUM_ONDISK_LAYERS;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::inmemory_layer::InMemoryLayer;
|
||||
use crate::tenant::storage_layer::InMemoryLayer;
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use anyhow::Result;
|
||||
use std::collections::VecDeque;
|
||||
@@ -25,8 +25,7 @@ use super::bst_layer_map::RetroactiveLayerMap;
|
||||
///
|
||||
/// LayerMap tracks what layers exist on a timeline.
|
||||
///
|
||||
#[derive(Default)]
|
||||
pub struct LayerMap {
|
||||
pub struct LayerMap<L: ?Sized> {
|
||||
//
|
||||
// 'open_layer' holds the current InMemoryLayer that is accepting new
|
||||
// records. If it is None, 'next_open_layer_at' will be set instead, indicating
|
||||
@@ -47,20 +46,35 @@ pub struct LayerMap {
|
||||
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
|
||||
|
||||
/// Index of the historic layers optimized for search
|
||||
index: RetroactiveLayerMap<Arc<dyn Layer>>,
|
||||
index: RetroactiveLayerMap<Arc<L>>,
|
||||
|
||||
/// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
|
||||
/// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
|
||||
l0_delta_layers: Vec<Arc<dyn Layer>>,
|
||||
l0_delta_layers: Vec<Arc<L>>,
|
||||
}
|
||||
|
||||
impl<L: ?Sized> Default for LayerMap<L> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
open_layer: None,
|
||||
next_open_layer_at: None,
|
||||
frozen_layers: VecDeque::default(),
|
||||
l0_delta_layers: Vec::default(),
|
||||
index: RetroactiveLayerMap::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Return value of LayerMap::search
|
||||
pub struct SearchResult {
|
||||
pub layer: Arc<dyn Layer>,
|
||||
pub struct SearchResult<L: ?Sized> {
|
||||
pub layer: Arc<L>,
|
||||
pub lsn_floor: Lsn,
|
||||
}
|
||||
|
||||
impl LayerMap {
|
||||
impl<L> LayerMap<L>
|
||||
where
|
||||
L: ?Sized + Layer,
|
||||
{
|
||||
///
|
||||
/// Find the latest layer that covers the given 'key', with lsn <
|
||||
/// 'end_lsn'.
|
||||
@@ -72,39 +86,39 @@ impl LayerMap {
|
||||
/// contain the version, even if it's missing from the returned
|
||||
/// layer.
|
||||
///
|
||||
pub fn search(&self, key: Key, end_lsn: Lsn) -> Result<Option<SearchResult>> {
|
||||
pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult<L>> {
|
||||
match self.index.query(key.to_i128(), end_lsn.0 - 1) {
|
||||
(None, None) => Ok(None),
|
||||
(None, None) => None,
|
||||
(None, Some(image)) => {
|
||||
let lsn_floor = image.get_lsn_range().start;
|
||||
Ok(Some(SearchResult {
|
||||
Some(SearchResult {
|
||||
layer: image,
|
||||
lsn_floor,
|
||||
}))
|
||||
})
|
||||
}
|
||||
(Some(delta), None) => {
|
||||
let lsn_floor = delta.get_lsn_range().start;
|
||||
Ok(Some(SearchResult {
|
||||
Some(SearchResult {
|
||||
layer: delta,
|
||||
lsn_floor,
|
||||
}))
|
||||
})
|
||||
}
|
||||
(Some(delta), Some(image)) => {
|
||||
let img_lsn = image.get_lsn_range().start;
|
||||
let image_is_newer = image.get_lsn_range().end > delta.get_lsn_range().end;
|
||||
let image_exact_match = Lsn(img_lsn.0 + 1) == end_lsn;
|
||||
if image_is_newer || image_exact_match {
|
||||
Ok(Some(SearchResult {
|
||||
Some(SearchResult {
|
||||
layer: image,
|
||||
lsn_floor: img_lsn,
|
||||
}))
|
||||
})
|
||||
} else {
|
||||
let lsn_floor =
|
||||
std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
|
||||
Ok(Some(SearchResult {
|
||||
Some(SearchResult {
|
||||
layer: delta,
|
||||
lsn_floor,
|
||||
}))
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -113,7 +127,7 @@ impl LayerMap {
|
||||
///
|
||||
/// Insert an on-disk layer
|
||||
///
|
||||
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
pub fn insert_historic(&mut self, layer: Arc<L>) {
|
||||
let kr = layer.get_key_range();
|
||||
let lr = layer.get_lsn_range();
|
||||
self.index.insert(
|
||||
@@ -140,7 +154,7 @@ impl LayerMap {
|
||||
///
|
||||
/// This should be called when the corresponding file on disk has been deleted.
|
||||
///
|
||||
pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
pub fn remove_historic(&mut self, layer: Arc<L>) {
|
||||
let kr = layer.get_key_range();
|
||||
let lr = layer.get_lsn_range();
|
||||
self.index.remove(
|
||||
@@ -182,7 +196,7 @@ impl LayerMap {
|
||||
let start = key.start.to_i128();
|
||||
let end = key.end.to_i128();
|
||||
|
||||
let layer_covers = |layer: Option<Arc<dyn Layer>>| match layer {
|
||||
let layer_covers = |layer: Option<Arc<L>>| match layer {
|
||||
Some(layer) => layer.get_lsn_range().start >= lsn.start,
|
||||
None => false,
|
||||
};
|
||||
@@ -202,7 +216,7 @@ impl LayerMap {
|
||||
return Ok(true);
|
||||
}
|
||||
|
||||
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<dyn Layer>> {
|
||||
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
|
||||
self.index.iter()
|
||||
}
|
||||
|
||||
@@ -218,7 +232,7 @@ impl LayerMap {
|
||||
&self,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> Result<Vec<(Range<Key>, Option<Arc<dyn Layer>>)>> {
|
||||
) -> Result<Vec<(Range<Key>, Option<Arc<L>>)>> {
|
||||
let version = match self.index.get_version(lsn.0 - 1) {
|
||||
Some(v) => v,
|
||||
None => return Ok(vec![]),
|
||||
@@ -228,7 +242,7 @@ impl LayerMap {
|
||||
let end = key_range.end.to_i128();
|
||||
|
||||
// Initialize loop variables
|
||||
let mut coverage: Vec<(Range<Key>, Option<Arc<dyn Layer>>)> = vec![];
|
||||
let mut coverage: Vec<(Range<Key>, Option<Arc<L>>)> = vec![];
|
||||
let mut current_key = start.clone();
|
||||
let mut current_val = version.query(start).1;
|
||||
|
||||
@@ -308,7 +322,7 @@ impl LayerMap {
|
||||
}
|
||||
|
||||
/// Return all L0 delta layers
|
||||
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<dyn Layer>>> {
|
||||
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
|
||||
Ok(self.l0_delta_layers.clone())
|
||||
}
|
||||
|
||||
|
||||
@@ -255,8 +255,7 @@ pub fn save_metadata(
|
||||
// fsync the parent directory to ensure the directory entry is durable
|
||||
if first_save {
|
||||
let timeline_dir = File::open(
|
||||
&path
|
||||
.parent()
|
||||
path.parent()
|
||||
.expect("Metadata should always have a parent dir"),
|
||||
)?;
|
||||
timeline_dir.sync_all()?;
|
||||
|
||||
494
pageserver/src/tenant/mgr.rs
Normal file
494
pageserver/src/tenant/mgr.rs
Normal file
@@ -0,0 +1,494 @@
|
||||
//! This module acts as a switchboard to access different repositories managed by this
|
||||
//! page server.
|
||||
|
||||
use std::collections::{hash_map, HashMap};
|
||||
use std::ffi::OsStr;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use tokio::fs;
|
||||
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::Lazy;
|
||||
use tokio::sync::RwLock;
|
||||
use tracing::*;
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::crashsafe;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use crate::IGNORED_TENANT_FILE_NAME;
|
||||
|
||||
use utils::fs_ext::PathExt;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
static TENANTS: Lazy<RwLock<HashMap<TenantId, Arc<Tenant>>>> =
|
||||
Lazy::new(|| RwLock::new(HashMap::new()));
|
||||
|
||||
/// Initialize repositories with locally available timelines.
|
||||
/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
|
||||
/// are scheduled for download and added to the tenant once download is completed.
|
||||
#[instrument(skip(conf, remote_storage))]
|
||||
pub async fn init_tenant_mgr(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<()> {
|
||||
// Scan local filesystem for attached tenants
|
||||
let mut number_of_tenants = 0;
|
||||
let tenants_dir = conf.tenants_path();
|
||||
|
||||
let mut dir_entries = fs::read_dir(&tenants_dir)
|
||||
.await
|
||||
.with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
|
||||
|
||||
loop {
|
||||
match dir_entries.next_entry().await {
|
||||
Ok(None) => break,
|
||||
Ok(Some(dir_entry)) => {
|
||||
let tenant_dir_path = dir_entry.path();
|
||||
if crate::is_temporary(&tenant_dir_path) {
|
||||
info!(
|
||||
"Found temporary tenant directory, removing: {}",
|
||||
tenant_dir_path.display()
|
||||
);
|
||||
if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
|
||||
error!(
|
||||
"Failed to remove temporary directory '{}': {:?}",
|
||||
tenant_dir_path.display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// This case happens if we crash during attach before creating the attach marker file
|
||||
let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
|
||||
format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
|
||||
})?;
|
||||
if is_empty {
|
||||
info!("removing empty tenant directory {tenant_dir_path:?}");
|
||||
if let Err(e) = fs::remove_dir(&tenant_dir_path).await {
|
||||
error!(
|
||||
"Failed to remove empty tenant directory '{}': {e:#}",
|
||||
tenant_dir_path.display()
|
||||
)
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
|
||||
if tenant_ignore_mark_file.exists() {
|
||||
info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
|
||||
continue;
|
||||
}
|
||||
|
||||
match schedule_local_tenant_processing(
|
||||
conf,
|
||||
&tenant_dir_path,
|
||||
remote_storage.clone(),
|
||||
) {
|
||||
Ok(tenant) => {
|
||||
TENANTS.write().await.insert(tenant.tenant_id(), tenant);
|
||||
number_of_tenants += 1;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
// On error, print it, but continue with the other tenants. If we error out
|
||||
// here, the pageserver startup fails altogether, causing outage for *all*
|
||||
// tenants. That seems worse.
|
||||
error!(
|
||||
"Failed to list tenants dir entry in directory {tenants_dir:?}, reason: {e:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!("Processed {number_of_tenants} local tenants at startup");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn schedule_local_tenant_processing(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_path: &Path,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
anyhow::ensure!(
|
||||
tenant_path.is_dir(),
|
||||
"Cannot load tenant from path {tenant_path:?}, it either does not exist or not a directory"
|
||||
);
|
||||
anyhow::ensure!(
|
||||
!crate::is_temporary(tenant_path),
|
||||
"Cannot load tenant from temporary path {tenant_path:?}"
|
||||
);
|
||||
anyhow::ensure!(
|
||||
!tenant_path.is_empty_dir().with_context(|| {
|
||||
format!("Failed to check whether {tenant_path:?} is an empty dir")
|
||||
})?,
|
||||
"Cannot load tenant from empty directory {tenant_path:?}"
|
||||
);
|
||||
|
||||
let tenant_id = tenant_path
|
||||
.file_name()
|
||||
.and_then(OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<TenantId>()
|
||||
.with_context(|| {
|
||||
format!("Could not parse tenant id out of the tenant dir name in path {tenant_path:?}")
|
||||
})?;
|
||||
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
|
||||
anyhow::ensure!(
|
||||
!conf.tenant_ignore_mark_file_path(tenant_id).exists(),
|
||||
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
|
||||
);
|
||||
|
||||
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
|
||||
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
Tenant::spawn_attach(conf, tenant_id, remote_storage)
|
||||
} else {
|
||||
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
|
||||
Tenant::create_broken_tenant(conf, tenant_id)
|
||||
}
|
||||
} else {
|
||||
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
|
||||
// Start loading the tenant into memory. It will initially be in Loading state.
|
||||
Tenant::spawn_load(conf, tenant_id, remote_storage)
|
||||
};
|
||||
Ok(tenant)
|
||||
}
|
||||
|
||||
///
|
||||
/// Shut down all tenants. This runs as part of pageserver shutdown.
|
||||
///
|
||||
pub async fn shutdown_all_tenants() {
|
||||
let tenants_to_shut_down = {
|
||||
let mut m = TENANTS.write().await;
|
||||
let mut tenants_to_shut_down = Vec::with_capacity(m.len());
|
||||
for (_, tenant) in m.drain() {
|
||||
if tenant.is_active() {
|
||||
// updates tenant state, forbidding new GC and compaction iterations from starting
|
||||
tenant.set_stopping();
|
||||
tenants_to_shut_down.push(tenant)
|
||||
}
|
||||
}
|
||||
drop(m);
|
||||
tenants_to_shut_down
|
||||
};
|
||||
|
||||
// Shut down all existing walreceiver connections and stop accepting the new ones.
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await;
|
||||
|
||||
// Ok, no background tasks running anymore. Flush any remaining data in
|
||||
// memory to disk.
|
||||
//
|
||||
// We assume that any incoming connections that might request pages from
|
||||
// the tenant have already been terminated by the caller, so there
|
||||
// should be no more activity in any of the repositories.
|
||||
//
|
||||
// On error, log it but continue with the shutdown for other tenants.
|
||||
for tenant in tenants_to_shut_down {
|
||||
let tenant_id = tenant.tenant_id();
|
||||
debug!("shutdown tenant {tenant_id}");
|
||||
|
||||
if let Err(err) = tenant.freeze_and_flush().await {
|
||||
error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn create_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<Option<Arc<Tenant>>> {
|
||||
match TENANTS.write().await.entry(tenant_id) {
|
||||
hash_map::Entry::Occupied(_) => {
|
||||
debug!("tenant {tenant_id} already exists");
|
||||
Ok(None)
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
// Hold the write_tenants() lock, since all of this is local IO.
|
||||
// If this section ever becomes contentious, introduce a new `TenantState::Creating`.
|
||||
let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?;
|
||||
let created_tenant =
|
||||
schedule_local_tenant_processing(conf, &tenant_directory, remote_storage)?;
|
||||
let crated_tenant_id = created_tenant.tenant_id();
|
||||
anyhow::ensure!(
|
||||
tenant_id == crated_tenant_id,
|
||||
"loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})",
|
||||
);
|
||||
v.insert(Arc::clone(&created_tenant));
|
||||
Ok(Some(created_tenant))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn update_tenant_config(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("configuring tenant {tenant_id}");
|
||||
get_tenant(tenant_id, true)
|
||||
.await?
|
||||
.update_tenant_config(tenant_conf);
|
||||
Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
|
||||
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
|
||||
pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result<Arc<Tenant>> {
|
||||
let m = TENANTS.read().await;
|
||||
let tenant = m
|
||||
.get(&tenant_id)
|
||||
.with_context(|| format!("Tenant {tenant_id} not found in the local state"))?;
|
||||
if active_only && !tenant.is_active() {
|
||||
anyhow::bail!(
|
||||
"Tenant {tenant_id} is not active. Current state: {:?}",
|
||||
tenant.current_state()
|
||||
)
|
||||
} else {
|
||||
Ok(Arc::clone(tenant))
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> {
|
||||
match get_tenant(tenant_id, true).await {
|
||||
Ok(tenant) => {
|
||||
tenant.delete_timeline(timeline_id).await?;
|
||||
}
|
||||
Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn detach_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
remove_tenant_from_memory(tenant_id, async {
|
||||
let local_tenant_directory = conf.tenant_path(&tenant_id);
|
||||
fs::remove_dir_all(&local_tenant_directory)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to remove local tenant directory {local_tenant_directory:?}")
|
||||
})?;
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn load_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<()> {
|
||||
run_if_no_tenant_in_memory(tenant_id, |vacant_entry| {
|
||||
let tenant_path = conf.tenant_path(&tenant_id);
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
|
||||
if tenant_ignore_mark.exists() {
|
||||
std::fs::remove_file(&tenant_ignore_mark)
|
||||
.with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
|
||||
}
|
||||
|
||||
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage)
|
||||
.with_context(|| {
|
||||
format!("Failed to schedule tenant processing in path {tenant_path:?}")
|
||||
})?;
|
||||
|
||||
vacant_entry.insert(new_tenant);
|
||||
Ok(())
|
||||
}).await
|
||||
}
|
||||
|
||||
pub async fn ignore_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
remove_tenant_from_memory(tenant_id, async {
|
||||
let ignore_mark_file = conf.tenant_ignore_mark_file_path(tenant_id);
|
||||
fs::File::create(&ignore_mark_file)
|
||||
.await
|
||||
.context("Failed to create ignore mark file")
|
||||
.and_then(|_| {
|
||||
crashsafe::fsync_file_and_parent(&ignore_mark_file)
|
||||
.context("Failed to fsync ignore mark file")
|
||||
})
|
||||
.with_context(|| format!("Failed to crate ignore mark for tenant {tenant_id}"))?;
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
///
|
||||
/// Get list of tenants, for the mgmt API
|
||||
///
|
||||
pub async fn list_tenants() -> Vec<(TenantId, TenantState)> {
|
||||
TENANTS
|
||||
.read()
|
||||
.await
|
||||
.iter()
|
||||
.map(|(id, tenant)| (*id, tenant.current_state()))
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Execute Attach mgmt API command.
|
||||
///
|
||||
/// Downloading all the tenant data is performed in the background, this merely
|
||||
/// spawns the background task and returns quickly.
|
||||
pub async fn attach_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
) -> anyhow::Result<()> {
|
||||
run_if_no_tenant_in_memory(tenant_id, |vacant_entry| {
|
||||
let tenant_path = conf.tenant_path(&tenant_id);
|
||||
anyhow::ensure!(
|
||||
!tenant_path.exists(),
|
||||
"Cannot attach tenant {tenant_id}, local tenant directory already exists"
|
||||
);
|
||||
|
||||
let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage);
|
||||
vacant_entry.insert(tenant);
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
async fn run_if_no_tenant_in_memory<F, V>(tenant_id: TenantId, run: F) -> anyhow::Result<V>
|
||||
where
|
||||
F: FnOnce(hash_map::VacantEntry<TenantId, Arc<Tenant>>) -> anyhow::Result<V>,
|
||||
{
|
||||
match TENANTS.write().await.entry(tenant_id) {
|
||||
hash_map::Entry::Occupied(e) => {
|
||||
anyhow::bail!(
|
||||
"tenant {tenant_id} already exists, state: {:?}",
|
||||
e.get().current_state()
|
||||
)
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => run(v),
|
||||
}
|
||||
}
|
||||
|
||||
/// Stops and removes the tenant from memory, if it's not [`TenantState::Stopping`] already, bails otherwise.
|
||||
/// Allows to remove other tenant resources manually, via `tenant_cleanup`.
|
||||
/// If the cleanup fails, tenant will stay in memory in [`TenantState::Broken`] state, and another removal
|
||||
/// operation would be needed to remove it.
|
||||
async fn remove_tenant_from_memory<V, F>(
|
||||
tenant_id: TenantId,
|
||||
tenant_cleanup: F,
|
||||
) -> anyhow::Result<V>
|
||||
where
|
||||
F: std::future::Future<Output = anyhow::Result<V>>,
|
||||
{
|
||||
// It's important to keep the tenant in memory after the final cleanup, to avoid cleanup races.
|
||||
// The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
|
||||
// tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to
|
||||
// avoid holding the lock for the entire process.
|
||||
{
|
||||
let tenants_accessor = TENANTS.write().await;
|
||||
match tenants_accessor.get(&tenant_id) {
|
||||
Some(tenant) => match tenant.current_state() {
|
||||
TenantState::Attaching
|
||||
| TenantState::Loading
|
||||
| TenantState::Broken
|
||||
| TenantState::Active => tenant.set_stopping(),
|
||||
TenantState::Stopping => {
|
||||
anyhow::bail!("Tenant {tenant_id} is stopping already")
|
||||
}
|
||||
},
|
||||
None => anyhow::bail!("Tenant not found for id {tenant_id}"),
|
||||
}
|
||||
}
|
||||
|
||||
// shutdown all tenant and timeline tasks: gc, compaction, page service)
|
||||
// No new tasks will be started for this tenant because it's in `Stopping` state.
|
||||
// Hence, once we're done here, the `tenant_cleanup` callback can mutate tenant on-disk state freely.
|
||||
task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
|
||||
|
||||
match tenant_cleanup
|
||||
.await
|
||||
.with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
|
||||
{
|
||||
Ok(hook_value) => {
|
||||
let mut tenants_accessor = TENANTS.write().await;
|
||||
if tenants_accessor.remove(&tenant_id).is_none() {
|
||||
warn!("Tenant {tenant_id} got removed from memory before operation finished");
|
||||
}
|
||||
Ok(hook_value)
|
||||
}
|
||||
Err(e) => {
|
||||
let tenants_accessor = TENANTS.read().await;
|
||||
match tenants_accessor.get(&tenant_id) {
|
||||
Some(tenant) => tenant.set_broken(),
|
||||
None => warn!("Tenant {tenant_id} got removed from memory"),
|
||||
}
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use {
|
||||
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
|
||||
utils::http::error::ApiError,
|
||||
};
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub async fn immediate_gc(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
gc_req: TimelineGcRequest,
|
||||
) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
|
||||
let guard = TENANTS.read().await;
|
||||
|
||||
let tenant = guard
|
||||
.get(&tenant_id)
|
||||
.map(Arc::clone)
|
||||
.with_context(|| format!("Tenant {tenant_id} not found"))
|
||||
.map_err(ApiError::NotFound)?;
|
||||
|
||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
|
||||
// Use tenant's pitr setting
|
||||
let pitr = tenant.get_pitr_interval();
|
||||
|
||||
// Run in task_mgr to avoid race with detach operation
|
||||
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
|
||||
task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
TaskKind::GarbageCollector,
|
||||
Some(tenant_id),
|
||||
Some(timeline_id),
|
||||
&format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
|
||||
false,
|
||||
async move {
|
||||
fail::fail_point!("immediate_gc_task_pre");
|
||||
let result = tenant
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr)
|
||||
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await;
|
||||
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
||||
// better once the types support it.
|
||||
match task_done.send(result) {
|
||||
Ok(_) => (),
|
||||
Err(result) => error!("failed to send gc result: {result:?}"),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
);
|
||||
|
||||
// drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
|
||||
drop(guard);
|
||||
|
||||
Ok(wait_task_done)
|
||||
}
|
||||
@@ -32,7 +32,8 @@
|
||||
//! the corresponding remote operation with the timeline's [`RemoteTimelineClient`]:
|
||||
//!
|
||||
//! - [`RemoteTimelineClient::schedule_layer_file_upload`] when we've created a new layer file.
|
||||
//! - [`RemoteTimelineClient::schedule_index_upload`] when we've updated the timeline metadata file.
|
||||
//! - [`RemoteTimelineClient::schedule_index_upload_for_metadata_update`] when we've updated the timeline metadata file.
|
||||
//! - [`RemoteTimelineClient::schedule_index_upload_for_file_changes`] to upload an updated index file, after we've scheduled file uploads
|
||||
//! - [`RemoteTimelineClient::schedule_layer_file_deletion`] when we've deleted one or more layer files.
|
||||
//!
|
||||
//! Internally, these functions create [`UploadOp`]s and put them in a queue.
|
||||
@@ -57,7 +58,7 @@
|
||||
//! To have a consistent remote structure, it's important that uploads and
|
||||
//! deletions are performed in the right order. For example, the index file
|
||||
//! contains a list of layer files, so it must not be uploaded until all the
|
||||
//! layer files that are in its list have been succesfully uploaded.
|
||||
//! layer files that are in its list have been successfully uploaded.
|
||||
//!
|
||||
//! The contract between client and its user is that the user is responsible of
|
||||
//! scheduling operations in an order that keeps the remote consistent as
|
||||
@@ -139,7 +140,7 @@
|
||||
//! Note that if we crash during file deletion between the index update
|
||||
//! that removes the file from the list of files, and deleting the remote file,
|
||||
//! the file is leaked in the remote storage. Similarly, if a new file is created
|
||||
//! and uploaded, but the pageserver dies permantently before updating the
|
||||
//! and uploaded, but the pageserver dies permanently before updating the
|
||||
//! remote index file, the new file is leaked in remote storage. We accept and
|
||||
//! tolerate that for now.
|
||||
//! Note further that we cannot easily fix this by scheduling deletes for every
|
||||
@@ -147,31 +148,43 @@
|
||||
//! following two cases:
|
||||
//! - (1) We had the file locally, deleted it locally, scheduled a remote delete,
|
||||
//! but crashed before it finished remotely.
|
||||
//! - (2) We never had the file locally because we were still in tenant attach
|
||||
//! when we crashed. (Similar case for on-demand download in the future.)
|
||||
//! - (2) We never had the file locally because we haven't on-demand downloaded
|
||||
//! it yet.
|
||||
//!
|
||||
//! # Downloads (= Tenant Attach)
|
||||
//! # Downloads
|
||||
//!
|
||||
//! In addition to the upload queue, [`RemoteTimelineClient`] has functions for
|
||||
//! downloading files from the remote storage. Downloads are performed immediately,
|
||||
//! independently of the uploads.
|
||||
//! downloading files from the remote storage. Downloads are performed immediately
|
||||
//! against the `RemoteStorage`, independently of the upload queue.
|
||||
//!
|
||||
//! When we attach a tenant, we perform the following steps:
|
||||
//! - create `Tenant` object in `TenantState::Attaching` state
|
||||
//! - List timelines that are present in remote storage, and download their remote [`IndexPart`]s
|
||||
//! - For each timeline, create `Timeline` struct and a `RemoteTimelineClient`, and initialize the client's upload queue with its `IndexPart`
|
||||
//! - eagerly download all the remote layers using the client's download APIs
|
||||
//! - transition tenant from `TenantState::Attaching` to `TenantState::Active` state.
|
||||
//! - List timelines that are present in remote storage, and for each:
|
||||
//! - download their remote [`IndexPart`]s
|
||||
//! - create `Timeline` struct and a `RemoteTimelineClient`
|
||||
//! - initialize the client's upload queue with its `IndexPart`
|
||||
//! - create [`RemoteLayer`] instances for layers that are referenced by `IndexPart`
|
||||
//! but not present locally
|
||||
//! - schedule uploads for layers that are only present locally.
|
||||
//! - if the remote `IndexPart`'s metadata was newer than the metadata in
|
||||
//! the local filesystem, write the remote metadata to the local filesystem
|
||||
//! - After the above is done for each timeline, open the tenant for business by
|
||||
//! transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
|
||||
//! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
|
||||
//!
|
||||
//! Most of the above happens in [`Timeline::reconcile_with_remote`].
|
||||
//! Most of the above steps happen in [`Timeline::reconcile_with_remote`] or its callers.
|
||||
//! We keep track of the fact that a client is in `Attaching` state in a marker
|
||||
//! file on the local disk.
|
||||
//! However, the distinction is moot for storage sync since we call
|
||||
//! `reconcile_with_remote` for tenants both with and without the marker file.
|
||||
//!
|
||||
//! In the future, downloading will be done on-demand and `reconcile_with_remote`
|
||||
//! will only be responsible for re-scheduling upload ops after a crash of an
|
||||
//! `Active` tenant.
|
||||
//! file on the local disk. This is critical because, when we restart the pageserver,
|
||||
//! we do not want to do the `List timelines` step for each tenant that has already
|
||||
//! been successfully attached (for performance & cost reasons).
|
||||
//! Instead, for a tenant without the attach marker file, we assume that the
|
||||
//! local state is in sync or ahead of the remote state. This includes the list
|
||||
//! of all of the tenant's timelines, which is particularly critical to be up-to-date:
|
||||
//! if there's a timeline on the remote that the pageserver doesn't know about,
|
||||
//! the GC will not consider its branch point, leading to data loss.
|
||||
//! So, for a tenant with the attach marker file, we know that we do not yet have
|
||||
//! persisted all the remote timeline's metadata files locally. To exclude the
|
||||
//! risk above, we re-run the procedure for such tenants
|
||||
//!
|
||||
//! # Operating Without Remote Storage
|
||||
//!
|
||||
@@ -194,39 +207,51 @@ mod upload;
|
||||
// re-export these
|
||||
pub use download::{is_temp_download_file, list_remote_timelines};
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::fmt::Debug;
|
||||
use std::ops::DerefMut;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use anyhow::ensure;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use std::ops::DerefMut;
|
||||
use tokio::runtime::Runtime;
|
||||
use tracing::{error, info, warn};
|
||||
use tracing::{debug, info, warn};
|
||||
use tracing::{info_span, Instrument};
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use self::index::IndexPart;
|
||||
|
||||
use crate::metrics::MeasureRemoteOp;
|
||||
use crate::metrics::RemoteOpFileKind;
|
||||
use crate::metrics::RemoteOpKind;
|
||||
use crate::metrics::REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS;
|
||||
use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics};
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
storage_sync::index::{LayerFileMetadata, RelativePath},
|
||||
task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::BACKGROUND_RUNTIME,
|
||||
tenant::metadata::TimelineMetadata,
|
||||
tenant::upload_queue::{
|
||||
UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
|
||||
},
|
||||
{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS},
|
||||
};
|
||||
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use self::index::IndexPart;
|
||||
|
||||
use super::storage_layer::LayerFileName;
|
||||
|
||||
// Occasional network issues and such can cause remote operations to fail, and
|
||||
// that's expected. If a download fails, we log it at info-level, and retry.
|
||||
// But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
|
||||
// level instead, as repeated failures can mean a more serious problem. If it
|
||||
// fails more than FAILED_DOWNLOAD_RETRIES times, we give up
|
||||
const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
const FAILED_DOWNLOAD_RETRIES: u32 = 10;
|
||||
|
||||
// Similarly log failed uploads and deletions at WARN level, after this many
|
||||
// retries. Uploads and deletions are retried forever, though.
|
||||
const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
|
||||
/// A client for accessing a timeline's data in remote storage.
|
||||
///
|
||||
/// This takes care of managing the number of connections, and balancing them
|
||||
@@ -256,207 +281,42 @@ pub struct RemoteTimelineClient {
|
||||
|
||||
upload_queue: Mutex<UploadQueue>,
|
||||
|
||||
metrics: Arc<RemoteTimelineClientMetrics>,
|
||||
|
||||
storage_impl: GenericRemoteStorage,
|
||||
}
|
||||
|
||||
// clippy warns that Uninitialized is much smaller than Initialized, which wastes
|
||||
// memory for Uninitialized variants. Doesn't matter in practice, there are not
|
||||
// that many upload queues in a running pageserver, and most of them are initialized
|
||||
// anyway.
|
||||
#[allow(clippy::large_enum_variant)]
|
||||
enum UploadQueue {
|
||||
Uninitialized,
|
||||
Initialized(UploadQueueInitialized),
|
||||
Stopped(UploadQueueStopped),
|
||||
}
|
||||
|
||||
impl UploadQueue {
|
||||
fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
UploadQueue::Uninitialized => "Uninitialized",
|
||||
UploadQueue::Initialized(_) => "Initialized",
|
||||
UploadQueue::Stopped(_) => "Stopped",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This keeps track of queued and in-progress tasks.
|
||||
struct UploadQueueInitialized {
|
||||
/// Counter to assign task IDs
|
||||
task_counter: u64,
|
||||
|
||||
/// All layer files stored in the remote storage, taking into account all
|
||||
/// in-progress and queued operations
|
||||
latest_files: HashMap<RelativePath, LayerFileMetadata>,
|
||||
|
||||
/// Metadata stored in the remote storage, taking into account all
|
||||
/// in-progress and queued operations.
|
||||
/// DANGER: do not return to outside world, e.g., safekeepers.
|
||||
latest_metadata: TimelineMetadata,
|
||||
|
||||
/// `disk_consistent_lsn` from the last metadata file that was successfully
|
||||
/// uploaded. `Lsn(0)` if nothing was uploaded yet.
|
||||
/// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
|
||||
/// Safekeeper can rely on it to make decisions for WAL storage.
|
||||
last_uploaded_consistent_lsn: Lsn,
|
||||
|
||||
// Breakdown of different kinds of tasks currently in-progress
|
||||
num_inprogress_layer_uploads: usize,
|
||||
num_inprogress_metadata_uploads: usize,
|
||||
num_inprogress_deletions: usize,
|
||||
|
||||
/// Tasks that are currently in-progress. In-progress means that a tokio Task
|
||||
/// has been launched for it. An in-progress task can be busy uploading, but it can
|
||||
/// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can
|
||||
/// be waiting for retry in `exponential_backoff`.
|
||||
inprogress_tasks: HashMap<u64, Arc<UploadTask>>,
|
||||
|
||||
/// Queued operations that have not been launched yet. They might depend on previous
|
||||
/// tasks to finish. For example, metadata upload cannot be performed before all
|
||||
/// preceding layer file uploads have completed.
|
||||
queued_operations: VecDeque<UploadOp>,
|
||||
}
|
||||
|
||||
struct UploadQueueStopped {
|
||||
last_uploaded_consistent_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl UploadQueue {
|
||||
fn initialize_empty_remote(
|
||||
&mut self,
|
||||
metadata: &TimelineMetadata,
|
||||
) -> anyhow::Result<&mut UploadQueueInitialized> {
|
||||
match self {
|
||||
UploadQueue::Uninitialized => (),
|
||||
UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
|
||||
anyhow::bail!("already initialized, state {}", self.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
info!("initializing upload queue for empty remote");
|
||||
|
||||
let state = UploadQueueInitialized {
|
||||
// As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
|
||||
latest_files: Default::default(),
|
||||
latest_metadata: metadata.clone(),
|
||||
// We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
|
||||
// safekeepers from garbage-collecting anything.
|
||||
last_uploaded_consistent_lsn: Lsn(0),
|
||||
// what follows are boring default initializations
|
||||
task_counter: Default::default(),
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: Default::default(),
|
||||
queued_operations: Default::default(),
|
||||
};
|
||||
|
||||
*self = UploadQueue::Initialized(state);
|
||||
Ok(self.initialized_mut().expect("we just set it"))
|
||||
}
|
||||
|
||||
fn initialize_with_current_remote_index_part(
|
||||
&mut self,
|
||||
index_part: &IndexPart,
|
||||
) -> anyhow::Result<&mut UploadQueueInitialized> {
|
||||
match self {
|
||||
UploadQueue::Uninitialized => (),
|
||||
UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
|
||||
anyhow::bail!("already initialized, state {}", self.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
let mut files = HashMap::new();
|
||||
for path in &index_part.timeline_layers {
|
||||
let layer_metadata = index_part
|
||||
.layer_metadata
|
||||
.get(path)
|
||||
.map(LayerFileMetadata::from)
|
||||
.unwrap_or(LayerFileMetadata::MISSING);
|
||||
files.insert(path.clone(), layer_metadata);
|
||||
}
|
||||
|
||||
let index_part_metadata = index_part.parse_metadata()?;
|
||||
info!(
|
||||
"initializing upload queue with remote index_part.disk_consistent_lsn: {}",
|
||||
index_part_metadata.disk_consistent_lsn()
|
||||
);
|
||||
|
||||
let state = UploadQueueInitialized {
|
||||
latest_files: files,
|
||||
latest_metadata: index_part_metadata.clone(),
|
||||
last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
|
||||
// what follows are boring default initializations
|
||||
task_counter: 0,
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: Default::default(),
|
||||
queued_operations: Default::default(),
|
||||
};
|
||||
|
||||
*self = UploadQueue::Initialized(state);
|
||||
Ok(self.initialized_mut().expect("we just set it"))
|
||||
}
|
||||
|
||||
fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
|
||||
match self {
|
||||
UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
|
||||
anyhow::bail!("queue is in state {}", self.as_str())
|
||||
}
|
||||
UploadQueue::Initialized(x) => Ok(x),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An in-progress upload or delete task.
|
||||
#[derive(Debug)]
|
||||
struct UploadTask {
|
||||
/// Unique ID of this task. Used as the key in `inprogress_tasks` above.
|
||||
task_id: u64,
|
||||
retries: AtomicU32,
|
||||
|
||||
op: UploadOp,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
enum UploadOp {
|
||||
/// Upload a layer file
|
||||
UploadLayer(PathBuf, LayerFileMetadata),
|
||||
|
||||
/// Upload the metadata file
|
||||
UploadMetadata(IndexPart, Lsn),
|
||||
|
||||
/// Delete a file.
|
||||
Delete(RemoteOpFileKind, PathBuf),
|
||||
|
||||
/// Barrier. When the barrier operation is reached,
|
||||
Barrier(tokio::sync::watch::Sender<()>),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for UploadOp {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
match self {
|
||||
UploadOp::UploadLayer(path, metadata) => write!(
|
||||
f,
|
||||
"UploadLayer({}, size={:?})",
|
||||
path.display(),
|
||||
metadata.file_size()
|
||||
),
|
||||
UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
|
||||
UploadOp::Delete(_, path) => write!(f, "Delete({})", path.display()),
|
||||
UploadOp::Barrier(_) => write!(f, "Barrier"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteTimelineClient {
|
||||
///
|
||||
/// Create a remote storage client for given timeline
|
||||
///
|
||||
/// Note: the caller must initialize the upload queue before any uploads can be scheduled,
|
||||
/// by calling init_upload_queue.
|
||||
///
|
||||
pub fn new(
|
||||
remote_storage: GenericRemoteStorage,
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<RemoteTimelineClient> {
|
||||
Ok(RemoteTimelineClient {
|
||||
conf,
|
||||
runtime: &BACKGROUND_RUNTIME,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
storage_impl: remote_storage,
|
||||
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
||||
metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
|
||||
})
|
||||
}
|
||||
|
||||
/// Initialize the upload queue for a remote storage that already received
|
||||
/// an index file upload, i.e., it's not empty.
|
||||
/// The given `index_part` must be the one on the remote.
|
||||
pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -468,6 +328,7 @@ impl RemoteTimelineClient {
|
||||
) -> anyhow::Result<()> {
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_empty_remote(local_metadata)?;
|
||||
self.update_remote_physical_size_gauge(None);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -479,6 +340,24 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
}
|
||||
|
||||
fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
|
||||
let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
|
||||
current_remote_index_part
|
||||
.layer_metadata
|
||||
.values()
|
||||
// If we don't have the file size for the layer, don't account for it in the metric.
|
||||
.map(|ilmd| ilmd.file_size.unwrap_or(0))
|
||||
.sum()
|
||||
} else {
|
||||
0
|
||||
};
|
||||
self.metrics.remote_physical_size_gauge().set(size);
|
||||
}
|
||||
|
||||
pub fn get_remote_physical_size(&self) -> u64 {
|
||||
self.metrics.remote_physical_size_gauge().get()
|
||||
}
|
||||
|
||||
//
|
||||
// Download operations.
|
||||
//
|
||||
@@ -499,6 +378,7 @@ impl RemoteTimelineClient {
|
||||
self.timeline_id,
|
||||
RemoteOpFileKind::Index,
|
||||
RemoteOpKind::Download,
|
||||
Arc::clone(&self.metrics),
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -510,7 +390,7 @@ impl RemoteTimelineClient {
|
||||
/// On success, returns the size of the downloaded file.
|
||||
pub async fn download_layer_file(
|
||||
&self,
|
||||
path: &RelativePath,
|
||||
layer_file_name: &LayerFileName,
|
||||
layer_metadata: &LayerFileMetadata,
|
||||
) -> anyhow::Result<u64> {
|
||||
let downloaded_size = download::download_layer_file(
|
||||
@@ -518,7 +398,7 @@ impl RemoteTimelineClient {
|
||||
&self.storage_impl,
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
path,
|
||||
layer_file_name,
|
||||
layer_metadata,
|
||||
)
|
||||
.measure_remote_op(
|
||||
@@ -526,6 +406,7 @@ impl RemoteTimelineClient {
|
||||
self.timeline_id,
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Download,
|
||||
Arc::clone(&self.metrics),
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -536,13 +417,23 @@ impl RemoteTimelineClient {
|
||||
let new_metadata = LayerFileMetadata::new(downloaded_size);
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
if let Some(upgraded) = upload_queue.latest_files.get_mut(path) {
|
||||
upgraded.merge(&new_metadata);
|
||||
if let Some(upgraded) = upload_queue.latest_files.get_mut(layer_file_name) {
|
||||
if upgraded.merge(&new_metadata) {
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
||||
}
|
||||
// If we don't do an index file upload inbetween here and restart,
|
||||
// the value will go back down after pageserver restart, since we will
|
||||
// have lost this data point.
|
||||
// But, we upload index part fairly frequently, and restart pageserver rarely.
|
||||
// So, by accounting eagerly, we present a most-of-the-time-more-accurate value sooner.
|
||||
self.metrics
|
||||
.remote_physical_size_gauge()
|
||||
.add(downloaded_size);
|
||||
} else {
|
||||
// The file should exist, since we just downloaded it.
|
||||
warn!(
|
||||
"downloaded file {:?} not found in local copy of the index file",
|
||||
path
|
||||
layer_file_name
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -554,14 +445,20 @@ impl RemoteTimelineClient {
|
||||
//
|
||||
|
||||
///
|
||||
/// Launch an index-file upload operation in the background.
|
||||
/// Launch an index-file upload operation in the background, with
|
||||
/// updated metadata.
|
||||
///
|
||||
/// The upload will be added to the queue immediately, but it
|
||||
/// won't be performed until all previosuly scheduled layer file
|
||||
/// upload operations have completed successfully. This is to
|
||||
/// ensure that when the index file claims that layers X, Y and Z
|
||||
/// exist in remote storage, they really do.
|
||||
pub fn schedule_index_upload(
|
||||
/// exist in remote storage, they really do. To wait for the upload
|
||||
/// to complete, use `wait_completion`.
|
||||
///
|
||||
/// If there were any changes to the list of files, i.e. if any
|
||||
/// layer file uploads were scheduled, since the last index file
|
||||
/// upload, those will be included too.
|
||||
pub fn schedule_index_upload_for_metadata_update(
|
||||
self: &Arc<Self>,
|
||||
metadata: &TimelineMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -572,26 +469,60 @@ impl RemoteTimelineClient {
|
||||
// ahead of what's _actually_ on the remote during index upload.
|
||||
upload_queue.latest_metadata = metadata.clone();
|
||||
|
||||
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
|
||||
self.schedule_index_upload(upload_queue, metadata_bytes);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Launch an index-file upload operation in the background, if necessary.
|
||||
///
|
||||
/// Use this function to schedule the update of the index file after
|
||||
/// scheduling file uploads or deletions. If no file uploads or deletions
|
||||
/// have been scheduled since the last index file upload, this does
|
||||
/// nothing.
|
||||
///
|
||||
/// Like schedule_index_upload_for_metadata_update(), this merely adds
|
||||
/// the upload to the upload queue and returns quickly.
|
||||
pub fn schedule_index_upload_for_file_changes(self: &Arc<Self>) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
|
||||
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
|
||||
self.schedule_index_upload(upload_queue, metadata_bytes);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Launch an index-file upload operation in the background (internal function)
|
||||
fn schedule_index_upload(
|
||||
self: &Arc<Self>,
|
||||
upload_queue: &mut UploadQueueInitialized,
|
||||
metadata_bytes: Vec<u8>,
|
||||
) {
|
||||
info!(
|
||||
"scheduling metadata upload with {} files ({} changed)",
|
||||
upload_queue.latest_files.len(),
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
|
||||
);
|
||||
|
||||
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
|
||||
|
||||
let index_part = IndexPart::new(
|
||||
upload_queue.latest_files.clone(),
|
||||
disk_consistent_lsn,
|
||||
upload_queue.latest_metadata.to_bytes()?,
|
||||
metadata_bytes,
|
||||
);
|
||||
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
|
||||
self.update_upload_queue_unfinished_metric(1, &op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
|
||||
info!(
|
||||
"scheduled metadata upload with {} files",
|
||||
upload_queue.latest_files.len()
|
||||
);
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
|
||||
|
||||
// Launch the task immediately, if possible
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
@@ -599,7 +530,7 @@ impl RemoteTimelineClient {
|
||||
///
|
||||
pub fn schedule_layer_file_upload(
|
||||
self: &Arc<Self>,
|
||||
path: &Path,
|
||||
layer_file_name: &LayerFileName,
|
||||
layer_metadata: &LayerFileMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
@@ -612,20 +543,19 @@ impl RemoteTimelineClient {
|
||||
"file size not initialized in metadata"
|
||||
);
|
||||
|
||||
let relative_path = RelativePath::from_local_path(
|
||||
&self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
||||
path,
|
||||
)?;
|
||||
|
||||
upload_queue
|
||||
.latest_files
|
||||
.insert(relative_path, layer_metadata.clone());
|
||||
.insert(layer_file_name.clone(), layer_metadata.clone());
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
||||
|
||||
let op = UploadOp::UploadLayer(PathBuf::from(path), layer_metadata.clone());
|
||||
let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
|
||||
self.update_upload_queue_unfinished_metric(1, &op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
|
||||
info!("scheduled layer file upload {}", path.display());
|
||||
info!(
|
||||
"scheduled layer file upload {}",
|
||||
layer_file_name.file_name()
|
||||
);
|
||||
|
||||
// Launch the task immediately, if possible
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
@@ -635,25 +565,21 @@ impl RemoteTimelineClient {
|
||||
///
|
||||
/// Launch a delete operation in the background.
|
||||
///
|
||||
/// The deletion won't actually be performed, until all preceding
|
||||
/// upload operations have completed succesfully.
|
||||
pub fn schedule_layer_file_deletion(self: &Arc<Self>, paths: &[PathBuf]) -> anyhow::Result<()> {
|
||||
/// Note: This schedules an index file upload before the deletions. The
|
||||
/// deletion won't actually be performed, until any previously scheduled
|
||||
/// upload operations, and the index file upload, have completed
|
||||
/// succesfully.
|
||||
///
|
||||
pub fn schedule_layer_file_deletion(
|
||||
self: &Arc<Self>,
|
||||
names: &[LayerFileName],
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
// Convert the paths into RelativePaths, and gather other information we need.
|
||||
let mut relative_paths = Vec::with_capacity(paths.len());
|
||||
for path in paths {
|
||||
relative_paths.push(RelativePath::from_local_path(
|
||||
&self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
||||
path,
|
||||
)?);
|
||||
}
|
||||
|
||||
// Deleting layers doesn't affect the values stored in TimelineMetadata,
|
||||
// so we don't need update it. Just serialize it.
|
||||
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
|
||||
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
|
||||
|
||||
// Update the remote index file, removing the to-be-deleted files from the index,
|
||||
// before deleting the actual files.
|
||||
@@ -663,25 +589,21 @@ impl RemoteTimelineClient {
|
||||
// from latest_files, but not yet scheduled for deletion. Use a closure
|
||||
// to syntactically forbid ? or bail! calls here.
|
||||
let no_bail_here = || {
|
||||
for relative_path in relative_paths {
|
||||
upload_queue.latest_files.remove(&relative_path);
|
||||
for name in names {
|
||||
upload_queue.latest_files.remove(name);
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
|
||||
}
|
||||
|
||||
let index_part = IndexPart::new(
|
||||
upload_queue.latest_files.clone(),
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
);
|
||||
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
|
||||
self.update_upload_queue_unfinished_metric(1, &op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
|
||||
self.schedule_index_upload(upload_queue, metadata_bytes);
|
||||
}
|
||||
|
||||
// schedule the actual deletions
|
||||
for path in paths {
|
||||
let op = UploadOp::Delete(RemoteOpFileKind::Layer, PathBuf::from(path));
|
||||
for name in names {
|
||||
let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone());
|
||||
self.update_upload_queue_unfinished_metric(1, &op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
info!("scheduled layer file deletion {}", path.display());
|
||||
info!("scheduled layer file deletion {}", name.file_name());
|
||||
}
|
||||
|
||||
// Launch the tasks immediately, if possible
|
||||
@@ -753,7 +675,7 @@ impl RemoteTimelineClient {
|
||||
// We can launch this task. Remove it from the queue first.
|
||||
let next_op = upload_queue.queued_operations.pop_front().unwrap();
|
||||
|
||||
info!("starting op: {}", next_op);
|
||||
debug!("starting op: {}", next_op);
|
||||
|
||||
// Update the counters
|
||||
match next_op {
|
||||
@@ -837,18 +759,28 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
|
||||
let upload_result: anyhow::Result<()> = match &task.op {
|
||||
UploadOp::UploadLayer(ref path, ref layer_metadata) => {
|
||||
upload::upload_timeline_layer(&self.storage_impl, path, layer_metadata)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Upload,
|
||||
)
|
||||
.await
|
||||
UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
|
||||
let path = &self
|
||||
.conf
|
||||
.timeline_path(&self.timeline_id, &self.tenant_id)
|
||||
.join(layer_file_name.file_name());
|
||||
upload::upload_timeline_layer(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
path,
|
||||
layer_metadata,
|
||||
)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Upload,
|
||||
Arc::clone(&self.metrics),
|
||||
)
|
||||
.await
|
||||
}
|
||||
UploadOp::UploadMetadata(ref index_part, _lsn) => {
|
||||
upload::upload_index_part(
|
||||
let res = upload::upload_index_part(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
self.tenant_id,
|
||||
@@ -860,16 +792,26 @@ impl RemoteTimelineClient {
|
||||
self.timeline_id,
|
||||
RemoteOpFileKind::Index,
|
||||
RemoteOpKind::Upload,
|
||||
Arc::clone(&self.metrics),
|
||||
)
|
||||
.await
|
||||
.await;
|
||||
if res.is_ok() {
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
}
|
||||
res
|
||||
}
|
||||
UploadOp::Delete(metric_file_kind, ref path) => {
|
||||
delete::delete_layer(&self.storage_impl, path)
|
||||
UploadOp::Delete(metric_file_kind, ref layer_file_name) => {
|
||||
let path = &self
|
||||
.conf
|
||||
.timeline_path(&self.timeline_id, &self.tenant_id)
|
||||
.join(layer_file_name.file_name());
|
||||
delete::delete_layer(self.conf, &self.storage_impl, path)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
*metric_file_kind,
|
||||
RemoteOpKind::Delete,
|
||||
Arc::clone(&self.metrics),
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -888,10 +830,22 @@ impl RemoteTimelineClient {
|
||||
Err(e) => {
|
||||
let retries = task.retries.fetch_add(1, Ordering::SeqCst);
|
||||
|
||||
error!(
|
||||
"failed to perform remote task {}, will retry (attempt {}): {:?}",
|
||||
task.op, retries, e
|
||||
);
|
||||
// Uploads can fail due to rate limits (IAM, S3), spurious network problems,
|
||||
// or other external reasons. Such issues are relatively regular, so log them
|
||||
// at info level at first, and only WARN if the operation fails repeatedly.
|
||||
//
|
||||
// (See similar logic for downloads in `download::download_retry`)
|
||||
if retries < FAILED_UPLOAD_WARN_THRESHOLD {
|
||||
info!(
|
||||
"failed to perform remote task {}, will retry (attempt {}): {:#}",
|
||||
task.op, retries, e
|
||||
);
|
||||
} else {
|
||||
warn!(
|
||||
"failed to perform remote task {}, will retry (attempt {}): {:?}",
|
||||
task.op, retries, e
|
||||
);
|
||||
}
|
||||
|
||||
// sleep until it's time to retry, or we're cancelled
|
||||
tokio::select! {
|
||||
@@ -913,7 +867,7 @@ impl RemoteTimelineClient {
|
||||
task.op, retries
|
||||
);
|
||||
} else {
|
||||
info!("remote task {} completed successfully", task.op);
|
||||
debug!("remote task {} completed successfully", task.op);
|
||||
}
|
||||
|
||||
// The task has completed succesfully. Remove it from the in-progress list.
|
||||
@@ -960,14 +914,8 @@ impl RemoteTimelineClient {
|
||||
return;
|
||||
}
|
||||
};
|
||||
REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
file_kind.as_str(),
|
||||
op_kind.as_str(),
|
||||
])
|
||||
.unwrap()
|
||||
self.metrics
|
||||
.unfinished_tasks(&file_kind, &op_kind)
|
||||
.add(delta)
|
||||
}
|
||||
|
||||
@@ -1032,34 +980,12 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Create a remote storage client for given timeline
|
||||
///
|
||||
/// Note: the caller must initialize the upload queue before any uploads can be scheduled,
|
||||
/// by calling init_upload_queue.
|
||||
///
|
||||
pub fn create_remote_timeline_client(
|
||||
remote_storage: GenericRemoteStorage,
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<RemoteTimelineClient> {
|
||||
Ok(RemoteTimelineClient {
|
||||
conf,
|
||||
runtime: &BACKGROUND_RUNTIME,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
storage_impl: remote_storage,
|
||||
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
|
||||
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
|
||||
use std::collections::HashSet;
|
||||
use std::{collections::HashSet, path::Path};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
|
||||
@@ -1083,15 +1009,11 @@ mod tests {
|
||||
TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap()
|
||||
}
|
||||
|
||||
fn assert_file_list(a: &HashSet<RelativePath>, b: &[&str]) {
|
||||
let xx = PathBuf::from("");
|
||||
let mut avec: Vec<String> = a
|
||||
.iter()
|
||||
.map(|x| x.to_local_path(&xx).to_string_lossy().into())
|
||||
.collect();
|
||||
fn assert_file_list(a: &HashSet<LayerFileName>, b: &[&str]) {
|
||||
let mut avec: Vec<String> = a.iter().map(|x| x.file_name()).collect();
|
||||
avec.sort();
|
||||
|
||||
let mut bvec = b.to_owned();
|
||||
let mut bvec = b.to_vec();
|
||||
bvec.sort_unstable();
|
||||
|
||||
assert_eq!(avec, bvec);
|
||||
@@ -1159,8 +1081,7 @@ mod tests {
|
||||
|
||||
println!("workdir: {}", harness.conf.workdir.display());
|
||||
|
||||
let storage_impl =
|
||||
GenericRemoteStorage::from_config(harness.conf.workdir.clone(), &storage_config)?;
|
||||
let storage_impl = GenericRemoteStorage::from_config(&storage_config)?;
|
||||
let client = Arc::new(RemoteTimelineClient {
|
||||
conf: harness.conf,
|
||||
runtime,
|
||||
@@ -1168,6 +1089,10 @@ mod tests {
|
||||
timeline_id: TIMELINE_ID,
|
||||
storage_impl,
|
||||
upload_queue: Mutex::new(UploadQueue::Uninitialized),
|
||||
metrics: Arc::new(RemoteTimelineClientMetrics::new(
|
||||
&harness.tenant_id,
|
||||
&TIMELINE_ID,
|
||||
)),
|
||||
});
|
||||
|
||||
let remote_timeline_dir =
|
||||
@@ -1184,11 +1109,11 @@ mod tests {
|
||||
std::fs::write(timeline_path.join("bar"), &content_bar)?;
|
||||
|
||||
client.schedule_layer_file_upload(
|
||||
&timeline_path.join("foo"),
|
||||
&LayerFileName::Test("foo".to_owned()),
|
||||
&LayerFileMetadata::new(content_foo.len() as u64),
|
||||
)?;
|
||||
client.schedule_layer_file_upload(
|
||||
&timeline_path.join("bar"),
|
||||
&LayerFileName::Test("bar".to_owned()),
|
||||
&LayerFileMetadata::new(content_bar.len() as u64),
|
||||
)?;
|
||||
|
||||
@@ -1199,15 +1124,19 @@ mod tests {
|
||||
assert!(upload_queue.queued_operations.is_empty());
|
||||
assert!(upload_queue.inprogress_tasks.len() == 2);
|
||||
assert!(upload_queue.num_inprogress_layer_uploads == 2);
|
||||
|
||||
// also check that `latest_file_changes` was updated
|
||||
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2);
|
||||
}
|
||||
|
||||
// Schedule upload of index. Check that it is queued
|
||||
let metadata = dummy_metadata(Lsn(0x20));
|
||||
client.schedule_index_upload(&metadata)?;
|
||||
client.schedule_index_upload_for_metadata_update(&metadata)?;
|
||||
{
|
||||
let mut guard = client.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut().unwrap();
|
||||
assert!(upload_queue.queued_operations.len() == 1);
|
||||
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
|
||||
}
|
||||
|
||||
// Wait for the uploads to finish
|
||||
@@ -1230,10 +1159,10 @@ mod tests {
|
||||
let content_baz = dummy_contents("baz");
|
||||
std::fs::write(timeline_path.join("baz"), &content_baz)?;
|
||||
client.schedule_layer_file_upload(
|
||||
&timeline_path.join("baz"),
|
||||
&LayerFileName::Test("baz".to_owned()),
|
||||
&LayerFileMetadata::new(content_baz.len() as u64),
|
||||
)?;
|
||||
client.schedule_layer_file_deletion(&[timeline_path.join("foo")])?;
|
||||
client.schedule_layer_file_deletion(&[LayerFileName::Test("foo".to_owned())])?;
|
||||
{
|
||||
let mut guard = client.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut().unwrap();
|
||||
@@ -1243,6 +1172,7 @@ mod tests {
|
||||
assert!(upload_queue.inprogress_tasks.len() == 1);
|
||||
assert!(upload_queue.num_inprogress_layer_uploads == 1);
|
||||
assert!(upload_queue.num_inprogress_deletions == 0);
|
||||
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
|
||||
}
|
||||
assert_remote_files(&["foo", "bar", "index_part.json"], &remote_timeline_dir);
|
||||
|
||||
28
pageserver/src/tenant/remote_timeline_client/delete.rs
Normal file
28
pageserver/src/tenant/remote_timeline_client/delete.rs
Normal file
@@ -0,0 +1,28 @@
|
||||
//! Helper functions to delete files from remote storage with a RemoteStorage
|
||||
use anyhow::Context;
|
||||
use std::path::Path;
|
||||
use tracing::debug;
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
|
||||
pub(super) async fn delete_layer<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
local_layer_path: &'a Path,
|
||||
) -> anyhow::Result<()> {
|
||||
fail::fail_point!("before-delete-layer", |_| {
|
||||
anyhow::bail!("failpoint before-delete-layer")
|
||||
});
|
||||
debug!("Deleting layer from remote storage: {local_layer_path:?}",);
|
||||
|
||||
let path_to_delete = conf.remote_path(local_layer_path)?;
|
||||
|
||||
// XXX: If the deletion fails because the object already didn't exist,
|
||||
// it would be good to just issue a warning but consider it success.
|
||||
// https://github.com/neondatabase/neon/issues/2934
|
||||
storage.delete(&path_to_delete).await.with_context(|| {
|
||||
format!("Failed to delete remote layer from storage at {path_to_delete:?}")
|
||||
})
|
||||
}
|
||||
331
pageserver/src/tenant/remote_timeline_client/download.rs
Normal file
331
pageserver/src/tenant/remote_timeline_client/download.rs
Normal file
@@ -0,0 +1,331 @@
|
||||
//! Helper functions to download files from remote storage with a RemoteStorage
|
||||
//!
|
||||
//! The functions in this module retry failed operations automatically, according
|
||||
//! to the FAILED_DOWNLOAD_RETRIES constant.
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::future::Future;
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use tokio::fs;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tracing::{debug, error, info, info_span, warn, Instrument};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::index::{IndexPart, IndexPartUnclean, LayerFileMetadata};
|
||||
use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
|
||||
|
||||
async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
|
||||
fs::File::open(path).await?.sync_all().await
|
||||
}
|
||||
|
||||
///
|
||||
/// If 'metadata' is given, we will validate that the downloaded file's size matches that
|
||||
/// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
|
||||
///
|
||||
/// Returns the size of the downloaded file.
|
||||
pub async fn download_layer_file<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
layer_file_name: &'a LayerFileName,
|
||||
layer_metadata: &'a LayerFileMetadata,
|
||||
) -> Result<u64, DownloadError> {
|
||||
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
|
||||
|
||||
let local_path = timeline_path.join(layer_file_name.file_name());
|
||||
|
||||
let remote_path = conf
|
||||
.remote_path(&local_path)
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
// Perform a rename inspired by durable_rename from file_utils.c.
|
||||
// The sequence:
|
||||
// write(tmp)
|
||||
// fsync(tmp)
|
||||
// rename(tmp, new)
|
||||
// fsync(new)
|
||||
// fsync(parent)
|
||||
// For more context about durable_rename check this email from postgres mailing list:
|
||||
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
|
||||
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
|
||||
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
|
||||
|
||||
let (mut destination_file, bytes_amount) = download_retry(
|
||||
|| async {
|
||||
// TODO: this doesn't use the cached fd for some reason?
|
||||
let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create a destination file for layer '{}'",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
let mut download = storage.download(&remote_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to open a download stream for layer with remote storage path '{remote_path:?}'"
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
|
||||
format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
Ok((destination_file, bytes_amount))
|
||||
},
|
||||
&format!("download {remote_path:?}"),
|
||||
).await?;
|
||||
|
||||
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
||||
// A file will not be closed immediately when it goes out of scope if there are any IO operations
|
||||
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
|
||||
// you should call flush before dropping it.
|
||||
//
|
||||
// From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
|
||||
// we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
|
||||
// But for additional safety lets check/wait for any pending operations.
|
||||
destination_file
|
||||
.flush()
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"failed to flush source file at {}",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
match layer_metadata.file_size() {
|
||||
Some(expected) if expected != bytes_amount => {
|
||||
return Err(DownloadError::Other(anyhow!(
|
||||
"According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file '{}'",
|
||||
temp_file_path.display()
|
||||
)));
|
||||
}
|
||||
Some(_) | None => {
|
||||
// matches, or upgrading from an earlier IndexPart version
|
||||
}
|
||||
}
|
||||
|
||||
// not using sync_data because it can lose file size update
|
||||
destination_file
|
||||
.sync_all()
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"failed to fsync source file at {}",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
drop(destination_file);
|
||||
|
||||
fail::fail_point!("remote-storage-download-pre-rename", |_| {
|
||||
Err(DownloadError::Other(anyhow!(
|
||||
"remote-storage-download-pre-rename failpoint triggered"
|
||||
)))
|
||||
});
|
||||
|
||||
fs::rename(&temp_file_path, &local_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Could not rename download layer file to {}",
|
||||
local_path.display(),
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
fsync_path(&local_path)
|
||||
.await
|
||||
.with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
tracing::info!("download complete: {}", local_path.display());
|
||||
|
||||
Ok(bytes_amount)
|
||||
}
|
||||
|
||||
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
|
||||
|
||||
pub fn is_temp_download_file(path: &Path) -> bool {
|
||||
let extension = path.extension().map(|pname| {
|
||||
pname
|
||||
.to_str()
|
||||
.expect("paths passed to this function must be valid Rust strings")
|
||||
});
|
||||
match extension {
|
||||
Some(TEMP_DOWNLOAD_EXTENSION) => true,
|
||||
Some(_) => false,
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// List timelines of given tenant in remote storage
|
||||
pub async fn list_remote_timelines<'a>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<Vec<(TimelineId, IndexPart)>> {
|
||||
let tenant_path = conf.timelines_path(&tenant_id);
|
||||
let tenant_storage_path = conf.remote_path(&tenant_path)?;
|
||||
|
||||
fail::fail_point!("storage-sync-list-remote-timelines", |_| {
|
||||
anyhow::bail!("storage-sync-list-remote-timelines");
|
||||
});
|
||||
|
||||
let timelines = download_retry(
|
||||
|| storage.list_prefixes(Some(&tenant_storage_path)),
|
||||
&format!("list prefixes for {tenant_path:?}"),
|
||||
)
|
||||
.await?;
|
||||
|
||||
if timelines.is_empty() {
|
||||
anyhow::bail!("no timelines found on the remote storage")
|
||||
}
|
||||
|
||||
let mut timeline_ids = HashSet::new();
|
||||
let mut part_downloads = FuturesUnordered::new();
|
||||
|
||||
for timeline_remote_storage_key in timelines {
|
||||
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
|
||||
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
|
||||
})?;
|
||||
|
||||
let timeline_id: TimelineId = object_name.parse().with_context(|| {
|
||||
format!("failed to parse object name into timeline id '{object_name}'")
|
||||
})?;
|
||||
|
||||
// list_prefixes returns all files with the prefix. If we haven't seen this timeline ID
|
||||
// yet, launch a download task for it.
|
||||
if !timeline_ids.contains(&timeline_id) {
|
||||
timeline_ids.insert(timeline_id);
|
||||
let storage_clone = storage.clone();
|
||||
part_downloads.push(async move {
|
||||
(
|
||||
timeline_id,
|
||||
download_index_part(conf, &storage_clone, tenant_id, timeline_id)
|
||||
.instrument(info_span!("download_index_part", timeline=%timeline_id))
|
||||
.await,
|
||||
)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for all the download tasks to complete.
|
||||
let mut timeline_parts = Vec::new();
|
||||
while let Some((timeline_id, part_upload_result)) = part_downloads.next().await {
|
||||
let index_part = part_upload_result
|
||||
.with_context(|| format!("Failed to fetch index part for timeline {timeline_id}"))?;
|
||||
|
||||
debug!("Successfully fetched index part for timeline {timeline_id}");
|
||||
timeline_parts.push((timeline_id, index_part));
|
||||
}
|
||||
Ok(timeline_parts)
|
||||
}
|
||||
|
||||
pub async fn download_index_part(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
let index_part_path = conf
|
||||
.metadata_path(timeline_id, tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
let part_storage_path = conf
|
||||
.remote_path(&index_part_path)
|
||||
.map_err(DownloadError::BadInput)?;
|
||||
|
||||
let index_part_bytes = download_retry(
|
||||
|| async {
|
||||
let mut index_part_download = storage.download(&part_storage_path).await?;
|
||||
|
||||
let mut index_part_bytes = Vec::new();
|
||||
tokio::io::copy(
|
||||
&mut index_part_download.download_stream,
|
||||
&mut index_part_bytes,
|
||||
)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to download an index part into file {index_part_path:?}")
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
Ok(index_part_bytes)
|
||||
},
|
||||
&format!("download {part_storage_path:?}"),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let index_part: IndexPartUnclean = serde_json::from_slice(&index_part_bytes)
|
||||
.with_context(|| {
|
||||
format!("Failed to deserialize index part file into file {index_part_path:?}")
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let index_part = index_part.remove_unclean_layer_file_names();
|
||||
|
||||
Ok(index_part)
|
||||
}
|
||||
|
||||
///
|
||||
/// Helper function to handle retries for a download operation.
|
||||
///
|
||||
/// Remote operations can fail due to rate limits (IAM, S3), spurious network
|
||||
/// problems, or other external reasons. Retry FAILED_DOWNLOAD_RETRIES times,
|
||||
/// with backoff.
|
||||
///
|
||||
/// (See similar logic for uploads in `perform_upload_task`)
|
||||
async fn download_retry<T, O, F>(mut op: O, description: &str) -> Result<T, DownloadError>
|
||||
where
|
||||
O: FnMut() -> F,
|
||||
F: Future<Output = Result<T, DownloadError>>,
|
||||
{
|
||||
let mut attempts = 0;
|
||||
loop {
|
||||
let result = op().await;
|
||||
match result {
|
||||
Ok(_) => {
|
||||
if attempts > 0 {
|
||||
info!("{description} succeeded after {attempts} retries");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// These are "permanent" errors that should not be retried.
|
||||
Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => {
|
||||
return result;
|
||||
}
|
||||
// Assume that any other failure might be transient, and the operation might
|
||||
// succeed if we just keep trying.
|
||||
Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => {
|
||||
info!("{description} failed, will retry (attempt {attempts}): {err:#}");
|
||||
}
|
||||
Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => {
|
||||
warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
|
||||
}
|
||||
Err(DownloadError::Other(ref err)) => {
|
||||
// Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
|
||||
error!("{description} still failed after {attempts} retries, giving up: {err:?}");
|
||||
return result;
|
||||
}
|
||||
}
|
||||
// sleep and retry
|
||||
exponential_backoff(
|
||||
attempts,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
)
|
||||
.await;
|
||||
attempts += 1;
|
||||
}
|
||||
}
|
||||
@@ -2,46 +2,16 @@
|
||||
//! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about
|
||||
//! remote timeline layers and its metadata.
|
||||
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use anyhow::{Context, Ok};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use tracing::warn;
|
||||
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::{metadata::TimelineMetadata, storage_layer::LayerFileName};
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// A part of the filesystem path, that needs a root to become a path again.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct RelativePath(String);
|
||||
|
||||
impl RelativePath {
|
||||
/// Attempts to strip off the base from path, producing a relative path or an error.
|
||||
pub fn from_local_path(timeline_path: &Path, path: &Path) -> anyhow::Result<RelativePath> {
|
||||
let relative = path.strip_prefix(timeline_path).with_context(|| {
|
||||
format!(
|
||||
"path '{}' is not relative to base '{}'",
|
||||
path.display(),
|
||||
timeline_path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(Self::from_filename(relative))
|
||||
}
|
||||
|
||||
pub fn from_filename(path: &Path) -> RelativePath {
|
||||
RelativePath(path.to_string_lossy().to_string())
|
||||
}
|
||||
|
||||
pub fn to_local_path(&self, timeline_path: &Path) -> PathBuf {
|
||||
timeline_path.join(&self.0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata gathered for each of the layer files.
|
||||
///
|
||||
/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
|
||||
@@ -78,9 +48,17 @@ impl LayerFileMetadata {
|
||||
/// Metadata has holes due to version upgrades. This method is called to upgrade self with the
|
||||
/// other value.
|
||||
///
|
||||
/// This is called on the possibly outdated version.
|
||||
pub fn merge(&mut self, other: &Self) {
|
||||
self.file_size = other.file_size.or(self.file_size);
|
||||
/// This is called on the possibly outdated version. Returns true if any changes
|
||||
/// were made.
|
||||
pub fn merge(&mut self, other: &Self) -> bool {
|
||||
let mut changed = false;
|
||||
|
||||
if self.file_size != other.file_size {
|
||||
self.file_size = other.file_size.or(self.file_size);
|
||||
changed = true;
|
||||
}
|
||||
|
||||
changed
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,26 +70,25 @@ impl LayerFileMetadata {
|
||||
/// remember to add a test case for the changed version.
|
||||
#[serde_as]
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
pub struct IndexPart {
|
||||
pub struct IndexPartImpl<L>
|
||||
where
|
||||
L: std::hash::Hash + PartialEq + Eq,
|
||||
{
|
||||
/// Debugging aid describing the version of this type.
|
||||
#[serde(default)]
|
||||
version: usize,
|
||||
|
||||
/// Each of the layers present on remote storage.
|
||||
/// Layer names, which are stored on the remote storage.
|
||||
///
|
||||
/// Additional metadata can might exist in `layer_metadata`.
|
||||
pub timeline_layers: HashSet<RelativePath>,
|
||||
pub timeline_layers: HashSet<L>,
|
||||
|
||||
/// FIXME: unused field. This should be removed, but that changes the on-disk format,
|
||||
/// so we need to make sure we're backwards- (and maybe forwards-) compatible
|
||||
missing_layers: HashSet<RelativePath>,
|
||||
|
||||
/// Per layer file metadata, which can be present for a present or missing layer file.
|
||||
/// Per layer file name metadata, which can be present for a present or missing layer file.
|
||||
///
|
||||
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
|
||||
/// that latest version stores.
|
||||
#[serde(default)]
|
||||
pub layer_metadata: HashMap<RelativePath, IndexLayerMetadata>,
|
||||
#[serde(default = "HashMap::default")]
|
||||
pub layer_metadata: HashMap<L, IndexLayerMetadata>,
|
||||
|
||||
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
|
||||
// It's duplicated here for convenience.
|
||||
@@ -120,6 +97,101 @@ pub struct IndexPart {
|
||||
metadata_bytes: Vec<u8>,
|
||||
}
|
||||
|
||||
// TODO seems like another part of the remote storage file format
|
||||
// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
|
||||
pub type IndexPart = IndexPartImpl<LayerFileName>;
|
||||
|
||||
pub type IndexPartUnclean = IndexPartImpl<UncleanLayerFileName>;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
|
||||
pub enum UncleanLayerFileName {
|
||||
Clean(LayerFileName),
|
||||
BackupFile(String),
|
||||
}
|
||||
|
||||
impl<'de> serde::Deserialize<'de> for UncleanLayerFileName {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
deserializer.deserialize_string(UncleanLayerFileNameVisitor)
|
||||
}
|
||||
}
|
||||
|
||||
struct UncleanLayerFileNameVisitor;
|
||||
|
||||
impl<'de> serde::de::Visitor<'de> for UncleanLayerFileNameVisitor {
|
||||
type Value = UncleanLayerFileName;
|
||||
|
||||
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(
|
||||
formatter,
|
||||
"a string that is a valid LayerFileName or '.old' backup file name"
|
||||
)
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
let maybe_clean: Result<LayerFileName, _> = v.parse();
|
||||
match maybe_clean {
|
||||
Ok(clean) => Ok(UncleanLayerFileName::Clean(clean)),
|
||||
Err(e) => {
|
||||
if v.ends_with(".old") || v == "metadata_backup" {
|
||||
Ok(UncleanLayerFileName::BackupFile(v.to_owned()))
|
||||
} else {
|
||||
Err(E::custom(e))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl UncleanLayerFileName {
|
||||
fn into_clean(self) -> Option<LayerFileName> {
|
||||
match self {
|
||||
UncleanLayerFileName::Clean(clean) => Some(clean),
|
||||
UncleanLayerFileName::BackupFile(_) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexPartUnclean {
|
||||
pub fn remove_unclean_layer_file_names(self) -> IndexPart {
|
||||
let IndexPartUnclean {
|
||||
version,
|
||||
timeline_layers,
|
||||
layer_metadata,
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
} = self;
|
||||
|
||||
IndexPart {
|
||||
version,
|
||||
timeline_layers: timeline_layers
|
||||
.into_iter()
|
||||
.filter_map(|unclean_file_name| match unclean_file_name {
|
||||
UncleanLayerFileName::Clean(clean_name) => Some(clean_name),
|
||||
UncleanLayerFileName::BackupFile(backup_file_name) => {
|
||||
// For details see https://github.com/neondatabase/neon/issues/3024
|
||||
warn!(
|
||||
"got backup file on the remote storage, ignoring it {backup_file_name}"
|
||||
);
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect(),
|
||||
layer_metadata: layer_metadata
|
||||
.into_iter()
|
||||
.filter_map(|(l, m)| l.into_clean().map(|l| (l, m)))
|
||||
.collect(),
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexPart {
|
||||
/// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
|
||||
/// used to understand later versions.
|
||||
@@ -129,23 +201,22 @@ impl IndexPart {
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
pub fn new(
|
||||
layers_and_metadata: HashMap<RelativePath, LayerFileMetadata>,
|
||||
layers_and_metadata: HashMap<LayerFileName, LayerFileMetadata>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata_bytes: Vec<u8>,
|
||||
) -> Self {
|
||||
let mut timeline_layers = HashSet::new();
|
||||
let mut layer_metadata = HashMap::new();
|
||||
let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len());
|
||||
let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len());
|
||||
|
||||
separate_paths_and_metadata(
|
||||
&layers_and_metadata,
|
||||
&mut timeline_layers,
|
||||
&mut layer_metadata,
|
||||
);
|
||||
for (remote_name, metadata) in &layers_and_metadata {
|
||||
timeline_layers.insert(remote_name.to_owned());
|
||||
let metadata = IndexLayerMetadata::from(metadata);
|
||||
layer_metadata.insert(remote_name.to_owned(), metadata);
|
||||
}
|
||||
|
||||
Self {
|
||||
version: Self::LATEST_VERSION,
|
||||
timeline_layers,
|
||||
missing_layers: HashSet::new(),
|
||||
layer_metadata,
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
@@ -160,7 +231,7 @@ impl IndexPart {
|
||||
/// Serialized form of [`LayerFileMetadata`].
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
|
||||
pub struct IndexLayerMetadata {
|
||||
file_size: Option<u64>,
|
||||
pub(super) file_size: Option<u64>,
|
||||
}
|
||||
|
||||
impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
|
||||
@@ -171,18 +242,6 @@ impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
|
||||
}
|
||||
}
|
||||
|
||||
fn separate_paths_and_metadata(
|
||||
input: &HashMap<RelativePath, LayerFileMetadata>,
|
||||
output: &mut HashSet<RelativePath>,
|
||||
layer_metadata: &mut HashMap<RelativePath, IndexLayerMetadata>,
|
||||
) {
|
||||
for (path, metadata) in input {
|
||||
let metadata = IndexLayerMetadata::from(metadata);
|
||||
layer_metadata.insert(path.clone(), metadata);
|
||||
output.insert(path.clone());
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -191,21 +250,20 @@ mod tests {
|
||||
fn v0_indexpart_is_parsed() {
|
||||
let example = r#"{
|
||||
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
|
||||
"missing_layers":["not_a_real_layer_but_adding_coverage"],
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 0,
|
||||
timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(),
|
||||
missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(),
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
layer_metadata: HashMap::default(),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
};
|
||||
|
||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||
let part: IndexPartUnclean = serde_json::from_str(example).unwrap();
|
||||
let part = part.remove_unclean_layer_file_names();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
@@ -214,10 +272,9 @@ mod tests {
|
||||
let example = r#"{
|
||||
"version":1,
|
||||
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
|
||||
"missing_layers":["not_a_real_layer_but_adding_coverage"],
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
|
||||
"LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
@@ -226,13 +283,12 @@ mod tests {
|
||||
let expected = IndexPart {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(),
|
||||
missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(),
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
layer_metadata: HashMap::from([
|
||||
(RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned()), IndexLayerMetadata {
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: Some(25600000),
|
||||
}),
|
||||
(RelativePath("not_a_real_layer_but_adding_coverage".to_owned()), IndexLayerMetadata {
|
||||
(LayerFileName::new_test("not_a_real_layer_but_adding_coverage"), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: Some(9007199254741001),
|
||||
@@ -242,7 +298,46 @@ mod tests {
|
||||
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
};
|
||||
|
||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||
let part = serde_json::from_str::<IndexPartUnclean>(example)
|
||||
.unwrap()
|
||||
.remove_unclean_layer_file_names();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v1_indexpart_is_parsed_with_optional_missing_layers() {
|
||||
let example = r#"{
|
||||
"version":1,
|
||||
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
|
||||
"missing_layers":["This shouldn't fail deserialization"],
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"LAYER_FILE_NAME::test/not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: Some(25600000),
|
||||
}),
|
||||
(LayerFileName::new_test("not_a_real_layer_but_adding_coverage"), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: Some(9007199254741001),
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
};
|
||||
|
||||
let part = serde_json::from_str::<IndexPartUnclean>(example).unwrap();
|
||||
let part = part.remove_unclean_layer_file_names();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
}
|
||||
@@ -5,12 +5,12 @@ use fail::fail_point;
|
||||
use std::path::Path;
|
||||
use tokio::fs;
|
||||
|
||||
use super::index::IndexPart;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::storage_sync::LayerFileMetadata;
|
||||
use crate::{config::PageServerConf, tenant::remote_timeline_client::index::IndexPart};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::index::LayerFileMetadata;
|
||||
|
||||
/// Serializes and uploads the given index part data to the remote storage.
|
||||
pub(super) async fn upload_index_part<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -30,12 +30,9 @@ pub(super) async fn upload_index_part<'a>(
|
||||
let index_part_path = conf
|
||||
.metadata_path(timeline_id, tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
let storage_path = conf.remote_path(&index_part_path)?;
|
||||
storage
|
||||
.upload_storage_object(
|
||||
Box::new(index_part_bytes),
|
||||
index_part_size,
|
||||
&index_part_path,
|
||||
)
|
||||
.upload_storage_object(Box::new(index_part_bytes), index_part_size, &storage_path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to upload index part for '{tenant_id} / {timeline_id}'"))
|
||||
}
|
||||
@@ -44,36 +41,26 @@ pub(super) async fn upload_index_part<'a>(
|
||||
/// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload.
|
||||
///
|
||||
/// On an error, bumps the retries count and reschedules the entire task.
|
||||
pub(super) async fn upload_timeline_layer(
|
||||
storage: &GenericRemoteStorage,
|
||||
source_path: &Path,
|
||||
known_metadata: &LayerFileMetadata,
|
||||
pub(super) async fn upload_timeline_layer<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
source_path: &'a Path,
|
||||
known_metadata: &'a LayerFileMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
fail_point!("before-upload-layer", |_| {
|
||||
bail!("failpoint before-upload-layer")
|
||||
});
|
||||
let storage_path = storage.remote_object_id(source_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
})?;
|
||||
let storage_path = conf.remote_path(source_path)?;
|
||||
|
||||
let source_file = fs::File::open(&source_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to open a source file for layer '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
})?;
|
||||
let source_file = fs::File::open(&source_path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?;
|
||||
|
||||
let fs_size = source_file
|
||||
.metadata()
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the source file metadata for layer '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
format!("Failed to get the source file metadata for layer {source_path:?}")
|
||||
})?
|
||||
.len();
|
||||
|
||||
@@ -3,8 +3,11 @@ use std::collections::{HashMap, HashSet};
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use tokio::sync::oneshot::error::RecvError;
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
||||
|
||||
use super::Tenant;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -67,6 +70,7 @@ pub(super) async fn gather_inputs(
|
||||
|
||||
let timelines = tenant
|
||||
.refresh_gc_info()
|
||||
.await
|
||||
.context("Failed to refresh gc_info before gathering inputs")?;
|
||||
|
||||
if timelines.is_empty() {
|
||||
@@ -93,8 +97,6 @@ pub(super) async fn gather_inputs(
|
||||
// used to determine the `retention_period` for the size model
|
||||
let mut max_cutoff_distance = None;
|
||||
|
||||
// this will probably conflict with on-demand downloaded layers, or at least force them all
|
||||
// to be downloaded
|
||||
for timeline in timelines {
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
|
||||
@@ -212,11 +214,30 @@ pub(super) async fn gather_inputs(
|
||||
let mut have_any_error = false;
|
||||
|
||||
while let Some(res) = joinset.join_next().await {
|
||||
// each of these come with Result<Result<_, JoinError>, JoinError>
|
||||
// each of these come with Result<anyhow::Result<_>, JoinError>
|
||||
// because of spawn + spawn_blocking
|
||||
let res = res.and_then(|inner| inner);
|
||||
match res {
|
||||
Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size))) => {
|
||||
Err(join_error) if join_error.is_cancelled() => {
|
||||
unreachable!("we are not cancelling any of the futures, nor should be");
|
||||
}
|
||||
Err(join_error) => {
|
||||
// cannot really do anything, as this panic is likely a bug
|
||||
error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}");
|
||||
have_any_error = true;
|
||||
}
|
||||
Ok(Err(recv_result_error)) => {
|
||||
// cannot really do anything, as this panic is likely a bug
|
||||
error!("failed to receive logical size query result: {recv_result_error:#}");
|
||||
have_any_error = true;
|
||||
}
|
||||
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
|
||||
warn!(
|
||||
timeline_id=%timeline.timeline_id,
|
||||
"failed to calculate logical size at {lsn}: {error:#}"
|
||||
);
|
||||
have_any_error = true;
|
||||
}
|
||||
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {
|
||||
debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
|
||||
|
||||
logical_size_cache.insert((timeline.timeline_id, lsn), size);
|
||||
@@ -228,21 +249,6 @@ pub(super) async fn gather_inputs(
|
||||
command: Command::Update(size),
|
||||
});
|
||||
}
|
||||
Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error))) => {
|
||||
warn!(
|
||||
timeline_id=%timeline.timeline_id,
|
||||
"failed to calculate logical size at {lsn}: {error:#}"
|
||||
);
|
||||
have_any_error = true;
|
||||
}
|
||||
Err(join_error) if join_error.is_cancelled() => {
|
||||
unreachable!("we are not cancelling any of the futures, nor should be");
|
||||
}
|
||||
Err(join_error) => {
|
||||
// cannot really do anything, as this panic is likely a bug
|
||||
error!("logical size query panicked: {join_error:#}");
|
||||
have_any_error = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -351,7 +357,7 @@ enum LsnKind {
|
||||
struct TimelineAtLsnSizeResult(
|
||||
Arc<crate::tenant::Timeline>,
|
||||
utils::lsn::Lsn,
|
||||
anyhow::Result<u64>,
|
||||
Result<u64, CalculateLogicalSizeError>,
|
||||
);
|
||||
|
||||
#[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))]
|
||||
@@ -359,17 +365,15 @@ async fn calculate_logical_size(
|
||||
limit: Arc<tokio::sync::Semaphore>,
|
||||
timeline: Arc<crate::tenant::Timeline>,
|
||||
lsn: utils::lsn::Lsn,
|
||||
) -> Result<TimelineAtLsnSizeResult, tokio::task::JoinError> {
|
||||
let permit = tokio::sync::Semaphore::acquire_owned(limit)
|
||||
) -> Result<TimelineAtLsnSizeResult, RecvError> {
|
||||
let _permit = tokio::sync::Semaphore::acquire_owned(limit)
|
||||
.await
|
||||
.expect("global semaphore should not had been closed");
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _permit = permit;
|
||||
let size_res = timeline.calculate_logical_size(lsn);
|
||||
TimelineAtLsnSizeResult(timeline, lsn, size_res)
|
||||
})
|
||||
.await
|
||||
let size_res = timeline
|
||||
.spawn_ondemand_logical_size_calculation(lsn)
|
||||
.await?;
|
||||
Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
//!
|
||||
//! Common traits and structs for layers
|
||||
//!
|
||||
|
||||
mod delta_layer;
|
||||
mod filename;
|
||||
mod image_layer;
|
||||
mod inmemory_layer;
|
||||
mod remote_layer;
|
||||
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
@@ -8,12 +12,19 @@ use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
|
||||
pub use filename::{DeltaFileName, ImageFileName, LayerFileName, PathOrConf};
|
||||
pub use image_layer::{ImageLayer, ImageLayerWriter};
|
||||
pub use inmemory_layer::InMemoryLayer;
|
||||
pub use remote_layer::RemoteLayer;
|
||||
|
||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
where
|
||||
T: PartialOrd<T>,
|
||||
@@ -69,26 +80,9 @@ pub enum ValueReconstructResult {
|
||||
Missing,
|
||||
}
|
||||
|
||||
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
|
||||
/// range of LSNs.
|
||||
///
|
||||
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
|
||||
/// layers are used to ingest incoming WAL, and provide fast access to the
|
||||
/// recent page versions. On-disk layers are stored as files on disk, and are
|
||||
/// immutable. This trait presents the common functionality of in-memory and
|
||||
/// on-disk layers.
|
||||
///
|
||||
/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
|
||||
/// A delta layer contains all modifications within a range of LSNs and keys.
|
||||
/// An image layer is a snapshot of all the data in a key-range, at a single
|
||||
/// LSN
|
||||
///
|
||||
/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
|
||||
/// required by [`LayerMap`].
|
||||
pub trait Layer: Send + Sync {
|
||||
fn get_tenant_id(&self) -> TenantId;
|
||||
|
||||
/// Identify the timeline this layer belongs to
|
||||
fn get_timeline_id(&self) -> TimelineId;
|
||||
|
||||
/// Range of keys that this layer covers
|
||||
fn get_key_range(&self) -> Range<Key>;
|
||||
|
||||
@@ -100,13 +94,11 @@ pub trait Layer: Send + Sync {
|
||||
/// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
|
||||
fn get_lsn_range(&self) -> Range<Lsn>;
|
||||
|
||||
/// Filename used to store this layer on disk. (Even in-memory layers
|
||||
/// implement this, to print a handy unique identifier for the layer for
|
||||
/// log messages, even though they're never not on disk.)
|
||||
fn filename(&self) -> PathBuf;
|
||||
|
||||
/// If a layer has a corresponding file on a local filesystem, return its absolute path.
|
||||
fn local_path(&self) -> Option<PathBuf>;
|
||||
/// Does this layer only contain some data for the key-range (incremental),
|
||||
/// or does it contain a version of every page? This is important to know
|
||||
/// for garbage collecting old layers: an incremental layer depends on
|
||||
/// the previous non-incremental layer.
|
||||
fn is_incremental(&self) -> bool;
|
||||
|
||||
///
|
||||
/// Return data needed to reconstruct given page at LSN.
|
||||
@@ -127,35 +119,88 @@ pub trait Layer: Send + Sync {
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
) -> Result<ValueReconstructResult>;
|
||||
|
||||
/// Does this layer only contain some data for the key-range (incremental),
|
||||
/// or does it contain a version of every page? This is important to know
|
||||
/// for garbage collecting old layers: an incremental layer depends on
|
||||
/// the previous non-incremental layer.
|
||||
fn is_incremental(&self) -> bool;
|
||||
/// A short ID string that uniquely identifies the given layer within a [`LayerMap`].
|
||||
fn short_id(&self) -> String;
|
||||
|
||||
/// Returns true for layers that are represented in memory.
|
||||
fn is_in_memory(&self) -> bool;
|
||||
/// Dump summary of the contents of the layer to stdout
|
||||
fn dump(&self, verbose: bool) -> Result<()>;
|
||||
}
|
||||
|
||||
/// Returned by [`Layer::iter`]
|
||||
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i>;
|
||||
|
||||
/// Returned by [`Layer::key_iter`]
|
||||
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
|
||||
|
||||
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
|
||||
/// range of LSNs.
|
||||
///
|
||||
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
|
||||
/// layers are used to ingest incoming WAL, and provide fast access to the
|
||||
/// recent page versions. On-disk layers are stored as files on disk, and are
|
||||
/// immutable. This trait presents the common functionality of in-memory and
|
||||
/// on-disk layers.
|
||||
///
|
||||
/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
|
||||
/// A delta layer contains all modifications within a range of LSNs and keys.
|
||||
/// An image layer is a snapshot of all the data in a key-range, at a single
|
||||
/// LSN
|
||||
///
|
||||
pub trait PersistentLayer: Layer {
|
||||
fn get_tenant_id(&self) -> TenantId;
|
||||
|
||||
/// Identify the timeline this layer belongs to
|
||||
fn get_timeline_id(&self) -> TimelineId;
|
||||
|
||||
/// File name used for this layer, both in the pageserver's local filesystem
|
||||
/// state as well as in the remote storage.
|
||||
fn filename(&self) -> LayerFileName;
|
||||
|
||||
// Path to the layer file in the local filesystem.
|
||||
// `None` for `RemoteLayer`.
|
||||
fn local_path(&self) -> Option<PathBuf>;
|
||||
|
||||
/// Iterate through all keys and values stored in the layer
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_>;
|
||||
fn iter(&self) -> Result<LayerIter<'_>>;
|
||||
|
||||
/// Iterate through all keys stored in the layer. Returns key, lsn and value size
|
||||
/// It is used only for compaction and so is currently implemented only for DeltaLayer
|
||||
fn key_iter(&self) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + '_> {
|
||||
fn key_iter(&self) -> Result<LayerKeyIter<'_>> {
|
||||
panic!("Not implemented")
|
||||
}
|
||||
|
||||
/// Permanently remove this layer from disk.
|
||||
fn delete(&self) -> Result<()>;
|
||||
|
||||
/// Dump summary of the contents of the layer to stdout
|
||||
fn dump(&self, verbose: bool) -> Result<()>;
|
||||
fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
|
||||
None
|
||||
}
|
||||
|
||||
fn is_remote_layer(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// Returns None if the layer file size is not known.
|
||||
///
|
||||
/// Should not change over the lifetime of the layer object because
|
||||
/// current_physical_size is computed as the som of this value.
|
||||
fn file_size(&self) -> Option<u64>;
|
||||
}
|
||||
|
||||
pub fn downcast_remote_layer(
|
||||
layer: &Arc<dyn PersistentLayer>,
|
||||
) -> Option<std::sync::Arc<RemoteLayer>> {
|
||||
if layer.is_remote_layer() {
|
||||
Arc::clone(layer).downcast_remote_layer()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for dyn Layer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("Layer")
|
||||
.field("filename", &self.filename())
|
||||
.field("short_id", &self.short_id())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -29,15 +29,16 @@ use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::filename::{DeltaFileName, PathOrConf};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::storage_layer::{
|
||||
PersistentLayer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
@@ -52,6 +53,8 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use super::{DeltaFileName, Layer, LayerFileName, LayerIter, LayerKeyIter, PathOrConf};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
///
|
||||
@@ -178,6 +181,8 @@ pub struct DeltaLayer {
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn_range: Range<Lsn>,
|
||||
|
||||
pub file_size: u64,
|
||||
|
||||
inner: RwLock<DeltaLayerInner>,
|
||||
}
|
||||
|
||||
@@ -194,14 +199,6 @@ pub struct DeltaLayerInner {
|
||||
}
|
||||
|
||||
impl Layer for DeltaLayer {
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.tenant_id
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
@@ -209,13 +206,86 @@ impl Layer for DeltaLayer {
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.lsn_range.clone()
|
||||
}
|
||||
|
||||
fn filename(&self) -> PathBuf {
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
fn is_incremental(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
fn short_id(&self) -> String {
|
||||
self.filename().file_name()
|
||||
}
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, verbose: bool) -> Result<()> {
|
||||
println!(
|
||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
self.lsn_range.start,
|
||||
self.lsn_range.end
|
||||
);
|
||||
|
||||
if !verbose {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
println!(
|
||||
"index_start_blk: {}, root {}",
|
||||
inner.index_start_blk, inner.index_root_blk
|
||||
);
|
||||
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
tree_reader.dump()?;
|
||||
|
||||
let mut cursor = file.block_cursor();
|
||||
|
||||
// A subroutine to dump a single blob
|
||||
let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
|
||||
let buf = cursor.read_blob(blob_ref.pos())?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
Value::Image(img) => {
|
||||
format!(" img {} bytes", img.len())
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
||||
format!(
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)
|
||||
}
|
||||
};
|
||||
Ok(desc)
|
||||
};
|
||||
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|delta_key, val| {
|
||||
let blob_ref = BlobRef(val);
|
||||
let key = DeltaKey::extract_key_from_buf(delta_key);
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
|
||||
|
||||
let desc = match dump_blob(blob_ref) {
|
||||
Ok(desc) => desc,
|
||||
Err(err) => format!("ERROR: {}", err),
|
||||
};
|
||||
println!(" key {} at {}: {}", key, lsn, desc);
|
||||
true
|
||||
},
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data(
|
||||
@@ -302,29 +372,38 @@ impl Layer for DeltaLayer {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = anyhow::Result<(Key, Lsn, Value)>> + 'a> {
|
||||
let inner = match self.load() {
|
||||
Ok(inner) => inner,
|
||||
Err(e) => panic!("Failed to load a delta layer: {e:?}"),
|
||||
};
|
||||
|
||||
match DeltaValueIter::new(inner) {
|
||||
Ok(iter) => Box::new(iter),
|
||||
Err(err) => Box::new(std::iter::once(Err(err))),
|
||||
}
|
||||
impl PersistentLayer for DeltaLayer {
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.tenant_id
|
||||
}
|
||||
|
||||
fn key_iter<'a>(&'a self) -> Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'a> {
|
||||
let inner = match self.load() {
|
||||
Ok(inner) => inner,
|
||||
Err(e) => panic!("Failed to load a delta layer: {e:?}"),
|
||||
};
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
|
||||
match DeltaKeyIter::new(inner) {
|
||||
fn filename(&self) -> LayerFileName {
|
||||
self.layer_name().into()
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn iter(&self) -> Result<LayerIter<'_>> {
|
||||
let inner = self.load().context("load delta layer")?;
|
||||
Ok(match DeltaValueIter::new(inner) {
|
||||
Ok(iter) => Box::new(iter),
|
||||
Err(e) => panic!("Layer index is corrupted: {e:?}"),
|
||||
}
|
||||
Err(err) => Box::new(std::iter::once(Err(err))),
|
||||
})
|
||||
}
|
||||
|
||||
fn key_iter(&self) -> Result<LayerKeyIter<'_>> {
|
||||
let inner = self.load()?;
|
||||
Ok(Box::new(
|
||||
DeltaKeyIter::new(inner).context("Layer index is corrupted")?,
|
||||
))
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
@@ -333,87 +412,8 @@ impl Layer for DeltaLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn is_in_memory(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, verbose: bool) -> Result<()> {
|
||||
println!(
|
||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
self.lsn_range.start,
|
||||
self.lsn_range.end
|
||||
);
|
||||
|
||||
if !verbose {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
println!(
|
||||
"index_start_blk: {}, root {}",
|
||||
inner.index_start_blk, inner.index_root_blk
|
||||
);
|
||||
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
tree_reader.dump()?;
|
||||
|
||||
let mut cursor = file.block_cursor();
|
||||
|
||||
// A subroutine to dump a single blob
|
||||
let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
|
||||
let buf = cursor.read_blob(blob_ref.pos())?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
Value::Image(img) => {
|
||||
format!(" img {} bytes", img.len())
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
||||
format!(
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)
|
||||
}
|
||||
};
|
||||
Ok(desc)
|
||||
};
|
||||
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|delta_key, val| {
|
||||
let blob_ref = BlobRef(val);
|
||||
let key = DeltaKey::extract_key_from_buf(delta_key);
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
|
||||
|
||||
let desc = match dump_blob(blob_ref) {
|
||||
Ok(desc) => desc,
|
||||
Err(err) => format!("ERROR: {}", err),
|
||||
};
|
||||
println!(" key {} at {}: {}", key, lsn, desc);
|
||||
true
|
||||
},
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
fn file_size(&self) -> Option<u64> {
|
||||
Some(self.file_size)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -511,8 +511,8 @@ impl DeltaLayer {
|
||||
}
|
||||
}
|
||||
PathOrConf::Path(path) => {
|
||||
let actual_filename = Path::new(path.file_name().unwrap());
|
||||
let expected_filename = self.filename();
|
||||
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
|
||||
let expected_filename = self.filename().file_name();
|
||||
|
||||
if actual_filename != expected_filename {
|
||||
println!(
|
||||
@@ -539,6 +539,7 @@ impl DeltaLayer {
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
filename: &DeltaFileName,
|
||||
file_size: u64,
|
||||
) -> DeltaLayer {
|
||||
DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
@@ -546,6 +547,7 @@ impl DeltaLayer {
|
||||
tenant_id,
|
||||
key_range: filename.key_range.clone(),
|
||||
lsn_range: filename.lsn_range.clone(),
|
||||
file_size,
|
||||
inner: RwLock::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
file: None,
|
||||
@@ -558,21 +560,23 @@ impl DeltaLayer {
|
||||
/// Create a DeltaLayer struct representing an existing file on disk.
|
||||
///
|
||||
/// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
|
||||
pub fn new_for_path<F>(path: &Path, file: F) -> Result<Self>
|
||||
where
|
||||
F: FileExt,
|
||||
{
|
||||
pub fn new_for_path(path: &Path, file: File) -> Result<Self> {
|
||||
let mut summary_buf = Vec::new();
|
||||
summary_buf.resize(PAGE_SZ, 0);
|
||||
file.read_exact_at(&mut summary_buf, 0)?;
|
||||
let summary = Summary::des_prefix(&summary_buf)?;
|
||||
|
||||
let metadata = file
|
||||
.metadata()
|
||||
.context("get file metadata to determine size")?;
|
||||
|
||||
Ok(DeltaLayer {
|
||||
path_or_conf: PathOrConf::Path(path.to_path_buf()),
|
||||
timeline_id: summary.timeline_id,
|
||||
tenant_id: summary.tenant_id,
|
||||
key_range: summary.key_range,
|
||||
lsn_range: summary.lsn_range,
|
||||
file_size: metadata.len(),
|
||||
inner: RwLock::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
file: None,
|
||||
@@ -729,6 +733,10 @@ impl DeltaLayerWriterInner {
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
Summary::ser_into(&summary, &mut file)?;
|
||||
|
||||
let metadata = file
|
||||
.metadata()
|
||||
.context("get file metadata to determine size")?;
|
||||
|
||||
// Note: Because we opened the file in write-only mode, we cannot
|
||||
// reuse the same VirtualFile for reading later. That's why we don't
|
||||
// set inner.file here. The first read will have to re-open it.
|
||||
@@ -738,6 +746,7 @@ impl DeltaLayerWriterInner {
|
||||
timeline_id: self.timeline_id,
|
||||
key_range: self.key_start..key_end,
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
file_size: metadata.len(),
|
||||
inner: RwLock::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
file: None,
|
||||
@@ -7,11 +7,12 @@ use std::cmp::Ordering;
|
||||
use std::fmt;
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
// Note: Timeline::load_layer_map() relies on this sort order
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||
pub struct DeltaFileName {
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn_range: Range<Lsn>,
|
||||
@@ -101,7 +102,7 @@ impl fmt::Display for DeltaFileName {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
|
||||
pub struct ImageFileName {
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn: Lsn,
|
||||
@@ -172,6 +173,103 @@ impl fmt::Display for ImageFileName {
|
||||
)
|
||||
}
|
||||
}
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
|
||||
pub enum LayerFileName {
|
||||
Image(ImageFileName),
|
||||
Delta(DeltaFileName),
|
||||
#[cfg(test)]
|
||||
Test(String),
|
||||
}
|
||||
|
||||
impl LayerFileName {
|
||||
pub fn file_name(&self) -> String {
|
||||
match self {
|
||||
LayerFileName::Image(fname) => format!("{fname}"),
|
||||
LayerFileName::Delta(fname) => format!("{fname}"),
|
||||
#[cfg(test)]
|
||||
LayerFileName::Test(fname) => fname.to_string(),
|
||||
}
|
||||
}
|
||||
#[cfg(test)]
|
||||
pub(crate) fn new_test(name: &str) -> LayerFileName {
|
||||
LayerFileName::Test(name.to_owned())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ImageFileName> for LayerFileName {
|
||||
fn from(fname: ImageFileName) -> Self {
|
||||
LayerFileName::Image(fname)
|
||||
}
|
||||
}
|
||||
impl From<DeltaFileName> for LayerFileName {
|
||||
fn from(fname: DeltaFileName) -> Self {
|
||||
LayerFileName::Delta(fname)
|
||||
}
|
||||
}
|
||||
|
||||
// include a `/` in the name as an additional layer of robustness
|
||||
// because `/` chars are not allowed in UNIX paths
|
||||
#[cfg(test)]
|
||||
const LAYER_FILE_NAME_TEST_PREFIX: &str = "LAYER_FILE_NAME::test/";
|
||||
|
||||
impl FromStr for LayerFileName {
|
||||
type Err = String;
|
||||
|
||||
fn from_str(value: &str) -> Result<Self, Self::Err> {
|
||||
#[cfg(test)]
|
||||
if let Some(value) = value.strip_prefix(LAYER_FILE_NAME_TEST_PREFIX) {
|
||||
return Ok(LayerFileName::Test(value.to_owned()));
|
||||
}
|
||||
let delta = DeltaFileName::parse_str(value);
|
||||
let image = ImageFileName::parse_str(value);
|
||||
let ok = match (delta, image) {
|
||||
(None, None) => {
|
||||
return Err(format!(
|
||||
"neither delta nor image layer file name: {value:?}"
|
||||
))
|
||||
}
|
||||
(Some(delta), None) => LayerFileName::Delta(delta),
|
||||
(None, Some(image)) => LayerFileName::Image(image),
|
||||
(Some(_), Some(_)) => unreachable!(),
|
||||
};
|
||||
Ok(ok)
|
||||
}
|
||||
}
|
||||
|
||||
impl serde::Serialize for LayerFileName {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
match self {
|
||||
LayerFileName::Image(fname) => serializer.serialize_str(&format!("{}", fname)),
|
||||
LayerFileName::Delta(fname) => serializer.serialize_str(&format!("{}", fname)),
|
||||
#[cfg(test)]
|
||||
LayerFileName::Test(t) => {
|
||||
serializer.serialize_str(&format!("{LAYER_FILE_NAME_TEST_PREFIX}{t}"))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct LayerFileNameVisitor;
|
||||
|
||||
impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor {
|
||||
type Value = LayerFileName;
|
||||
|
||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(
|
||||
formatter,
|
||||
"a string that is a valid image or delta layer file name"
|
||||
)
|
||||
}
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
v.parse().map_err(|e| E::custom(e))
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper enum to hold a PageServerConf, or a path
|
||||
///
|
||||
@@ -21,12 +21,13 @@
|
||||
//! actual page images are stored in the "values" part.
|
||||
use crate::config::PageServerConf;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::repository::{Key, KEY_SIZE};
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::filename::{ImageFileName, PathOrConf};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::storage_layer::{
|
||||
PersistentLayer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
@@ -34,10 +35,11 @@ use bytes::Bytes;
|
||||
use hex;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs;
|
||||
use std::fs::{self, File};
|
||||
use std::io::Write;
|
||||
use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
use std::os::unix::prelude::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{RwLock, RwLockReadGuard};
|
||||
use tracing::*;
|
||||
@@ -48,6 +50,9 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use super::filename::{ImageFileName, LayerFileName, PathOrConf};
|
||||
use super::{Layer, LayerIter};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
///
|
||||
@@ -100,6 +105,7 @@ pub struct ImageLayer {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub key_range: Range<Key>,
|
||||
pub file_size: u64,
|
||||
|
||||
// This entry contains an image of all pages as of this LSN
|
||||
pub lsn: Lsn,
|
||||
@@ -120,22 +126,6 @@ pub struct ImageLayerInner {
|
||||
}
|
||||
|
||||
impl Layer for ImageLayer {
|
||||
fn filename(&self) -> PathBuf {
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.tenant_id
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
@@ -144,58 +134,12 @@ impl Layer for ImageLayer {
|
||||
// End-bound is exclusive
|
||||
self.lsn..(self.lsn + 1)
|
||||
}
|
||||
|
||||
/// Look up given page in the file
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
assert!(self.key_range.contains(&key));
|
||||
assert!(lsn_range.start >= self.lsn);
|
||||
assert!(lsn_range.end >= self.lsn);
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
if let Some(offset) = tree_reader.get(&keybuf)? {
|
||||
let blob = file.block_cursor().read_blob(offset).with_context(|| {
|
||||
format!(
|
||||
"failed to read value from data file {} at offset {}",
|
||||
self.filename().display(),
|
||||
offset
|
||||
)
|
||||
})?;
|
||||
let value = Bytes::from(blob);
|
||||
|
||||
reconstruct_state.img = Some((self.lsn, value));
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Missing)
|
||||
}
|
||||
}
|
||||
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
|
||||
todo!();
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn is_in_memory(&self) -> bool {
|
||||
false
|
||||
fn short_id(&self) -> String {
|
||||
self.filename().file_name()
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
@@ -223,6 +167,72 @@ impl Layer for ImageLayer {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Look up given page in the file
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
assert!(self.key_range.contains(&key));
|
||||
assert!(lsn_range.start >= self.lsn);
|
||||
assert!(lsn_range.end >= self.lsn);
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
if let Some(offset) = tree_reader.get(&keybuf)? {
|
||||
let blob = file.block_cursor().read_blob(offset).with_context(|| {
|
||||
format!(
|
||||
"failed to read value from data file {} at offset {}",
|
||||
self.path().display(),
|
||||
offset
|
||||
)
|
||||
})?;
|
||||
let value = Bytes::from(blob);
|
||||
|
||||
reconstruct_state.img = Some((self.lsn, value));
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Missing)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for ImageLayer {
|
||||
fn filename(&self) -> LayerFileName {
|
||||
self.layer_name().into()
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.tenant_id
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
fn iter(&self) -> Result<LayerIter<'_>> {
|
||||
unimplemented!();
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn file_size(&self) -> Option<u64> {
|
||||
Some(self.file_size)
|
||||
}
|
||||
}
|
||||
|
||||
impl ImageLayer {
|
||||
@@ -314,8 +324,8 @@ impl ImageLayer {
|
||||
}
|
||||
}
|
||||
PathOrConf::Path(path) => {
|
||||
let actual_filename = Path::new(path.file_name().unwrap());
|
||||
let expected_filename = self.filename();
|
||||
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
|
||||
let expected_filename = self.filename().file_name();
|
||||
|
||||
if actual_filename != expected_filename {
|
||||
println!(
|
||||
@@ -339,6 +349,7 @@ impl ImageLayer {
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
filename: &ImageFileName,
|
||||
file_size: u64,
|
||||
) -> ImageLayer {
|
||||
ImageLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
@@ -346,6 +357,7 @@ impl ImageLayer {
|
||||
tenant_id,
|
||||
key_range: filename.key_range.clone(),
|
||||
lsn: filename.lsn,
|
||||
file_size,
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
loaded: false,
|
||||
file: None,
|
||||
@@ -358,21 +370,21 @@ impl ImageLayer {
|
||||
/// Create an ImageLayer struct representing an existing file on disk.
|
||||
///
|
||||
/// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
|
||||
pub fn new_for_path<F>(path: &Path, file: F) -> Result<ImageLayer>
|
||||
where
|
||||
F: std::os::unix::prelude::FileExt,
|
||||
{
|
||||
pub fn new_for_path(path: &Path, file: File) -> Result<ImageLayer> {
|
||||
let mut summary_buf = Vec::new();
|
||||
summary_buf.resize(PAGE_SZ, 0);
|
||||
file.read_exact_at(&mut summary_buf, 0)?;
|
||||
let summary = Summary::des_prefix(&summary_buf)?;
|
||||
|
||||
let metadata = file
|
||||
.metadata()
|
||||
.context("get file metadata to determine size")?;
|
||||
Ok(ImageLayer {
|
||||
path_or_conf: PathOrConf::Path(path.to_path_buf()),
|
||||
timeline_id: summary.timeline_id,
|
||||
tenant_id: summary.tenant_id,
|
||||
key_range: summary.key_range,
|
||||
lsn: summary.lsn,
|
||||
file_size: metadata.len(),
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
file: None,
|
||||
loaded: false,
|
||||
@@ -518,6 +530,10 @@ impl ImageLayerWriterInner {
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
Summary::ser_into(&summary, &mut file)?;
|
||||
|
||||
let metadata = file
|
||||
.metadata()
|
||||
.context("get metadata to determine file size")?;
|
||||
|
||||
// Note: Because we open the file in write-only mode, we cannot
|
||||
// reuse the same VirtualFile for reading later. That's why we don't
|
||||
// set inner.file here. The first read will have to re-open it.
|
||||
@@ -527,6 +543,7 @@ impl ImageLayerWriterInner {
|
||||
tenant_id: self.tenant_id,
|
||||
key_range: self.key_range.clone(),
|
||||
lsn: self.lsn,
|
||||
file_size: metadata.len(),
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
loaded: false,
|
||||
file: None,
|
||||
@@ -551,7 +568,7 @@ impl ImageLayerWriterInner {
|
||||
lsn: self.lsn,
|
||||
},
|
||||
);
|
||||
std::fs::rename(self.path, &final_path)?;
|
||||
std::fs::rename(self.path, final_path)?;
|
||||
|
||||
trace!("created image layer {}", layer.path().display());
|
||||
|
||||
@@ -8,11 +8,10 @@ use crate::config::PageServerConf;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter};
|
||||
use crate::tenant::block_io::BlockReader;
|
||||
use crate::tenant::delta_layer::{DeltaLayer, DeltaLayerWriter};
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
|
||||
use crate::walrecord;
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use anyhow::{ensure, Result};
|
||||
use std::cell::RefCell;
|
||||
use std::collections::HashMap;
|
||||
use tracing::*;
|
||||
@@ -26,9 +25,10 @@ use utils::{
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::RwLock;
|
||||
|
||||
use super::{DeltaLayer, DeltaLayerWriter, Layer};
|
||||
|
||||
thread_local! {
|
||||
/// A buffer for serializing object during [`InMemoryLayer::put_value`].
|
||||
/// This buffer is reused for each serialization to avoid additional malloc calls.
|
||||
@@ -75,33 +75,13 @@ impl InMemoryLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
impl Layer for InMemoryLayer {
|
||||
// An in-memory layer can be spilled to disk into ephemeral file,
|
||||
// This function is used only for debugging, so we don't need to be very precise.
|
||||
// Construct a filename as if it was a delta layer.
|
||||
fn filename(&self) -> PathBuf {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
|
||||
|
||||
PathBuf::from(format!(
|
||||
"inmem-{:016X}-{:016X}",
|
||||
self.start_lsn.0, end_lsn.0
|
||||
))
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
None
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.tenant_id
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
impl InMemoryLayer {
|
||||
pub fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timeline_id
|
||||
}
|
||||
}
|
||||
|
||||
impl Layer for InMemoryLayer {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
Key::MIN..Key::MAX
|
||||
}
|
||||
@@ -117,72 +97,16 @@ impl Layer for InMemoryLayer {
|
||||
self.start_lsn..end_lsn
|
||||
}
|
||||
|
||||
/// Look up given value in the layer.
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let mut reader = inner.file.block_cursor();
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
let buf = reader.read_blob(*pos)?;
|
||||
let value = Value::des(&buf)?;
|
||||
match value {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((*entry_lsn, img));
|
||||
return Ok(ValueReconstructResult::Complete);
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((*entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// release lock on 'inner'
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
|
||||
todo!();
|
||||
}
|
||||
|
||||
/// Nothing to do here. When you drop the last reference to the layer, it will
|
||||
/// be deallocated.
|
||||
fn delete(&self) -> Result<()> {
|
||||
bail!("can't delete an InMemoryLayer")
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
// in-memory layer is always considered incremental.
|
||||
true
|
||||
}
|
||||
|
||||
fn is_in_memory(&self) -> bool {
|
||||
true
|
||||
fn short_id(&self) -> String {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
|
||||
format!("inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
@@ -235,6 +159,55 @@ impl Layer for InMemoryLayer {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Look up given value in the layer.
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let mut reader = inner.file.block_cursor();
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
let buf = reader.read_blob(*pos)?;
|
||||
let value = Value::des(&buf)?;
|
||||
match value {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((*entry_lsn, img));
|
||||
return Ok(ValueReconstructResult::Complete);
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((*entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// release lock on 'inner'
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl InMemoryLayer {
|
||||
210
pageserver/src/tenant/storage_layer/remote_layer.rs
Normal file
210
pageserver/src/tenant/storage_layer/remote_layer.rs
Normal file
@@ -0,0 +1,210 @@
|
||||
//! A RemoteLayer is an in-memory placeholder for a layer file that exists
|
||||
//! in remote storage.
|
||||
//!
|
||||
use crate::config::PageServerConf;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use anyhow::{bail, Result};
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use super::filename::{DeltaFileName, ImageFileName, LayerFileName};
|
||||
use super::image_layer::ImageLayer;
|
||||
use super::{DeltaLayer, LayerIter, LayerKeyIter, PersistentLayer};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct RemoteLayer {
|
||||
tenantid: TenantId,
|
||||
timelineid: TimelineId,
|
||||
key_range: Range<Key>,
|
||||
lsn_range: Range<Lsn>,
|
||||
|
||||
pub file_name: LayerFileName,
|
||||
|
||||
pub layer_metadata: LayerFileMetadata,
|
||||
|
||||
is_delta: bool,
|
||||
|
||||
is_incremental: bool,
|
||||
|
||||
pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
|
||||
}
|
||||
|
||||
impl Layer for RemoteLayer {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.lsn_range.clone()
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
_key: Key,
|
||||
_lsn_range: Range<Lsn>,
|
||||
_reconstruct_state: &mut ValueReconstructState,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
bail!(
|
||||
"layer {} needs to be downloaded",
|
||||
self.filename().file_name()
|
||||
);
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.is_incremental
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, _verbose: bool) -> Result<()> {
|
||||
println!(
|
||||
"----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
|
||||
self.tenantid,
|
||||
self.timelineid,
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
self.lsn_range.start,
|
||||
self.lsn_range.end
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn short_id(&self) -> String {
|
||||
self.filename().file_name()
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayer for RemoteLayer {
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.tenantid
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
self.timelineid
|
||||
}
|
||||
|
||||
fn filename(&self) -> LayerFileName {
|
||||
if self.is_delta {
|
||||
DeltaFileName {
|
||||
key_range: self.key_range.clone(),
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
}
|
||||
.into()
|
||||
} else {
|
||||
ImageFileName {
|
||||
key_range: self.key_range.clone(),
|
||||
lsn: self.lsn_range.start,
|
||||
}
|
||||
.into()
|
||||
}
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
None
|
||||
}
|
||||
|
||||
fn iter(&self) -> Result<LayerIter<'_>> {
|
||||
bail!("cannot iterate a remote layer");
|
||||
}
|
||||
|
||||
fn key_iter(&self) -> Result<LayerKeyIter<'_>> {
|
||||
bail!("cannot iterate a remote layer");
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
|
||||
Some(self)
|
||||
}
|
||||
|
||||
fn is_remote_layer(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn file_size(&self) -> Option<u64> {
|
||||
self.layer_metadata.file_size()
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteLayer {
|
||||
pub fn new_img(
|
||||
tenantid: TenantId,
|
||||
timelineid: TimelineId,
|
||||
fname: &ImageFileName,
|
||||
layer_metadata: &LayerFileMetadata,
|
||||
) -> RemoteLayer {
|
||||
RemoteLayer {
|
||||
tenantid,
|
||||
timelineid,
|
||||
key_range: fname.key_range.clone(),
|
||||
lsn_range: fname.lsn..(fname.lsn + 1),
|
||||
is_delta: false,
|
||||
is_incremental: false,
|
||||
file_name: fname.to_owned().into(),
|
||||
layer_metadata: layer_metadata.clone(),
|
||||
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_delta(
|
||||
tenantid: TenantId,
|
||||
timelineid: TimelineId,
|
||||
fname: &DeltaFileName,
|
||||
layer_metadata: &LayerFileMetadata,
|
||||
) -> RemoteLayer {
|
||||
RemoteLayer {
|
||||
tenantid,
|
||||
timelineid,
|
||||
key_range: fname.key_range.clone(),
|
||||
lsn_range: fname.lsn_range.clone(),
|
||||
is_delta: true,
|
||||
is_incremental: true,
|
||||
file_name: fname.to_owned().into(),
|
||||
layer_metadata: layer_metadata.clone(),
|
||||
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a Layer struct representing this layer, after it has been downloaded.
|
||||
pub fn create_downloaded_layer(
|
||||
&self,
|
||||
conf: &'static PageServerConf,
|
||||
file_size: u64,
|
||||
) -> Arc<dyn PersistentLayer> {
|
||||
if self.is_delta {
|
||||
let fname = DeltaFileName {
|
||||
key_range: self.key_range.clone(),
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
};
|
||||
Arc::new(DeltaLayer::new(
|
||||
conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&fname,
|
||||
file_size,
|
||||
))
|
||||
} else {
|
||||
let fname = ImageFileName {
|
||||
key_range: self.key_range.clone(),
|
||||
lsn: self.lsn_range.start,
|
||||
};
|
||||
Arc::new(ImageLayer::new(
|
||||
conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&fname,
|
||||
file_size,
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -8,8 +8,8 @@ use std::time::Duration;
|
||||
use crate::metrics::TENANT_TASK_EVENTS;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::mgr;
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use crate::tenant_mgr;
|
||||
use tracing::*;
|
||||
use utils::id::TenantId;
|
||||
|
||||
@@ -127,7 +127,7 @@ async fn gc_loop(tenant_id: TenantId) {
|
||||
} else {
|
||||
// Run gc
|
||||
if gc_horizon > 0 {
|
||||
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
|
||||
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval()).await
|
||||
{
|
||||
sleep_duration = wait_duration;
|
||||
error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||
@@ -155,7 +155,7 @@ async fn wait_for_active_tenant(
|
||||
wait: Duration,
|
||||
) -> ControlFlow<(), Arc<Tenant>> {
|
||||
let tenant = loop {
|
||||
match tenant_mgr::get_tenant(tenant_id, false) {
|
||||
match mgr::get_tenant(tenant_id, false).await {
|
||||
Ok(tenant) => break tenant,
|
||||
Err(e) => {
|
||||
error!("Failed to get a tenant {tenant_id}: {e:#}");
|
||||
File diff suppressed because it is too large
Load Diff
213
pageserver/src/tenant/upload_queue.rs
Normal file
213
pageserver/src/tenant/upload_queue.rs
Normal file
@@ -0,0 +1,213 @@
|
||||
use crate::metrics::RemoteOpFileKind;
|
||||
|
||||
use super::storage_layer::LayerFileName;
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::remote_timeline_client::index::IndexPart;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::fmt::Debug;
|
||||
|
||||
use std::sync::Arc;
|
||||
use tracing::info;
|
||||
|
||||
use std::sync::atomic::AtomicU32;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
// clippy warns that Uninitialized is much smaller than Initialized, which wastes
|
||||
// memory for Uninitialized variants. Doesn't matter in practice, there are not
|
||||
// that many upload queues in a running pageserver, and most of them are initialized
|
||||
// anyway.
|
||||
#[allow(clippy::large_enum_variant)]
|
||||
pub(crate) enum UploadQueue {
|
||||
Uninitialized,
|
||||
Initialized(UploadQueueInitialized),
|
||||
Stopped(UploadQueueStopped),
|
||||
}
|
||||
|
||||
impl UploadQueue {
|
||||
fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
UploadQueue::Uninitialized => "Uninitialized",
|
||||
UploadQueue::Initialized(_) => "Initialized",
|
||||
UploadQueue::Stopped(_) => "Stopped",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This keeps track of queued and in-progress tasks.
|
||||
pub(crate) struct UploadQueueInitialized {
|
||||
/// Counter to assign task IDs
|
||||
pub(crate) task_counter: u64,
|
||||
|
||||
/// All layer files stored in the remote storage, taking into account all
|
||||
/// in-progress and queued operations
|
||||
pub(crate) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
|
||||
|
||||
/// How many file uploads or deletions been scheduled, since the
|
||||
/// last (scheduling of) metadata index upload?
|
||||
pub(crate) latest_files_changes_since_metadata_upload_scheduled: u64,
|
||||
|
||||
/// Metadata stored in the remote storage, taking into account all
|
||||
/// in-progress and queued operations.
|
||||
/// DANGER: do not return to outside world, e.g., safekeepers.
|
||||
pub(crate) latest_metadata: TimelineMetadata,
|
||||
|
||||
/// `disk_consistent_lsn` from the last metadata file that was successfully
|
||||
/// uploaded. `Lsn(0)` if nothing was uploaded yet.
|
||||
/// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
|
||||
/// Safekeeper can rely on it to make decisions for WAL storage.
|
||||
pub(crate) last_uploaded_consistent_lsn: Lsn,
|
||||
|
||||
// Breakdown of different kinds of tasks currently in-progress
|
||||
pub(crate) num_inprogress_layer_uploads: usize,
|
||||
pub(crate) num_inprogress_metadata_uploads: usize,
|
||||
pub(crate) num_inprogress_deletions: usize,
|
||||
|
||||
/// Tasks that are currently in-progress. In-progress means that a tokio Task
|
||||
/// has been launched for it. An in-progress task can be busy uploading, but it can
|
||||
/// also be waiting on the `concurrency_limiter` Semaphore in S3Bucket, or it can
|
||||
/// be waiting for retry in `exponential_backoff`.
|
||||
pub(crate) inprogress_tasks: HashMap<u64, Arc<UploadTask>>,
|
||||
|
||||
/// Queued operations that have not been launched yet. They might depend on previous
|
||||
/// tasks to finish. For example, metadata upload cannot be performed before all
|
||||
/// preceding layer file uploads have completed.
|
||||
pub(crate) queued_operations: VecDeque<UploadOp>,
|
||||
}
|
||||
|
||||
pub(crate) struct UploadQueueStopped {
|
||||
pub(crate) last_uploaded_consistent_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl UploadQueue {
|
||||
pub(crate) fn initialize_empty_remote(
|
||||
&mut self,
|
||||
metadata: &TimelineMetadata,
|
||||
) -> anyhow::Result<&mut UploadQueueInitialized> {
|
||||
match self {
|
||||
UploadQueue::Uninitialized => (),
|
||||
UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
|
||||
anyhow::bail!("already initialized, state {}", self.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
info!("initializing upload queue for empty remote");
|
||||
|
||||
let state = UploadQueueInitialized {
|
||||
// As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
|
||||
latest_files: HashMap::new(),
|
||||
latest_files_changes_since_metadata_upload_scheduled: 0,
|
||||
latest_metadata: metadata.clone(),
|
||||
// We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
|
||||
// safekeepers from garbage-collecting anything.
|
||||
last_uploaded_consistent_lsn: Lsn(0),
|
||||
// what follows are boring default initializations
|
||||
task_counter: 0,
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::new(),
|
||||
queued_operations: VecDeque::new(),
|
||||
};
|
||||
|
||||
*self = UploadQueue::Initialized(state);
|
||||
Ok(self.initialized_mut().expect("we just set it"))
|
||||
}
|
||||
|
||||
pub(crate) fn initialize_with_current_remote_index_part(
|
||||
&mut self,
|
||||
index_part: &IndexPart,
|
||||
) -> anyhow::Result<&mut UploadQueueInitialized> {
|
||||
match self {
|
||||
UploadQueue::Uninitialized => (),
|
||||
UploadQueue::Initialized(_) | UploadQueue::Stopped(_) => {
|
||||
anyhow::bail!("already initialized, state {}", self.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
|
||||
for layer_name in &index_part.timeline_layers {
|
||||
let layer_metadata = index_part
|
||||
.layer_metadata
|
||||
.get(layer_name)
|
||||
.map(LayerFileMetadata::from)
|
||||
.unwrap_or(LayerFileMetadata::MISSING);
|
||||
files.insert(layer_name.to_owned(), layer_metadata);
|
||||
}
|
||||
|
||||
let index_part_metadata = index_part.parse_metadata()?;
|
||||
info!(
|
||||
"initializing upload queue with remote index_part.disk_consistent_lsn: {}",
|
||||
index_part_metadata.disk_consistent_lsn()
|
||||
);
|
||||
|
||||
let state = UploadQueueInitialized {
|
||||
latest_files: files,
|
||||
latest_files_changes_since_metadata_upload_scheduled: 0,
|
||||
latest_metadata: index_part_metadata.clone(),
|
||||
last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
|
||||
// what follows are boring default initializations
|
||||
task_counter: 0,
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::new(),
|
||||
queued_operations: VecDeque::new(),
|
||||
};
|
||||
|
||||
*self = UploadQueue::Initialized(state);
|
||||
Ok(self.initialized_mut().expect("we just set it"))
|
||||
}
|
||||
|
||||
pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
|
||||
match self {
|
||||
UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
|
||||
anyhow::bail!("queue is in state {}", self.as_str())
|
||||
}
|
||||
UploadQueue::Initialized(x) => Ok(x),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An in-progress upload or delete task.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct UploadTask {
|
||||
/// Unique ID of this task. Used as the key in `inprogress_tasks` above.
|
||||
pub(crate) task_id: u64,
|
||||
pub(crate) retries: AtomicU32,
|
||||
|
||||
pub(crate) op: UploadOp,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum UploadOp {
|
||||
/// Upload a layer file
|
||||
UploadLayer(LayerFileName, LayerFileMetadata),
|
||||
|
||||
/// Upload the metadata file
|
||||
UploadMetadata(IndexPart, Lsn),
|
||||
|
||||
/// Delete a file.
|
||||
Delete(RemoteOpFileKind, LayerFileName),
|
||||
|
||||
/// Barrier. When the barrier operation is reached,
|
||||
Barrier(tokio::sync::watch::Sender<()>),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for UploadOp {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
match self {
|
||||
UploadOp::UploadLayer(path, metadata) => {
|
||||
write!(
|
||||
f,
|
||||
"UploadLayer({}, size={:?})",
|
||||
path.file_name(),
|
||||
metadata.file_size()
|
||||
)
|
||||
}
|
||||
UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
|
||||
UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()),
|
||||
UploadOp::Barrier(_) => write!(f, "Barrier"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,427 +0,0 @@
|
||||
//! This module acts as a switchboard to access different repositories managed by this
|
||||
//! page server.
|
||||
|
||||
use std::collections::hash_map;
|
||||
use std::ffi::OsStr;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use tracing::*;
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
|
||||
use utils::fs_ext::PathExt;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
mod tenants_state {
|
||||
use once_cell::sync::Lazy;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard},
|
||||
};
|
||||
use utils::id::TenantId;
|
||||
|
||||
use crate::tenant::Tenant;
|
||||
|
||||
static TENANTS: Lazy<RwLock<HashMap<TenantId, Arc<Tenant>>>> =
|
||||
Lazy::new(|| RwLock::new(HashMap::new()));
|
||||
|
||||
pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap<TenantId, Arc<Tenant>>> {
|
||||
TENANTS
|
||||
.read()
|
||||
.expect("Failed to read() tenants lock, it got poisoned")
|
||||
}
|
||||
|
||||
pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap<TenantId, Arc<Tenant>>> {
|
||||
TENANTS
|
||||
.write()
|
||||
.expect("Failed to write() tenants lock, it got poisoned")
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize repositories with locally available timelines.
|
||||
/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
|
||||
/// are scheduled for download and added to the tenant once download is completed.
|
||||
pub fn init_tenant_mgr(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<()> {
|
||||
let _entered = info_span!("init_tenant_mgr").entered();
|
||||
|
||||
// Scan local filesystem for attached tenants
|
||||
let mut number_of_tenants = 0;
|
||||
let tenants_dir = conf.tenants_path();
|
||||
for dir_entry in std::fs::read_dir(&tenants_dir)
|
||||
.with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
|
||||
{
|
||||
match &dir_entry {
|
||||
Ok(dir_entry) => {
|
||||
let tenant_dir_path = dir_entry.path();
|
||||
if crate::is_temporary(&tenant_dir_path) {
|
||||
info!(
|
||||
"Found temporary tenant directory, removing: {}",
|
||||
tenant_dir_path.display()
|
||||
);
|
||||
if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) {
|
||||
error!(
|
||||
"Failed to remove temporary directory '{}': {:?}",
|
||||
tenant_dir_path.display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
} else {
|
||||
match load_local_tenant(conf, &tenant_dir_path, remote_storage.clone()) {
|
||||
Ok(Some(tenant)) => {
|
||||
tenants_state::write_tenants().insert(tenant.tenant_id(), tenant);
|
||||
number_of_tenants += 1;
|
||||
}
|
||||
Ok(None) => {
|
||||
// This case happens if we crash during attach before creating the attach marker file
|
||||
if let Err(e) = std::fs::remove_dir(&tenant_dir_path) {
|
||||
error!(
|
||||
"Failed to remove empty tenant directory '{}': {e:#}",
|
||||
tenant_dir_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}",
|
||||
tenants_dir.display(),
|
||||
dir_entry,
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
// On error, print it, but continue with the other tenants. If we error out
|
||||
// here, the pageserver startup fails altogether, causing outage for *all*
|
||||
// tenants. That seems worse.
|
||||
error!(
|
||||
"Failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
|
||||
dir_entry,
|
||||
tenants_dir.display(),
|
||||
e,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!("Processed {number_of_tenants} local tenants at startup");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn load_local_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_path: &Path,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<Option<Arc<Tenant>>> {
|
||||
if !tenant_path.is_dir() {
|
||||
anyhow::bail!("tenant_path is not a directory: {tenant_path:?}")
|
||||
}
|
||||
|
||||
let is_empty = tenant_path
|
||||
.is_empty_dir()
|
||||
.context("check whether tenant_path is an empty dir")?;
|
||||
if is_empty {
|
||||
info!("skipping empty tenant directory {tenant_path:?}");
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let tenant_id = tenant_path
|
||||
.file_name()
|
||||
.and_then(OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<TenantId>()
|
||||
.context("Could not parse tenant id out of the tenant dir name")?;
|
||||
|
||||
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
|
||||
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
Tenant::spawn_attach(conf, tenant_id, &remote_storage)
|
||||
} else {
|
||||
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
|
||||
Tenant::create_broken_tenant(conf, tenant_id)
|
||||
}
|
||||
} else {
|
||||
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
|
||||
// Start loading the tenant into memory. It will initially be in Loading state.
|
||||
Tenant::spawn_load(conf, tenant_id, remote_storage)
|
||||
};
|
||||
Ok(Some(tenant))
|
||||
}
|
||||
|
||||
///
|
||||
/// Shut down all tenants. This runs as part of pageserver shutdown.
|
||||
///
|
||||
pub async fn shutdown_all_tenants() {
|
||||
let tenants_to_shut_down = {
|
||||
let mut m = tenants_state::write_tenants();
|
||||
let mut tenants_to_shut_down = Vec::with_capacity(m.len());
|
||||
for (_, tenant) in m.drain() {
|
||||
if tenant.is_active() {
|
||||
// updates tenant state, forbidding new GC and compaction iterations from starting
|
||||
tenant.set_stopping();
|
||||
tenants_to_shut_down.push(tenant)
|
||||
}
|
||||
}
|
||||
drop(m);
|
||||
tenants_to_shut_down
|
||||
};
|
||||
|
||||
// Shut down all existing walreceiver connections and stop accepting the new ones.
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await;
|
||||
|
||||
// Ok, no background tasks running anymore. Flush any remaining data in
|
||||
// memory to disk.
|
||||
//
|
||||
// We assume that any incoming connections that might request pages from
|
||||
// the tenant have already been terminated by the caller, so there
|
||||
// should be no more activity in any of the repositories.
|
||||
//
|
||||
// On error, log it but continue with the shutdown for other tenants.
|
||||
for tenant in tenants_to_shut_down {
|
||||
let tenant_id = tenant.tenant_id();
|
||||
debug!("shutdown tenant {tenant_id}");
|
||||
|
||||
if let Err(err) = tenant.checkpoint().await {
|
||||
error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn create_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<Option<Arc<Tenant>>> {
|
||||
match tenants_state::write_tenants().entry(tenant_id) {
|
||||
hash_map::Entry::Occupied(_) => {
|
||||
debug!("tenant {tenant_id} already exists");
|
||||
Ok(None)
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
// Hold the write_tenants() lock, since all of this is local IO.
|
||||
// If this section ever becomes contentious, introduce a new `TenantState::Creating`.
|
||||
let tenant_directory =
|
||||
super::tenant::create_tenant_files(conf, tenant_conf, tenant_id)?;
|
||||
let created_tenant = load_local_tenant(conf, &tenant_directory, remote_storage)?;
|
||||
match created_tenant {
|
||||
None => {
|
||||
// We get None in case the directory is empty.
|
||||
// This shouldn't happen here, because we just created the directory.
|
||||
// So, skip any cleanup work for now, we don't know how we reached this state.
|
||||
anyhow::bail!("we just created the tenant directory, it can't be empty");
|
||||
}
|
||||
Some(tenant) => {
|
||||
anyhow::ensure!(
|
||||
tenant_id == tenant.tenant_id(),
|
||||
"loaded created tenant has unexpected tenant id (expect {} != actual {})",
|
||||
tenant_id,
|
||||
tenant.tenant_id()
|
||||
);
|
||||
v.insert(Arc::clone(&tenant));
|
||||
Ok(Some(tenant))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update_tenant_config(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("configuring tenant {tenant_id}");
|
||||
get_tenant(tenant_id, true)?.update_tenant_config(tenant_conf);
|
||||
Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
|
||||
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
|
||||
pub fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result<Arc<Tenant>> {
|
||||
let m = tenants_state::read_tenants();
|
||||
let tenant = m
|
||||
.get(&tenant_id)
|
||||
.with_context(|| format!("Tenant {tenant_id} not found in the local state"))?;
|
||||
if active_only && !tenant.is_active() {
|
||||
anyhow::bail!(
|
||||
"Tenant {tenant_id} is not active. Current state: {:?}",
|
||||
tenant.current_state()
|
||||
)
|
||||
} else {
|
||||
Ok(Arc::clone(tenant))
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> {
|
||||
// Start with the shutdown of timeline tasks (this shuts down the walreceiver)
|
||||
// It is important that we do not take locks here, and do not check whether the timeline exists
|
||||
// because if we hold tenants_state::write_tenants() while awaiting for the tasks to join
|
||||
// we cannot create new timelines and tenants, and that can take quite some time,
|
||||
// it can even become stuck due to a bug making whole pageserver unavailable for some operations
|
||||
// so this is the way how we deal with concurrent delete requests: shutdown everythig, wait for confirmation
|
||||
// and then try to actually remove timeline from inmemory state and this is the point when concurrent requests
|
||||
// will synchronize and either fail with the not found error or succeed
|
||||
|
||||
debug!("waiting for wal receiver to shutdown");
|
||||
task_mgr::shutdown_tasks(
|
||||
Some(TaskKind::WalReceiverManager),
|
||||
Some(tenant_id),
|
||||
Some(timeline_id),
|
||||
)
|
||||
.await;
|
||||
debug!("wal receiver shutdown confirmed");
|
||||
|
||||
info!("waiting for timeline tasks to shutdown");
|
||||
task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await;
|
||||
info!("timeline task shutdown completed");
|
||||
match get_tenant(tenant_id, true) {
|
||||
Ok(tenant) => {
|
||||
tenant.delete_timeline(timeline_id).await?;
|
||||
}
|
||||
Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn detach_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
let tenant = match {
|
||||
let mut tenants_accessor = tenants_state::write_tenants();
|
||||
tenants_accessor.remove(&tenant_id)
|
||||
} {
|
||||
Some(tenant) => tenant,
|
||||
None => anyhow::bail!("Tenant not found for id {tenant_id}"),
|
||||
};
|
||||
|
||||
tenant.set_stopping();
|
||||
// shutdown all tenant and timeline tasks: gc, compaction, page service)
|
||||
task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
|
||||
|
||||
// If removal fails there will be no way to successfully retry detach,
|
||||
// because the tenant no longer exists in the in-memory map. And it needs to be removed from it
|
||||
// before we remove files, because it contains references to tenant
|
||||
// which references ephemeral files which are deleted on drop. So if we keep these references,
|
||||
// we will attempt to remove files which no longer exist. This can be fixed by having shutdown
|
||||
// mechanism for tenant that will clean temporary data to avoid any references to ephemeral files
|
||||
let local_tenant_directory = conf.tenant_path(&tenant_id);
|
||||
fs::remove_dir_all(&local_tenant_directory).with_context(|| {
|
||||
format!(
|
||||
"Failed to remove local tenant directory '{}'",
|
||||
local_tenant_directory.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Get list of tenants, for the mgmt API
|
||||
///
|
||||
pub fn list_tenants() -> Vec<(TenantId, TenantState)> {
|
||||
tenants_state::read_tenants()
|
||||
.iter()
|
||||
.map(|(id, tenant)| (*id, tenant.current_state()))
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Execute Attach mgmt API command.
|
||||
///
|
||||
/// Downloading all the tenant data is performed in the background, this merely
|
||||
/// spawns the background task and returns quickly.
|
||||
pub async fn attach_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
) -> anyhow::Result<()> {
|
||||
match tenants_state::write_tenants().entry(tenant_id) {
|
||||
hash_map::Entry::Occupied(e) => {
|
||||
// Cannot attach a tenant that already exists. The error message depends on
|
||||
// the state it's in.
|
||||
match e.get().current_state() {
|
||||
TenantState::Attaching => {
|
||||
anyhow::bail!("tenant {tenant_id} attach is already in progress")
|
||||
}
|
||||
current_state => {
|
||||
anyhow::bail!("tenant already exists, current state: {current_state:?}")
|
||||
}
|
||||
}
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage);
|
||||
v.insert(tenant);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use {
|
||||
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
|
||||
utils::http::error::ApiError,
|
||||
};
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub fn immediate_gc(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
gc_req: TimelineGcRequest,
|
||||
) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
|
||||
let guard = tenants_state::read_tenants();
|
||||
|
||||
let tenant = guard
|
||||
.get(&tenant_id)
|
||||
.map(Arc::clone)
|
||||
.with_context(|| format!("Tenant {tenant_id} not found"))
|
||||
.map_err(ApiError::NotFound)?;
|
||||
|
||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
|
||||
// Use tenant's pitr setting
|
||||
let pitr = tenant.get_pitr_interval();
|
||||
|
||||
// Run in task_mgr to avoid race with detach operation
|
||||
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
|
||||
task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
TaskKind::GarbageCollector,
|
||||
Some(tenant_id),
|
||||
Some(timeline_id),
|
||||
&format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
|
||||
false,
|
||||
async move {
|
||||
fail::fail_point!("immediate_gc_task_pre");
|
||||
let result = tenant
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
|
||||
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await;
|
||||
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
||||
// better once the types support it.
|
||||
match task_done.send(result) {
|
||||
Ok(_) => (),
|
||||
Err(result) => error!("failed to send gc result: {result:?}"),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
);
|
||||
|
||||
// drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
|
||||
drop(guard);
|
||||
|
||||
Ok(wait_task_done)
|
||||
}
|
||||
@@ -12,7 +12,7 @@
|
||||
//!
|
||||
use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME};
|
||||
use once_cell::sync::OnceCell;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -240,6 +240,10 @@ impl VirtualFile {
|
||||
self.with_file("fsync", |file| file.sync_all())?
|
||||
}
|
||||
|
||||
pub fn metadata(&self) -> Result<fs::Metadata, Error> {
|
||||
self.with_file("metadata", |file| file.metadata())?
|
||||
}
|
||||
|
||||
/// Helper function that looks up the underlying File for this VirtualFile,
|
||||
/// opening it and evicting some other File if necessary. It calls 'func'
|
||||
/// with the physical File.
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -6,7 +6,7 @@
|
||||
//! hence WAL receiver needs to react on such events.
|
||||
//!
|
||||
//! * get a broker subscription, stream data from it to determine that a timeline needs WAL streaming.
|
||||
//! For that, it watches specific keys in etcd broker and pulls the relevant data periodically.
|
||||
//! For that, it watches specific keys in storage_broker and pulls the relevant data periodically.
|
||||
//! The data is produced by safekeepers, that push it periodically and pull it to synchronize between each other.
|
||||
//! Without this data, no WAL streaming is possible currently.
|
||||
//!
|
||||
@@ -26,57 +26,52 @@ mod walreceiver_connection;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::task_mgr::WALRECEIVER_RUNTIME;
|
||||
|
||||
use anyhow::{ensure, Context};
|
||||
use etcd_broker::Client;
|
||||
use itertools::Itertools;
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::OnceCell;
|
||||
use std::future::Future;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::sync::watch;
|
||||
use tracing::*;
|
||||
use url::Url;
|
||||
|
||||
pub use connection_manager::spawn_connection_manager_task;
|
||||
|
||||
static ETCD_CLIENT: OnceCell<Client> = OnceCell::new();
|
||||
static BROKER_CLIENT: OnceCell<BrokerClientChannel> = OnceCell::new();
|
||||
|
||||
///
|
||||
/// Initialize the etcd client. This must be called once at page server startup.
|
||||
/// Initialize the broker client. This must be called once at page server startup.
|
||||
///
|
||||
pub async fn init_etcd_client(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
let etcd_endpoints = conf.broker_endpoints.clone();
|
||||
ensure!(
|
||||
!etcd_endpoints.is_empty(),
|
||||
"Cannot start wal receiver: etcd endpoints are empty"
|
||||
);
|
||||
pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
let broker_endpoint = conf.broker_endpoint.clone();
|
||||
|
||||
let etcd_client = Client::connect(etcd_endpoints.clone(), None)
|
||||
.await
|
||||
.context("Failed to connect to etcd")?;
|
||||
// Note: we do not attempt connecting here (but validate endpoints sanity).
|
||||
let broker_client =
|
||||
storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context(
|
||||
format!(
|
||||
"Failed to create broker client to {}",
|
||||
&conf.broker_endpoint
|
||||
),
|
||||
)?;
|
||||
|
||||
// FIXME: Should we still allow the pageserver to start, if etcd
|
||||
// doesn't work? It could still serve GetPage requests, with the
|
||||
// data it has locally and from what it can download from remote
|
||||
// storage
|
||||
if ETCD_CLIENT.set(etcd_client).is_err() {
|
||||
panic!("etcd already initialized");
|
||||
if BROKER_CLIENT.set(broker_client).is_err() {
|
||||
panic!("broker already initialized");
|
||||
}
|
||||
|
||||
info!(
|
||||
"Initialized etcd client with endpoints: {}",
|
||||
etcd_endpoints.iter().map(Url::to_string).join(", ")
|
||||
"Initialized broker client with endpoints: {}",
|
||||
broker_endpoint
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Get a handle to the etcd client
|
||||
/// Get a handle to the broker client
|
||||
///
|
||||
pub fn get_etcd_client() -> &'static etcd_broker::Client {
|
||||
ETCD_CLIENT.get().expect("etcd client not initialized")
|
||||
pub fn get_broker_client() -> &'static BrokerClientChannel {
|
||||
BROKER_CLIENT.get().expect("broker client not initialized")
|
||||
}
|
||||
|
||||
pub fn is_etcd_client_initialized() -> bool {
|
||||
ETCD_CLIENT.get().is_some()
|
||||
pub fn is_broker_client_initialized() -> bool {
|
||||
BROKER_CLIENT.get().is_some()
|
||||
}
|
||||
|
||||
/// A handle of an asynchronous task.
|
||||
@@ -134,15 +129,21 @@ impl<E: Clone> TaskHandle<E> {
|
||||
match self.events_receiver.changed().await {
|
||||
Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()),
|
||||
Err(_task_channel_part_dropped) => {
|
||||
TaskEvent::End(match self.join_handle.take() {
|
||||
TaskEvent::End(match self.join_handle.as_mut() {
|
||||
Some(jh) => {
|
||||
if !jh.is_finished() {
|
||||
warn!("sender is dropped while join handle is still alive");
|
||||
}
|
||||
|
||||
jh.await
|
||||
let res = jh
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!("Failed to join task: {e}"))
|
||||
.and_then(|x| x)
|
||||
.and_then(|x| x);
|
||||
|
||||
// For cancellation-safety, drop join_handle only after successful .await.
|
||||
self.join_handle = None;
|
||||
|
||||
res
|
||||
}
|
||||
None => {
|
||||
// Another option is to have an enum, join handle or result and give away the reference to it
|
||||
|
||||
@@ -1,21 +1,15 @@
|
||||
//! WAL receiver logic that ensures the pageserver gets connectected to safekeeper,
|
||||
//! that contains the latest WAL to stream and this connection does not go stale.
|
||||
//!
|
||||
//! To achieve that, a etcd broker is used: safekepers propagate their timelines' state in it,
|
||||
//! To achieve that, a storage broker is used: safekepers propagate their timelines' state in it,
|
||||
//! the manager subscribes for changes and accumulates those to query the one with the biggest Lsn for connection.
|
||||
//! Current connection state is tracked too, to ensure it's not getting stale.
|
||||
//!
|
||||
//! After every connection or etcd update fetched, the state gets updated correspondingly and rechecked for the new conneciton leader,
|
||||
//! After every connection or storage broker update fetched, the state gets updated correspondingly and rechecked for the new conneciton leader,
|
||||
//! then a [re]connection happens, if necessary.
|
||||
//! Only WAL streaming task expects to be finished, other loops (etcd, connection management) never exit unless cancelled explicitly via the dedicated channel.
|
||||
//! Only WAL streaming task expects to be finished, other loops (storage broker, connection management) never exit unless cancelled explicitly via the dedicated channel.
|
||||
|
||||
use std::{
|
||||
collections::{hash_map, HashMap},
|
||||
num::NonZeroU64,
|
||||
ops::ControlFlow,
|
||||
sync::Arc,
|
||||
time::Duration,
|
||||
};
|
||||
use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration};
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::task_mgr::WALRECEIVER_RUNTIME;
|
||||
@@ -23,16 +17,18 @@ use crate::tenant::Timeline;
|
||||
use crate::{task_mgr, walreceiver::TaskStateUpdate};
|
||||
use anyhow::Context;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
use etcd_broker::{
|
||||
subscription_key::SubscriptionKey, subscription_value::SkTimelineInfo, BrokerSubscription,
|
||||
BrokerUpdate, Client,
|
||||
};
|
||||
use pageserver_api::models::TimelineState;
|
||||
use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
|
||||
use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
|
||||
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use storage_broker::Streaming;
|
||||
use tokio::{select, sync::watch};
|
||||
use tracing::*;
|
||||
|
||||
use crate::{
|
||||
exponential_backoff, walreceiver::get_etcd_client, DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
exponential_backoff, walreceiver::get_broker_client, DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
};
|
||||
use postgres_connection::{parse_host_port, PgConnectionConfig};
|
||||
@@ -45,14 +41,13 @@ use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle};
|
||||
|
||||
/// Spawns the loop to take care of the timeline's WAL streaming connection.
|
||||
pub fn spawn_connection_manager_task(
|
||||
broker_loop_prefix: String,
|
||||
timeline: Arc<Timeline>,
|
||||
wal_connect_timeout: Duration,
|
||||
lagging_wal_timeout: Duration,
|
||||
max_lsn_wal_lag: NonZeroU64,
|
||||
auth_token: Option<Arc<String>>,
|
||||
) {
|
||||
let mut etcd_client = get_etcd_client().clone();
|
||||
let mut broker_client = get_broker_client().clone();
|
||||
|
||||
let tenant_id = timeline.tenant_id;
|
||||
let timeline_id = timeline.timeline_id;
|
||||
@@ -65,7 +60,7 @@ pub fn spawn_connection_manager_task(
|
||||
&format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
|
||||
false,
|
||||
async move {
|
||||
info!("WAL receiver broker started, connecting to etcd");
|
||||
info!("WAL receiver manager started, connecting to broker");
|
||||
let mut walreceiver_state = WalreceiverState::new(
|
||||
timeline,
|
||||
wal_connect_timeout,
|
||||
@@ -81,8 +76,7 @@ pub fn spawn_connection_manager_task(
|
||||
return Ok(());
|
||||
},
|
||||
loop_step_result = connection_manager_loop_step(
|
||||
&broker_loop_prefix,
|
||||
&mut etcd_client,
|
||||
&mut broker_client,
|
||||
&mut walreceiver_state,
|
||||
) => match loop_step_result {
|
||||
ControlFlow::Continue(()) => continue,
|
||||
@@ -103,10 +97,9 @@ pub fn spawn_connection_manager_task(
|
||||
|
||||
/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
|
||||
/// Based on the updates, desides whether to start, keep or stop a WAL receiver task.
|
||||
/// If etcd subscription is cancelled, exits.
|
||||
/// If storage broker subscription is cancelled, exits.
|
||||
async fn connection_manager_loop_step(
|
||||
broker_prefix: &str,
|
||||
etcd_client: &mut Client,
|
||||
broker_client: &mut BrokerClientChannel,
|
||||
walreceiver_state: &mut WalreceiverState,
|
||||
) -> ControlFlow<(), ()> {
|
||||
let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates();
|
||||
@@ -124,13 +117,11 @@ async fn connection_manager_loop_step(
|
||||
timeline_id: walreceiver_state.timeline.timeline_id,
|
||||
};
|
||||
|
||||
// XXX: We never explicitly cancel etcd task, instead establishing one and never letting it go,
|
||||
// running the entire loop step as much as possible to an end.
|
||||
// The task removal happens implicitly on drop, both aborting the etcd subscription task and dropping the receiver channel end,
|
||||
// forcing the etcd subscription to exit either way.
|
||||
let mut broker_subscription =
|
||||
subscribe_for_timeline_updates(etcd_client, broker_prefix, id).await;
|
||||
info!("Subscribed for etcd timeline changes, waiting for new etcd data");
|
||||
// Subscribe to the broker updates. Stream shares underlying TCP connection
|
||||
// with other streams on this client (other connection managers). When
|
||||
// object goes out of scope, stream finishes in drop() automatically.
|
||||
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await;
|
||||
info!("Subscribed for broker timeline updates");
|
||||
|
||||
loop {
|
||||
let time_until_next_retry = walreceiver_state.time_until_next_retry();
|
||||
@@ -145,12 +136,6 @@ async fn connection_manager_loop_step(
|
||||
// - this might change the current desired connection
|
||||
// - timeline state changes to something that does not allow walreceiver to run concurrently
|
||||
select! {
|
||||
broker_connection_result = &mut broker_subscription.watcher_handle => {
|
||||
info!("Broker connection was closed from the other side, ending current broker loop step");
|
||||
cleanup_broker_connection(broker_connection_result, walreceiver_state);
|
||||
return ControlFlow::Continue(());
|
||||
},
|
||||
|
||||
Some(wal_connection_update) = async {
|
||||
match walreceiver_state.wal_connection.as_mut() {
|
||||
Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await),
|
||||
@@ -160,21 +145,17 @@ async fn connection_manager_loop_step(
|
||||
let wal_connection = walreceiver_state.wal_connection.as_mut()
|
||||
.expect("Should have a connection, as checked by the corresponding select! guard");
|
||||
match wal_connection_update {
|
||||
TaskEvent::Update(c) => {
|
||||
match c {
|
||||
TaskStateUpdate::Init | TaskStateUpdate::Started => {},
|
||||
TaskStateUpdate::Progress(status) => {
|
||||
if status.has_processed_wal {
|
||||
// We have advanced last_record_lsn by processing the WAL received
|
||||
// from this safekeeper. This is good enough to clean unsuccessful
|
||||
// retries history and allow reconnecting to this safekeeper without
|
||||
// sleeping for a long time.
|
||||
walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id);
|
||||
}
|
||||
wal_connection.status = status.to_owned();
|
||||
}
|
||||
TaskEvent::Update(TaskStateUpdate::Init | TaskStateUpdate::Started) => {},
|
||||
TaskEvent::Update(TaskStateUpdate::Progress(new_status)) => {
|
||||
if new_status.has_processed_wal {
|
||||
// We have advanced last_record_lsn by processing the WAL received
|
||||
// from this safekeeper. This is good enough to clean unsuccessful
|
||||
// retries history and allow reconnecting to this safekeeper without
|
||||
// sleeping for a long time.
|
||||
walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id);
|
||||
}
|
||||
},
|
||||
wal_connection.status = new_status;
|
||||
}
|
||||
TaskEvent::End(walreceiver_task_result) => {
|
||||
match walreceiver_task_result {
|
||||
Ok(()) => debug!("WAL receiving task finished"),
|
||||
@@ -185,22 +166,16 @@ async fn connection_manager_loop_step(
|
||||
}
|
||||
},
|
||||
|
||||
// Got a new update from etcd
|
||||
broker_update = broker_subscription.value_updates.recv() => {
|
||||
// Got a new update from the broker
|
||||
broker_update = broker_subscription.message() => {
|
||||
match broker_update {
|
||||
Some(broker_update) => walreceiver_state.register_timeline_update(broker_update),
|
||||
None => {
|
||||
info!("Broker sender end was dropped, ending current broker loop step");
|
||||
// Ensure to cancel and wait for the broker subscription task end, to log its result.
|
||||
// Broker sender end is in the broker subscription task and its drop means abnormal task completion.
|
||||
// First, ensure that the task is stopped (abort can be done without errors on already stopped tasks and repeated multiple times).
|
||||
broker_subscription.watcher_handle.abort();
|
||||
// Then, wait for the task to finish and print its result. If the task was finished before abort (which we assume in this abnormal case),
|
||||
// a proper error message will be printed, otherwise an abortion message is printed which is ok, since we're signalled to finish anyway.
|
||||
cleanup_broker_connection(
|
||||
(&mut broker_subscription.watcher_handle).await,
|
||||
walreceiver_state,
|
||||
);
|
||||
Ok(Some(broker_update)) => walreceiver_state.register_timeline_update(broker_update),
|
||||
Err(e) => {
|
||||
error!("broker subscription failed: {e}");
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
Ok(None) => {
|
||||
error!("broker subscription stream ended"); // can't happen
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
}
|
||||
@@ -231,18 +206,18 @@ async fn connection_manager_loop_step(
|
||||
}
|
||||
},
|
||||
|
||||
_ = async { tokio::time::sleep(time_until_next_retry.unwrap()).await }, if time_until_next_retry.is_some() => {}
|
||||
}
|
||||
|
||||
// Fetch more etcd timeline updates, but limit ourselves since they may arrive quickly.
|
||||
let mut max_events_to_poll = 100_u32;
|
||||
while max_events_to_poll > 0 {
|
||||
if let Ok(broker_update) = broker_subscription.value_updates.try_recv() {
|
||||
walreceiver_state.register_timeline_update(broker_update);
|
||||
max_events_to_poll -= 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
Some(()) = async {
|
||||
match time_until_next_retry {
|
||||
Some(sleep_time) => {
|
||||
tokio::time::sleep(sleep_time).await;
|
||||
Some(())
|
||||
},
|
||||
None => {
|
||||
debug!("No candidates to retry, waiting indefinitely for the broker events");
|
||||
None
|
||||
}
|
||||
}
|
||||
} => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"),
|
||||
}
|
||||
|
||||
if let Some(new_candidate) = walreceiver_state.next_connection_candidate() {
|
||||
@@ -285,33 +260,11 @@ async fn wait_for_active_timeline(
|
||||
}
|
||||
}
|
||||
|
||||
fn cleanup_broker_connection(
|
||||
broker_connection_result: Result<Result<(), etcd_broker::BrokerError>, tokio::task::JoinError>,
|
||||
walreceiver_state: &mut WalreceiverState,
|
||||
) {
|
||||
match broker_connection_result {
|
||||
Ok(Ok(())) => info!("Broker conneciton task finished, ending current broker loop step"),
|
||||
Ok(Err(broker_error)) => warn!("Broker conneciton ended with error: {broker_error}"),
|
||||
Err(abort_error) => {
|
||||
if abort_error.is_panic() {
|
||||
error!("Broker connection panicked: {abort_error}")
|
||||
} else {
|
||||
debug!("Broker connection aborted: {abort_error}")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
walreceiver_state.wal_stream_candidates.clear();
|
||||
}
|
||||
|
||||
/// Endlessly try to subscribe for broker updates for a given timeline.
|
||||
/// If there are no safekeepers to maintain the lease, the timeline subscription will be unavailable in the broker and the operation will fail constantly.
|
||||
/// This is ok, pageservers should anyway try subscribing (with some backoff) since it's the only way they can get the timeline WAL anyway.
|
||||
async fn subscribe_for_timeline_updates(
|
||||
etcd_client: &mut Client,
|
||||
broker_prefix: &str,
|
||||
broker_client: &mut BrokerClientChannel,
|
||||
id: TenantTimelineId,
|
||||
) -> BrokerSubscription<SkTimelineInfo> {
|
||||
) -> Streaming<SafekeeperTimelineInfo> {
|
||||
let mut attempt = 0;
|
||||
loop {
|
||||
exponential_backoff(
|
||||
@@ -322,18 +275,21 @@ async fn subscribe_for_timeline_updates(
|
||||
.await;
|
||||
attempt += 1;
|
||||
|
||||
match etcd_broker::subscribe_for_json_values(
|
||||
etcd_client,
|
||||
SubscriptionKey::sk_timeline_info(broker_prefix.to_owned(), id),
|
||||
)
|
||||
.instrument(info_span!("etcd_subscription"))
|
||||
.await
|
||||
{
|
||||
Ok(new_subscription) => {
|
||||
return new_subscription;
|
||||
// subscribe to the specific timeline
|
||||
let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId {
|
||||
tenant_id: id.tenant_id.as_ref().to_owned(),
|
||||
timeline_id: id.timeline_id.as_ref().to_owned(),
|
||||
});
|
||||
let request = SubscribeSafekeeperInfoRequest {
|
||||
subscription_key: Some(key),
|
||||
};
|
||||
|
||||
match broker_client.subscribe_safekeeper_info(request).await {
|
||||
Ok(resp) => {
|
||||
return resp.into_inner();
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in etcd: {e:#}");
|
||||
warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -360,8 +316,8 @@ struct WalreceiverState {
|
||||
wal_connection: Option<WalConnection>,
|
||||
/// Info about retries and unsuccessful attempts to connect to safekeepers.
|
||||
wal_connection_retries: HashMap<NodeId, RetryInfo>,
|
||||
/// Data about all timelines, available for connection, fetched from etcd, grouped by their corresponding safekeeper node id.
|
||||
wal_stream_candidates: HashMap<NodeId, EtcdSkTimeline>,
|
||||
/// Data about all timelines, available for connection, fetched from storage broker, grouped by their corresponding safekeeper node id.
|
||||
wal_stream_candidates: HashMap<NodeId, BrokerSkTimeline>,
|
||||
auth_token: Option<Arc<String>>,
|
||||
}
|
||||
|
||||
@@ -395,13 +351,11 @@ struct RetryInfo {
|
||||
retry_duration_seconds: f64,
|
||||
}
|
||||
|
||||
/// Data about the timeline to connect to, received from etcd.
|
||||
/// Data about the timeline to connect to, received from the broker.
|
||||
#[derive(Debug)]
|
||||
struct EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo,
|
||||
/// Etcd generation, the bigger it is, the more up to date the timeline data is.
|
||||
etcd_version: i64,
|
||||
/// Time at which the data was fetched from etcd last time, to track the stale data.
|
||||
struct BrokerSkTimeline {
|
||||
timeline: SafekeeperTimelineInfo,
|
||||
/// Time at which the data was fetched from the broker last time, to track the stale data.
|
||||
latest_update: NaiveDateTime,
|
||||
}
|
||||
|
||||
@@ -453,7 +407,7 @@ impl WalreceiverState {
|
||||
.await
|
||||
.context("walreceiver connection handling failure")
|
||||
}
|
||||
.instrument(info_span!("walreceiver_connection", id = %id))
|
||||
.instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id))
|
||||
});
|
||||
|
||||
let now = Utc::now().naive_utc();
|
||||
@@ -533,36 +487,28 @@ impl WalreceiverState {
|
||||
.values()
|
||||
.filter_map(|retry| retry.next_retry_at)
|
||||
.filter(|next_retry_at| next_retry_at > &now)
|
||||
.min();
|
||||
.min()?;
|
||||
|
||||
next_retry_at.and_then(|next_retry_at| (next_retry_at - now).to_std().ok())
|
||||
(next_retry_at - now).to_std().ok()
|
||||
}
|
||||
|
||||
/// Adds another etcd timeline into the state, if its more recent than the one already added there for the same key.
|
||||
fn register_timeline_update(&mut self, timeline_update: BrokerUpdate<SkTimelineInfo>) {
|
||||
match self
|
||||
.wal_stream_candidates
|
||||
.entry(timeline_update.key.node_id)
|
||||
{
|
||||
hash_map::Entry::Occupied(mut o) => {
|
||||
let existing_value = o.get_mut();
|
||||
if existing_value.etcd_version < timeline_update.etcd_version {
|
||||
existing_value.etcd_version = timeline_update.etcd_version;
|
||||
existing_value.timeline = timeline_update.value;
|
||||
existing_value.latest_update = Utc::now().naive_utc();
|
||||
}
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
v.insert(EtcdSkTimeline {
|
||||
timeline: timeline_update.value,
|
||||
etcd_version: timeline_update.etcd_version,
|
||||
latest_update: Utc::now().naive_utc(),
|
||||
});
|
||||
}
|
||||
/// Adds another broker timeline into the state, if its more recent than the one already added there for the same key.
|
||||
fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) {
|
||||
let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
|
||||
let old_entry = self.wal_stream_candidates.insert(
|
||||
new_safekeeper_id,
|
||||
BrokerSkTimeline {
|
||||
timeline: timeline_update,
|
||||
latest_update: Utc::now().naive_utc(),
|
||||
},
|
||||
);
|
||||
|
||||
if old_entry.is_none() {
|
||||
info!("New SK node was added: {new_safekeeper_id}");
|
||||
}
|
||||
}
|
||||
|
||||
/// Cleans up stale etcd records and checks the rest for the new connection candidate.
|
||||
/// Cleans up stale broker records and checks the rest for the new connection candidate.
|
||||
/// Returns a new candidate, if the current state is absent or somewhat lagging, `None` otherwise.
|
||||
/// The current rules for approving new candidates:
|
||||
/// * pick a candidate different from the connected safekeeper with biggest `commit_lsn` and lowest failed connection attemps
|
||||
@@ -585,7 +531,7 @@ impl WalreceiverState {
|
||||
Some(existing_wal_connection) => {
|
||||
let connected_sk_node = existing_wal_connection.sk_id;
|
||||
|
||||
let (new_sk_id, new_safekeeper_etcd_data, new_wal_source_connconf) =
|
||||
let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) =
|
||||
self.select_connection_candidate(Some(connected_sk_node))?;
|
||||
|
||||
let now = Utc::now().naive_utc();
|
||||
@@ -614,7 +560,7 @@ impl WalreceiverState {
|
||||
}
|
||||
|
||||
if let Some(current_commit_lsn) = existing_wal_connection.status.commit_lsn {
|
||||
let new_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0));
|
||||
let new_commit_lsn = Lsn(new_safekeeper_broker_data.commit_lsn);
|
||||
// Check if the new candidate has much more WAL than the current one.
|
||||
match new_commit_lsn.0.checked_sub(current_commit_lsn.0) {
|
||||
Some(new_sk_lsn_advantage) => {
|
||||
@@ -644,7 +590,7 @@ impl WalreceiverState {
|
||||
.status
|
||||
.commit_lsn
|
||||
.unwrap_or(current_lsn);
|
||||
let candidate_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0));
|
||||
let candidate_commit_lsn = Lsn(new_safekeeper_broker_data.commit_lsn);
|
||||
|
||||
// Keep discovered_new_wal only if connected safekeeper has not caught up yet.
|
||||
let mut discovered_new_wal = existing_wal_connection
|
||||
@@ -727,7 +673,7 @@ impl WalreceiverState {
|
||||
None
|
||||
}
|
||||
|
||||
/// Selects the best possible candidate, based on the data collected from etcd updates about the safekeepers.
|
||||
/// Selects the best possible candidate, based on the data collected from the broker updates about the safekeepers.
|
||||
/// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another.
|
||||
///
|
||||
/// The candidate that is chosen:
|
||||
@@ -736,7 +682,7 @@ impl WalreceiverState {
|
||||
fn select_connection_candidate(
|
||||
&self,
|
||||
node_to_omit: Option<NodeId>,
|
||||
) -> Option<(NodeId, &SkTimelineInfo, PgConnectionConfig)> {
|
||||
) -> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
|
||||
self.applicable_connection_candidates()
|
||||
.filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit)
|
||||
.max_by_key(|(_, info, _)| info.commit_lsn)
|
||||
@@ -746,12 +692,12 @@ impl WalreceiverState {
|
||||
/// Some safekeepers are filtered by the retry cooldown.
|
||||
fn applicable_connection_candidates(
|
||||
&self,
|
||||
) -> impl Iterator<Item = (NodeId, &SkTimelineInfo, PgConnectionConfig)> {
|
||||
) -> impl Iterator<Item = (NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> {
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
self.wal_stream_candidates
|
||||
.iter()
|
||||
.filter(|(_, info)| info.timeline.commit_lsn.is_some())
|
||||
.filter(|(_, info)| Lsn(info.timeline.commit_lsn) != Lsn::INVALID)
|
||||
.filter(move |(sk_id, _)| {
|
||||
let next_retry_at = self
|
||||
.wal_connection_retries
|
||||
@@ -761,12 +707,14 @@ impl WalreceiverState {
|
||||
});
|
||||
|
||||
next_retry_at.is_none() || next_retry_at.unwrap() <= now
|
||||
})
|
||||
.filter_map(|(sk_id, etcd_info)| {
|
||||
let info = &etcd_info.timeline;
|
||||
}).filter_map(|(sk_id, broker_info)| {
|
||||
let info = &broker_info.timeline;
|
||||
if info.safekeeper_connstr.is_empty() {
|
||||
return None; // no connection string, ignore sk
|
||||
}
|
||||
match wal_stream_connection_config(
|
||||
self.id,
|
||||
info.safekeeper_connstr.as_deref()?,
|
||||
info.safekeeper_connstr.as_ref(),
|
||||
match &self.auth_token {
|
||||
None => None,
|
||||
Some(x) => Some(x),
|
||||
@@ -781,15 +729,16 @@ impl WalreceiverState {
|
||||
})
|
||||
}
|
||||
|
||||
/// Remove candidates which haven't sent etcd updates for a while.
|
||||
/// Remove candidates which haven't sent broker updates for a while.
|
||||
fn cleanup_old_candidates(&mut self) {
|
||||
let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len());
|
||||
let lagging_wal_timeout = self.lagging_wal_timeout;
|
||||
|
||||
self.wal_stream_candidates.retain(|node_id, etcd_info| {
|
||||
if let Ok(time_since_latest_etcd_update) =
|
||||
(Utc::now().naive_utc() - etcd_info.latest_update).to_std()
|
||||
self.wal_stream_candidates.retain(|node_id, broker_info| {
|
||||
if let Ok(time_since_latest_broker_update) =
|
||||
(Utc::now().naive_utc() - broker_info.latest_update).to_std()
|
||||
{
|
||||
let should_retain = time_since_latest_etcd_update < self.lagging_wal_timeout;
|
||||
let should_retain = time_since_latest_broker_update < lagging_wal_timeout;
|
||||
if !should_retain {
|
||||
node_ids_to_remove.push(*node_id);
|
||||
}
|
||||
@@ -799,8 +748,11 @@ impl WalreceiverState {
|
||||
}
|
||||
});
|
||||
|
||||
for node_id in node_ids_to_remove {
|
||||
self.wal_connection_retries.remove(&node_id);
|
||||
if !node_ids_to_remove.is_empty() {
|
||||
for node_id in node_ids_to_remove {
|
||||
info!("Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections");
|
||||
self.wal_connection_retries.remove(&node_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -853,7 +805,7 @@ fn wal_stream_connection_config(
|
||||
auth_token: Option<&str>,
|
||||
) -> anyhow::Result<PgConnectionConfig> {
|
||||
let (host, port) =
|
||||
parse_host_port(&listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
|
||||
parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
|
||||
let port = port.unwrap_or(5432);
|
||||
Ok(PgConnectionConfig::new_host_port(host, port)
|
||||
.extend_options([
|
||||
@@ -870,6 +822,28 @@ mod tests {
|
||||
use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
|
||||
use url::Host;
|
||||
|
||||
fn dummy_broker_sk_timeline(
|
||||
commit_lsn: u64,
|
||||
safekeeper_connstr: &str,
|
||||
latest_update: NaiveDateTime,
|
||||
) -> BrokerSkTimeline {
|
||||
BrokerSkTimeline {
|
||||
timeline: SafekeeperTimelineInfo {
|
||||
safekeeper_id: 0,
|
||||
tenant_timeline_id: None,
|
||||
last_log_term: 0,
|
||||
flush_lsn: 0,
|
||||
commit_lsn,
|
||||
backup_lsn: 0,
|
||||
remote_consistent_lsn: 0,
|
||||
peer_horizon_lsn: 0,
|
||||
local_start_lsn: 0,
|
||||
safekeeper_connstr: safekeeper_connstr.to_owned(),
|
||||
},
|
||||
latest_update,
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn no_connection_no_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("no_connection_no_candidate")?;
|
||||
@@ -881,74 +855,16 @@ mod tests {
|
||||
|
||||
state.wal_connection = None;
|
||||
state.wal_stream_candidates = HashMap::from([
|
||||
(
|
||||
NodeId(0),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(Lsn(1)),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
safekeeper_connstr: None,
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
),
|
||||
(
|
||||
NodeId(1),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: None,
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("no_commit_lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
),
|
||||
(
|
||||
NodeId(2),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: None,
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
safekeeper_connstr: Some("no_commit_lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
),
|
||||
(NodeId(0), dummy_broker_sk_timeline(1, "", now)),
|
||||
(NodeId(1), dummy_broker_sk_timeline(0, "no_commit_lsn", now)),
|
||||
(NodeId(2), dummy_broker_sk_timeline(0, "no_commit_lsn", now)),
|
||||
(
|
||||
NodeId(3),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
safekeeper_connstr: None,
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: delay_over_threshold,
|
||||
},
|
||||
dummy_broker_sk_timeline(
|
||||
1 + state.max_lsn_wal_lag.get(),
|
||||
"delay_over_threshold",
|
||||
delay_over_threshold,
|
||||
),
|
||||
),
|
||||
]);
|
||||
|
||||
@@ -983,10 +899,10 @@ mod tests {
|
||||
state.wal_connection = Some(WalConnection {
|
||||
started_at: now,
|
||||
sk_id: connected_sk_id,
|
||||
status: connection_status.clone(),
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
.send(TaskStateUpdate::Progress(connection_status.clone()))
|
||||
.send(TaskStateUpdate::Progress(connection_status))
|
||||
.ok();
|
||||
Ok(())
|
||||
}),
|
||||
@@ -995,57 +911,23 @@ mod tests {
|
||||
state.wal_stream_candidates = HashMap::from([
|
||||
(
|
||||
connected_sk_id,
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(Lsn(current_lsn + state.max_lsn_wal_lag.get() * 2)),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(
|
||||
current_lsn + state.max_lsn_wal_lag.get() * 2,
|
||||
DUMMY_SAFEKEEPER_HOST,
|
||||
now,
|
||||
),
|
||||
),
|
||||
(
|
||||
NodeId(1),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(Lsn(current_lsn)),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("not_advanced_lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(current_lsn, "not_advanced_lsn", now),
|
||||
),
|
||||
(
|
||||
NodeId(2),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(Lsn(current_lsn + state.max_lsn_wal_lag.get() / 2)),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("not_enough_advanced_lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(
|
||||
current_lsn + state.max_lsn_wal_lag.get() / 2,
|
||||
"not_enough_advanced_lsn",
|
||||
now,
|
||||
),
|
||||
),
|
||||
]);
|
||||
|
||||
@@ -1067,21 +949,7 @@ mod tests {
|
||||
state.wal_connection = None;
|
||||
state.wal_stream_candidates = HashMap::from([(
|
||||
NodeId(0),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(1 + state.max_lsn_wal_lag.get(), DUMMY_SAFEKEEPER_HOST, now),
|
||||
)]);
|
||||
|
||||
let only_candidate = state
|
||||
@@ -1102,57 +970,15 @@ mod tests {
|
||||
state.wal_stream_candidates = HashMap::from([
|
||||
(
|
||||
NodeId(0),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(Lsn(selected_lsn - 100)),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("smaller_commit_lsn".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(selected_lsn - 100, "smaller_commit_lsn", now),
|
||||
),
|
||||
(
|
||||
NodeId(1),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(Lsn(selected_lsn)),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(selected_lsn, DUMMY_SAFEKEEPER_HOST, now),
|
||||
),
|
||||
(
|
||||
NodeId(2),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(Lsn(selected_lsn + 100)),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: None,
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(selected_lsn + 100, "", now),
|
||||
),
|
||||
]);
|
||||
let biggest_wal_candidate = state.next_connection_candidate().expect(
|
||||
@@ -1186,39 +1012,11 @@ mod tests {
|
||||
state.wal_stream_candidates = HashMap::from([
|
||||
(
|
||||
NodeId(0),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(bigger_lsn),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(bigger_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
|
||||
),
|
||||
(
|
||||
NodeId(1),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(current_lsn),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
|
||||
),
|
||||
]);
|
||||
state.wal_connection_retries = HashMap::from([(
|
||||
@@ -1263,10 +1061,10 @@ mod tests {
|
||||
state.wal_connection = Some(WalConnection {
|
||||
started_at: now,
|
||||
sk_id: connected_sk_id,
|
||||
status: connection_status.clone(),
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
.send(TaskStateUpdate::Progress(connection_status.clone()))
|
||||
.send(TaskStateUpdate::Progress(connection_status))
|
||||
.ok();
|
||||
Ok(())
|
||||
}),
|
||||
@@ -1275,39 +1073,11 @@ mod tests {
|
||||
state.wal_stream_candidates = HashMap::from([
|
||||
(
|
||||
connected_sk_id,
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(current_lsn),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
|
||||
),
|
||||
(
|
||||
NodeId(1),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(new_lsn),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some("advanced_by_lsn_safekeeper".to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(new_lsn.0, "advanced_by_lsn_safekeeper", now),
|
||||
),
|
||||
]);
|
||||
|
||||
@@ -1356,10 +1126,10 @@ mod tests {
|
||||
state.wal_connection = Some(WalConnection {
|
||||
started_at: now,
|
||||
sk_id: NodeId(1),
|
||||
status: connection_status.clone(),
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
.send(TaskStateUpdate::Progress(connection_status.clone()))
|
||||
.send(TaskStateUpdate::Progress(connection_status))
|
||||
.ok();
|
||||
Ok(())
|
||||
}),
|
||||
@@ -1367,21 +1137,7 @@ mod tests {
|
||||
});
|
||||
state.wal_stream_candidates = HashMap::from([(
|
||||
NodeId(0),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(current_lsn),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
|
||||
)]);
|
||||
|
||||
let over_threshcurrent_candidate = state.next_connection_candidate().expect(
|
||||
@@ -1441,21 +1197,7 @@ mod tests {
|
||||
});
|
||||
state.wal_stream_candidates = HashMap::from([(
|
||||
NodeId(0),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(new_lsn),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
local_start_lsn: None,
|
||||
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_HOST.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
dummy_broker_sk_timeline(new_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
|
||||
)]);
|
||||
|
||||
let over_threshcurrent_candidate = state.next_connection_candidate().expect(
|
||||
|
||||
@@ -20,7 +20,9 @@ use tokio::{pin, select, sync::watch, time};
|
||||
use tokio_postgres::{replication::ReplicationStream, Client};
|
||||
use tracing::{debug, error, info, trace, warn};
|
||||
|
||||
use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate};
|
||||
use crate::{
|
||||
metrics::LIVE_CONNECTIONS_COUNT, tenant::with_ondemand_download, walreceiver::TaskStateUpdate,
|
||||
};
|
||||
use crate::{
|
||||
task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
@@ -35,7 +37,7 @@ use pq_proto::ReplicationFeedback;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Status of the connection.
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct WalConnectionStatus {
|
||||
/// If we were able to initiate a postgres connection, this means that safekeeper process is at least running.
|
||||
pub is_connected: bool,
|
||||
@@ -83,7 +85,7 @@ pub async fn handle_walreceiver_connection(
|
||||
streaming_lsn: None,
|
||||
commit_lsn: None,
|
||||
};
|
||||
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
|
||||
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
|
||||
warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}");
|
||||
return Ok(());
|
||||
}
|
||||
@@ -135,7 +137,7 @@ pub async fn handle_walreceiver_connection(
|
||||
connection_status.latest_connection_update = Utc::now().naive_utc();
|
||||
connection_status.latest_wal_update = Utc::now().naive_utc();
|
||||
connection_status.commit_lsn = Some(end_of_wal);
|
||||
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
|
||||
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
|
||||
warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}");
|
||||
return Ok(());
|
||||
}
|
||||
@@ -173,7 +175,8 @@ pub async fn handle_walreceiver_connection(
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
|
||||
|
||||
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint)?;
|
||||
let mut walingest =
|
||||
with_ondemand_download(|| WalIngest::new(timeline.as_ref(), startpoint)).await?;
|
||||
|
||||
while let Some(replication_message) = {
|
||||
select! {
|
||||
@@ -184,7 +187,20 @@ pub async fn handle_walreceiver_connection(
|
||||
replication_message = physical_stream.next() => replication_message,
|
||||
}
|
||||
} {
|
||||
let replication_message = replication_message?;
|
||||
let replication_message = match replication_message {
|
||||
Ok(message) => message,
|
||||
Err(replication_error) => {
|
||||
if replication_error.is_closed() {
|
||||
info!("Replication stream got closed");
|
||||
return Ok(());
|
||||
} else {
|
||||
return Err(
|
||||
anyhow::Error::new(replication_error).context("replication stream error")
|
||||
);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let now = Utc::now().naive_utc();
|
||||
let last_rec_lsn_before_msg = last_rec_lsn;
|
||||
|
||||
@@ -207,7 +223,7 @@ pub async fn handle_walreceiver_connection(
|
||||
}
|
||||
&_ => {}
|
||||
};
|
||||
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
|
||||
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
|
||||
warn!("Wal connection event listener dropped, aborting the connection: {e}");
|
||||
return Ok(());
|
||||
}
|
||||
@@ -235,9 +251,16 @@ pub async fn handle_walreceiver_connection(
|
||||
// at risk of hitting a deadlock.
|
||||
ensure!(lsn.is_aligned());
|
||||
|
||||
walingest
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded)
|
||||
.context("could not ingest record at {lsn}")?;
|
||||
with_ondemand_download(|| {
|
||||
walingest.ingest_record(
|
||||
recdata.clone(),
|
||||
lsn,
|
||||
&mut modification,
|
||||
&mut decoded,
|
||||
)
|
||||
})
|
||||
.await
|
||||
.with_context(|| format!("could not ingest record at {lsn}"))?;
|
||||
|
||||
fail_point!("walreceiver-after-ingest");
|
||||
|
||||
@@ -273,8 +296,7 @@ pub async fn handle_walreceiver_connection(
|
||||
if !connection_status.has_processed_wal && last_rec_lsn > last_rec_lsn_before_msg {
|
||||
// We have successfully processed at least one WAL record.
|
||||
connection_status.has_processed_wal = true;
|
||||
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone()))
|
||||
{
|
||||
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status)) {
|
||||
warn!("Wal connection event listener dropped, aborting the connection: {e}");
|
||||
return Ok(());
|
||||
}
|
||||
@@ -313,10 +335,11 @@ pub async fn handle_walreceiver_connection(
|
||||
|
||||
// Send the replication feedback message.
|
||||
// Regular standby_status_update fields are put into this message.
|
||||
let (timeline_logical_size, _) = timeline
|
||||
.get_current_logical_size()
|
||||
.context("Status update creation failed to get current logical size")?;
|
||||
let status_update = ReplicationFeedback {
|
||||
current_timeline_size: timeline
|
||||
.get_current_logical_size()
|
||||
.context("Status update creation failed to get current logical size")?,
|
||||
current_timeline_size: timeline_logical_size,
|
||||
ps_writelsn: write_lsn,
|
||||
ps_flushlsn: flush_lsn,
|
||||
ps_applylsn: apply_lsn,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
//!
|
||||
//! Functions for parsing WAL records.
|
||||
//!
|
||||
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, Bytes};
|
||||
use postgres_ffi::pg_constants;
|
||||
|
||||
@@ -84,7 +84,7 @@ pub trait WalRedoManager: Send + Sync {
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
) -> Result<Bytes, WalRedoError>;
|
||||
@@ -147,7 +147,7 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
@@ -156,7 +156,8 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
return Err(WalRedoError::InvalidRequest);
|
||||
}
|
||||
|
||||
let mut img: Option<Bytes> = base_img;
|
||||
let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID);
|
||||
let mut img = base_img.map(|p| p.1);
|
||||
let mut batch_neon = can_apply_in_neon(&records[0].1);
|
||||
let mut batch_start = 0;
|
||||
for i in 1..records.len() {
|
||||
@@ -170,6 +171,7 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
key,
|
||||
lsn,
|
||||
img,
|
||||
base_img_lsn,
|
||||
&records[batch_start..i],
|
||||
self.conf.wal_redo_timeout,
|
||||
pg_version,
|
||||
@@ -189,6 +191,7 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
key,
|
||||
lsn,
|
||||
img,
|
||||
base_img_lsn,
|
||||
&records[batch_start..],
|
||||
self.conf.wal_redo_timeout,
|
||||
pg_version,
|
||||
@@ -223,11 +226,13 @@ impl PostgresRedoManager {
|
||||
///
|
||||
/// Process one request for WAL redo using wal-redo postgres
|
||||
///
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn apply_batch_postgres(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
base_img_lsn: Lsn,
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
pg_version: u32,
|
||||
@@ -282,9 +287,12 @@ impl PostgresRedoManager {
|
||||
// next request will launch a new one.
|
||||
if result.is_err() {
|
||||
error!(
|
||||
"error applying {} WAL records ({} bytes) to reconstruct page image at LSN {}",
|
||||
"error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}",
|
||||
records.len(),
|
||||
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
|
||||
records.last().map(|p| p.0).unwrap_or(Lsn(0)),
|
||||
nbytes,
|
||||
base_img_lsn,
|
||||
lsn
|
||||
);
|
||||
let process = process_guard.take().unwrap();
|
||||
@@ -401,7 +409,7 @@ impl PostgresRedoManager {
|
||||
key
|
||||
);
|
||||
for &xid in xids {
|
||||
let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
|
||||
@@ -451,7 +459,7 @@ impl PostgresRedoManager {
|
||||
key
|
||||
);
|
||||
for &xid in xids {
|
||||
let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
|
||||
@@ -639,7 +647,7 @@ impl PostgresRedoProcess {
|
||||
|
||||
info!("running initdb in {}", datadir.display());
|
||||
let initdb = Command::new(pg_bin_dir_path.join("initdb"))
|
||||
.args(&["-D", &datadir.to_string_lossy()])
|
||||
.args(["-D", &datadir.to_string_lossy()])
|
||||
.arg("-N")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
@@ -922,8 +930,7 @@ impl NoLeakChild {
|
||||
|
||||
match child.wait() {
|
||||
Ok(exit_status) => {
|
||||
// log at error level since .kill() is something we only do on errors ATM
|
||||
error!(exit_status = %exit_status, "wait successful");
|
||||
info!(exit_status = %exit_status, "wait successful");
|
||||
}
|
||||
Err(e) => {
|
||||
error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");
|
||||
|
||||
Reference in New Issue
Block a user