mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-28 10:30:40 +00:00
Merge WITH CONFLICTS 2025-03-11 main commit '7c462b3417ecd3ae3907f3480f3b8a8c99fc6d7b' into yuchen/dire
ct-io-delta-image-layer-write Conflicts: pageserver/src/tenant/blob_io.rs
This commit is contained in:
@@ -33,8 +33,9 @@ use utils::lsn::Lsn;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::tenant::storage_layer::IoConcurrency;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum BasebackupError {
|
||||
@@ -42,6 +43,26 @@ pub enum BasebackupError {
|
||||
Server(#[from] anyhow::Error),
|
||||
#[error("basebackup client error {0:#} when {1}")]
|
||||
Client(#[source] io::Error, &'static str),
|
||||
#[error("basebackup during shutdown")]
|
||||
Shutdown,
|
||||
}
|
||||
|
||||
impl From<PageReconstructError> for BasebackupError {
|
||||
fn from(value: PageReconstructError) -> Self {
|
||||
match value {
|
||||
PageReconstructError::Cancelled => BasebackupError::Shutdown,
|
||||
err => BasebackupError::Server(err.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetVectoredError> for BasebackupError {
|
||||
fn from(value: GetVectoredError) -> Self {
|
||||
match value {
|
||||
GetVectoredError::Cancelled => BasebackupError::Shutdown,
|
||||
err => BasebackupError::Server(err.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Create basebackup with non-rel data in it.
|
||||
@@ -127,7 +148,7 @@ where
|
||||
timeline
|
||||
.gate
|
||||
.enter()
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?,
|
||||
.map_err(|_| BasebackupError::Shutdown)?,
|
||||
),
|
||||
};
|
||||
basebackup
|
||||
@@ -323,8 +344,7 @@ where
|
||||
let slru_partitions = self
|
||||
.timeline
|
||||
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||
.await?
|
||||
.partition(
|
||||
self.timeline.get_shard_identity(),
|
||||
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
|
||||
@@ -336,11 +356,10 @@ where
|
||||
let blocks = self
|
||||
.timeline
|
||||
.get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
.await?;
|
||||
|
||||
for (key, block) in blocks {
|
||||
let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
let block = block?;
|
||||
slru_builder.add_block(&key, block).await?;
|
||||
}
|
||||
}
|
||||
@@ -349,11 +368,8 @@ where
|
||||
|
||||
let mut min_restart_lsn: Lsn = Lsn::MAX;
|
||||
// Create tablespace directories
|
||||
for ((spcnode, dbnode), has_relmap_file) in self
|
||||
.timeline
|
||||
.list_dbdirs(self.lsn, self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||
for ((spcnode, dbnode), has_relmap_file) in
|
||||
self.timeline.list_dbdirs(self.lsn, self.ctx).await?
|
||||
{
|
||||
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
|
||||
|
||||
@@ -362,8 +378,7 @@ where
|
||||
let rels = self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
.await?;
|
||||
for &rel in rels.iter() {
|
||||
// Send init fork as main fork to provide well formed empty
|
||||
// contents of UNLOGGED relations. Postgres copies it in
|
||||
@@ -391,8 +406,7 @@ where
|
||||
let aux_files = self
|
||||
.timeline
|
||||
.list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone())
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
.await?;
|
||||
let aux_scan_time = start_time.elapsed();
|
||||
let aux_estimated_size = aux_files
|
||||
.values()
|
||||
@@ -451,16 +465,14 @@ where
|
||||
for xid in self
|
||||
.timeline
|
||||
.list_twophase_files(self.lsn, self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||
.await?
|
||||
{
|
||||
self.add_twophase_file(xid).await?;
|
||||
}
|
||||
let repl_origins = self
|
||||
.timeline
|
||||
.get_replorigins(self.lsn, self.ctx, self.io_concurrency.clone())
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
.await?;
|
||||
let n_origins = repl_origins.len();
|
||||
if n_origins != 0 {
|
||||
//
|
||||
@@ -505,8 +517,7 @@ where
|
||||
let nblocks = self
|
||||
.timeline
|
||||
.get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
.await?;
|
||||
|
||||
// If the relation is empty, create an empty file
|
||||
if nblocks == 0 {
|
||||
@@ -532,8 +543,7 @@ where
|
||||
// TODO: investigate using get_vectored for the entire startblk..endblk range.
|
||||
// But this code path is not on the critical path for most basebackups (?).
|
||||
.get(rel_block_to_key(src, blknum), self.lsn, self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
.await?;
|
||||
segment_data.extend_from_slice(&img[..]);
|
||||
}
|
||||
|
||||
@@ -567,8 +577,7 @@ where
|
||||
let img = self
|
||||
.timeline
|
||||
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
.await?;
|
||||
|
||||
if img.len()
|
||||
!= dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE)
|
||||
@@ -622,8 +631,7 @@ where
|
||||
&& self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?
|
||||
.await?
|
||||
.is_empty()
|
||||
{
|
||||
return Ok(());
|
||||
@@ -674,8 +682,7 @@ where
|
||||
let img = self
|
||||
.timeline
|
||||
.get_twophase_file(xid, self.lsn, self.ctx)
|
||||
.await
|
||||
.map_err(|e| BasebackupError::Server(e.into()))?;
|
||||
.await?;
|
||||
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(&img[..]);
|
||||
|
||||
@@ -14,6 +14,7 @@ use camino::Utf8Path;
|
||||
use clap::{Arg, ArgAction, Command};
|
||||
use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric};
|
||||
use metrics::set_build_info_metric;
|
||||
use nix::sys::socket::{setsockopt, sockopt};
|
||||
use pageserver::config::{PageServerConf, PageserverIdentity};
|
||||
use pageserver::controller_upcall_client::ControllerUpcallClient;
|
||||
use pageserver::deletion_queue::DeletionQueue;
|
||||
@@ -24,11 +25,12 @@ use pageserver::task_mgr::{
|
||||
};
|
||||
use pageserver::tenant::{TenantSharedResources, mgr, secondary};
|
||||
use pageserver::{
|
||||
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, http, page_cache, page_service,
|
||||
task_mgr, virtual_file,
|
||||
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http,
|
||||
page_cache, page_service, task_mgr, virtual_file,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use rustls_pki_types::{CertificateDer, PrivateKeyDer};
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -342,11 +344,25 @@ fn start_pageserver(
|
||||
info!("Starting pageserver http handler on {http_addr}");
|
||||
let http_listener = tcp_listener::bind(http_addr)?;
|
||||
|
||||
let pg_addr = &conf.listen_pg_addr;
|
||||
let https_listener = match conf.listen_https_addr.as_ref() {
|
||||
Some(https_addr) => {
|
||||
info!("Starting pageserver https handler on {https_addr}");
|
||||
Some(tcp_listener::bind(https_addr)?)
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
let pg_addr = &conf.listen_pg_addr;
|
||||
info!("Starting pageserver pg protocol handler on {pg_addr}");
|
||||
let pageserver_listener = tcp_listener::bind(pg_addr)?;
|
||||
|
||||
// Enable SO_KEEPALIVE on the socket, to detect dead connections faster.
|
||||
// These are configured via net.ipv4.tcp_keepalive_* sysctls.
|
||||
//
|
||||
// TODO: also set this on the walreceiver socket, but tokio-postgres doesn't
|
||||
// support enabling keepalives while using the default OS sysctls.
|
||||
setsockopt(&pageserver_listener, sockopt::KeepAlive, &true)?;
|
||||
|
||||
// Launch broker client
|
||||
// The storage_broker::connect call needs to happen inside a tokio runtime thread.
|
||||
let broker_client = WALRECEIVER_RUNTIME
|
||||
@@ -567,9 +583,8 @@ fn start_pageserver(
|
||||
|
||||
// Start up the service to handle HTTP mgmt API request. We created the
|
||||
// listener earlier already.
|
||||
let http_endpoint_listener = {
|
||||
let (http_endpoint_listener, https_endpoint_listener) = {
|
||||
let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); // for hyper
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let router_state = Arc::new(
|
||||
http::routes::State::new(
|
||||
@@ -584,22 +599,51 @@ fn start_pageserver(
|
||||
)
|
||||
.context("Failed to initialize router state")?,
|
||||
);
|
||||
|
||||
let router = http::make_router(router_state, launch_ts, http_auth.clone())?
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let service = http_utils::RouterService::new(router).unwrap();
|
||||
let server = hyper0::Server::from_tcp(http_listener)?
|
||||
.serve(service)
|
||||
.with_graceful_shutdown({
|
||||
let cancel = cancel.clone();
|
||||
async move { cancel.clone().cancelled().await }
|
||||
});
|
||||
|
||||
let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
"http endpoint listener",
|
||||
server,
|
||||
));
|
||||
HttpEndpointListener(CancellableTask { task, cancel })
|
||||
let service =
|
||||
Arc::new(http_utils::RequestServiceBuilder::new(router).map_err(|err| anyhow!(err))?);
|
||||
|
||||
let http_task = {
|
||||
let server =
|
||||
http_utils::server::Server::new(Arc::clone(&service), http_listener, None)?;
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
"http endpoint listener",
|
||||
server.serve(cancel.clone()),
|
||||
));
|
||||
HttpEndpointListener(CancellableTask { task, cancel })
|
||||
};
|
||||
|
||||
let https_task = match https_listener {
|
||||
Some(https_listener) => {
|
||||
let certs = load_certs(&conf.ssl_cert_file)?;
|
||||
let key = load_private_key(&conf.ssl_key_file)?;
|
||||
|
||||
let server_config = rustls::ServerConfig::builder()
|
||||
.with_no_client_auth()
|
||||
.with_single_cert(certs, key)?;
|
||||
|
||||
let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config));
|
||||
|
||||
let server =
|
||||
http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?;
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
"https endpoint listener",
|
||||
server.serve(cancel.clone()),
|
||||
));
|
||||
Some(HttpsEndpointListener(CancellableTask { task, cancel }))
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
(http_task, https_task)
|
||||
};
|
||||
|
||||
let consumption_metrics_tasks = {
|
||||
@@ -675,6 +719,7 @@ fn start_pageserver(
|
||||
shutdown_pageserver.cancel();
|
||||
pageserver::shutdown_pageserver(
|
||||
http_endpoint_listener,
|
||||
https_endpoint_listener,
|
||||
page_service,
|
||||
consumption_metrics_tasks,
|
||||
disk_usage_eviction_task,
|
||||
@@ -689,6 +734,25 @@ fn start_pageserver(
|
||||
})
|
||||
}
|
||||
|
||||
fn load_certs(filename: &Utf8Path) -> std::io::Result<Vec<CertificateDer<'static>>> {
|
||||
let file = std::fs::File::open(filename)?;
|
||||
let mut reader = std::io::BufReader::new(file);
|
||||
|
||||
rustls_pemfile::certs(&mut reader).collect()
|
||||
}
|
||||
|
||||
fn load_private_key(filename: &Utf8Path) -> anyhow::Result<PrivateKeyDer<'static>> {
|
||||
let file = std::fs::File::open(filename)?;
|
||||
let mut reader = std::io::BufReader::new(file);
|
||||
|
||||
let key = rustls_pemfile::private_key(&mut reader)?;
|
||||
|
||||
key.ok_or(anyhow::anyhow!(
|
||||
"no private key found in {}",
|
||||
filename.as_str(),
|
||||
))
|
||||
}
|
||||
|
||||
async fn create_remote_storage_client(
|
||||
conf: &'static PageServerConf,
|
||||
) -> anyhow::Result<GenericRemoteStorage> {
|
||||
|
||||
@@ -53,6 +53,11 @@ pub struct PageServerConf {
|
||||
pub listen_pg_addr: String,
|
||||
/// Example (default): 127.0.0.1:9898
|
||||
pub listen_http_addr: String,
|
||||
/// Example: 127.0.0.1:9899
|
||||
pub listen_https_addr: Option<String>,
|
||||
|
||||
pub ssl_key_file: Utf8PathBuf,
|
||||
pub ssl_cert_file: Utf8PathBuf,
|
||||
|
||||
/// Current availability zone. Used for traffic metrics.
|
||||
pub availability_zone: Option<String>,
|
||||
@@ -194,6 +199,13 @@ pub struct PageServerConf {
|
||||
/// Interpreted protocol feature: if enabled, validate that the logical WAL received from
|
||||
/// safekeepers does not have gaps.
|
||||
pub validate_wal_contiguity: bool,
|
||||
|
||||
/// When set, the previously written to disk heatmap is loaded on tenant attach and used
|
||||
/// to avoid clobbering the heatmap from new, cold, attached locations.
|
||||
pub load_previous_heatmap: bool,
|
||||
|
||||
/// When set, include visible layers in the next uploaded heatmaps of an unarchived timeline.
|
||||
pub generate_unarchival_heatmap: bool,
|
||||
}
|
||||
|
||||
/// Token for authentication to safekeepers
|
||||
@@ -310,6 +322,9 @@ impl PageServerConf {
|
||||
let pageserver_api::config::ConfigToml {
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
listen_https_addr,
|
||||
ssl_key_file,
|
||||
ssl_cert_file,
|
||||
availability_zone,
|
||||
wait_lsn_timeout,
|
||||
wal_redo_timeout,
|
||||
@@ -358,6 +373,8 @@ impl PageServerConf {
|
||||
get_vectored_concurrent_io,
|
||||
enable_read_path_debugging,
|
||||
validate_wal_contiguity,
|
||||
load_previous_heatmap,
|
||||
generate_unarchival_heatmap,
|
||||
} = config_toml;
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
@@ -366,6 +383,9 @@ impl PageServerConf {
|
||||
// ------------------------------------------------------------
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
listen_https_addr,
|
||||
ssl_key_file,
|
||||
ssl_cert_file,
|
||||
availability_zone,
|
||||
wait_lsn_timeout,
|
||||
wal_redo_timeout,
|
||||
@@ -447,6 +467,8 @@ impl PageServerConf {
|
||||
no_sync: no_sync.unwrap_or(false),
|
||||
enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false),
|
||||
validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false),
|
||||
load_previous_heatmap: load_previous_heatmap.unwrap_or(true),
|
||||
generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(true),
|
||||
};
|
||||
|
||||
// ------------------------------------------------------------
|
||||
@@ -480,7 +502,9 @@ impl PageServerConf {
|
||||
#[cfg(test)]
|
||||
pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
|
||||
let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());
|
||||
Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}"))
|
||||
|
||||
let test_id = uuid::Uuid::new_v4();
|
||||
Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}_{test_id}"))
|
||||
}
|
||||
|
||||
pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
|
||||
@@ -493,6 +517,8 @@ impl PageServerConf {
|
||||
metric_collection_interval: Duration::from_secs(60),
|
||||
synthetic_size_calculation_interval: Duration::from_secs(60),
|
||||
background_task_maximum_delay: Duration::ZERO,
|
||||
load_previous_heatmap: Some(true),
|
||||
generate_unarchival_heatmap: Some(true),
|
||||
..Default::default()
|
||||
};
|
||||
PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()
|
||||
|
||||
@@ -89,16 +89,112 @@
|
||||
//! [`RequestContext`] argument. Functions in the middle of the call chain
|
||||
//! only need to pass it on.
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
use std::sync::Arc;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use tracing::warn;
|
||||
use utils::{id::TimelineId, shard::TenantShardId};
|
||||
|
||||
use crate::{
|
||||
metrics::{StorageIoSizeMetrics, TimelineMetrics},
|
||||
task_mgr::TaskKind,
|
||||
tenant::Timeline,
|
||||
};
|
||||
|
||||
// The main structure of this module, see module-level comment.
|
||||
#[derive(Debug)]
|
||||
pub struct RequestContext {
|
||||
task_kind: TaskKind,
|
||||
download_behavior: DownloadBehavior,
|
||||
access_stats_behavior: AccessStatsBehavior,
|
||||
page_content_kind: PageContentKind,
|
||||
read_path_debug: bool,
|
||||
scope: Scope,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) enum Scope {
|
||||
Global {
|
||||
io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
|
||||
},
|
||||
SecondaryTenant {
|
||||
io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
|
||||
},
|
||||
SecondaryTimeline {
|
||||
io_size_metrics: crate::metrics::StorageIoSizeMetrics,
|
||||
},
|
||||
Timeline {
|
||||
// We wrap the `Arc<TimelineMetrics>`s inside another Arc to avoid child
|
||||
// context creation contending for the ref counters of the Arc<TimelineMetrics>,
|
||||
// which are shared among all tasks that operate on the timeline, especially
|
||||
// concurrent page_service connections.
|
||||
#[allow(clippy::redundant_allocation)]
|
||||
arc_arc: Arc<Arc<TimelineMetrics>>,
|
||||
},
|
||||
#[cfg(test)]
|
||||
UnitTest {
|
||||
io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
|
||||
},
|
||||
}
|
||||
|
||||
static GLOBAL_IO_SIZE_METRICS: Lazy<crate::metrics::StorageIoSizeMetrics> =
|
||||
Lazy::new(|| crate::metrics::StorageIoSizeMetrics::new("*", "*", "*"));
|
||||
|
||||
impl Scope {
|
||||
pub(crate) fn new_global() -> Self {
|
||||
Scope::Global {
|
||||
io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
|
||||
}
|
||||
}
|
||||
/// NB: this allocates, so, use only at relatively long-lived roots, e.g., at start
|
||||
/// of a compaction iteration.
|
||||
pub(crate) fn new_timeline(timeline: &Timeline) -> Self {
|
||||
Scope::Timeline {
|
||||
arc_arc: Arc::new(Arc::clone(&timeline.metrics)),
|
||||
}
|
||||
}
|
||||
pub(crate) fn new_page_service_pagestream(
|
||||
timeline_handle: &crate::tenant::timeline::handle::Handle<
|
||||
crate::page_service::TenantManagerTypes,
|
||||
>,
|
||||
) -> Self {
|
||||
Scope::Timeline {
|
||||
arc_arc: Arc::clone(&timeline_handle.metrics),
|
||||
}
|
||||
}
|
||||
pub(crate) fn new_secondary_timeline(
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
) -> Self {
|
||||
// TODO(https://github.com/neondatabase/neon/issues/11156): secondary timelines have no infrastructure for metrics lifecycle.
|
||||
|
||||
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||
let shard_id = tenant_shard_id.shard_slug().to_string();
|
||||
let timeline_id = timeline_id.to_string();
|
||||
|
||||
let io_size_metrics =
|
||||
crate::metrics::StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id);
|
||||
Scope::SecondaryTimeline { io_size_metrics }
|
||||
}
|
||||
pub(crate) fn new_secondary_tenant(_tenant_shard_id: &TenantShardId) -> Self {
|
||||
// Before propagating metrics via RequestContext, the labels were inferred from file path.
|
||||
// The only user of VirtualFile at tenant scope is the heatmap download & read.
|
||||
// The inferred labels for the path of the heatmap file on local disk were that of the global metric (*,*,*).
|
||||
// Thus, we do the same here, and extend that for anything secondary-tenant scoped.
|
||||
//
|
||||
// If we want to have (tenant_id, shard_id, '*') labels for secondary tenants in the future,
|
||||
// we will need to think about the metric lifecycle, i.e., remove them during secondary tenant shutdown,
|
||||
// like we do for attached timelines. (We don't have attached-tenant-scoped usage of VirtualFile
|
||||
// at this point, so, we were able to completely side-step tenant-scoped stuff there).
|
||||
Scope::SecondaryTenant {
|
||||
io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
|
||||
}
|
||||
}
|
||||
#[cfg(test)]
|
||||
pub(crate) fn new_unit_test() -> Self {
|
||||
Scope::UnitTest {
|
||||
io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The kind of access to the page cache.
|
||||
@@ -157,6 +253,7 @@ impl RequestContextBuilder {
|
||||
access_stats_behavior: AccessStatsBehavior::Update,
|
||||
page_content_kind: PageContentKind::Unknown,
|
||||
read_path_debug: false,
|
||||
scope: Scope::new_global(),
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -171,10 +268,16 @@ impl RequestContextBuilder {
|
||||
access_stats_behavior: original.access_stats_behavior,
|
||||
page_content_kind: original.page_content_kind,
|
||||
read_path_debug: original.read_path_debug,
|
||||
scope: original.scope.clone(),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub fn task_kind(mut self, k: TaskKind) -> Self {
|
||||
self.inner.task_kind = k;
|
||||
self
|
||||
}
|
||||
|
||||
/// Configure the DownloadBehavior of the context: whether to
|
||||
/// download missing layers, and/or warn on the download.
|
||||
pub fn download_behavior(mut self, b: DownloadBehavior) -> Self {
|
||||
@@ -199,6 +302,11 @@ impl RequestContextBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
pub(crate) fn scope(mut self, s: Scope) -> Self {
|
||||
self.inner.scope = s;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(self) -> RequestContext {
|
||||
self.inner
|
||||
}
|
||||
@@ -281,7 +389,50 @@ impl RequestContext {
|
||||
}
|
||||
|
||||
fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
|
||||
Self::new(task_kind, download_behavior)
|
||||
RequestContextBuilder::extend(self)
|
||||
.task_kind(task_kind)
|
||||
.download_behavior(download_behavior)
|
||||
.build()
|
||||
}
|
||||
|
||||
pub fn with_scope_timeline(&self, timeline: &Arc<Timeline>) -> Self {
|
||||
RequestContextBuilder::extend(self)
|
||||
.scope(Scope::new_timeline(timeline))
|
||||
.build()
|
||||
}
|
||||
|
||||
pub(crate) fn with_scope_page_service_pagestream(
|
||||
&self,
|
||||
timeline_handle: &crate::tenant::timeline::handle::Handle<
|
||||
crate::page_service::TenantManagerTypes,
|
||||
>,
|
||||
) -> Self {
|
||||
RequestContextBuilder::extend(self)
|
||||
.scope(Scope::new_page_service_pagestream(timeline_handle))
|
||||
.build()
|
||||
}
|
||||
|
||||
pub fn with_scope_secondary_timeline(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
) -> Self {
|
||||
RequestContextBuilder::extend(self)
|
||||
.scope(Scope::new_secondary_timeline(tenant_shard_id, timeline_id))
|
||||
.build()
|
||||
}
|
||||
|
||||
pub fn with_scope_secondary_tenant(&self, tenant_shard_id: &TenantShardId) -> Self {
|
||||
RequestContextBuilder::extend(self)
|
||||
.scope(Scope::new_secondary_tenant(tenant_shard_id))
|
||||
.build()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn with_scope_unit_test(&self) -> Self {
|
||||
RequestContextBuilder::new(TaskKind::UnitTest)
|
||||
.scope(Scope::new_unit_test())
|
||||
.build()
|
||||
}
|
||||
|
||||
pub fn task_kind(&self) -> TaskKind {
|
||||
@@ -303,4 +454,38 @@ impl RequestContext {
|
||||
pub(crate) fn read_path_debug(&self) -> bool {
|
||||
self.read_path_debug
|
||||
}
|
||||
|
||||
pub(crate) fn io_size_metrics(&self) -> &StorageIoSizeMetrics {
|
||||
match &self.scope {
|
||||
Scope::Global { io_size_metrics } => {
|
||||
let is_unit_test = cfg!(test);
|
||||
let is_regress_test_build = cfg!(feature = "testing");
|
||||
if is_unit_test || is_regress_test_build {
|
||||
panic!("all VirtualFile instances are timeline-scoped");
|
||||
} else {
|
||||
use once_cell::sync::Lazy;
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LIMIT: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1))));
|
||||
let mut guard = LIMIT.lock().unwrap();
|
||||
guard.call2(|rate_limit_stats| {
|
||||
warn!(
|
||||
%rate_limit_stats,
|
||||
backtrace=%std::backtrace::Backtrace::force_capture(),
|
||||
"all VirtualFile instances are timeline-scoped",
|
||||
);
|
||||
});
|
||||
|
||||
io_size_metrics
|
||||
}
|
||||
}
|
||||
Scope::Timeline { arc_arc } => &arc_arc.storage_io_size,
|
||||
Scope::SecondaryTimeline { io_size_metrics } => io_size_metrics,
|
||||
Scope::SecondaryTenant { io_size_metrics } => io_size_metrics,
|
||||
#[cfg(test)]
|
||||
Scope::UnitTest { io_size_metrics } => io_size_metrics,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -84,6 +84,7 @@ impl ControllerUpcallClient {
|
||||
})
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn retry_http_forever<R, T>(
|
||||
&self,
|
||||
url: &url::Url,
|
||||
@@ -108,7 +109,7 @@ impl ControllerUpcallClient {
|
||||
|_| false,
|
||||
3,
|
||||
u32::MAX,
|
||||
"calling control plane generation validation API",
|
||||
"storage controller upcall",
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
@@ -125,11 +126,12 @@ impl ControllerUpcallClient {
|
||||
|
||||
impl ControlPlaneGenerationsApi for ControllerUpcallClient {
|
||||
/// Block until we get a successful response, or error out if we are shut down
|
||||
#[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context
|
||||
async fn re_attach(
|
||||
&self,
|
||||
conf: &PageServerConf,
|
||||
) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
|
||||
let re_attach_path = self
|
||||
let url = self
|
||||
.base_url
|
||||
.join("re-attach")
|
||||
.expect("Failed to build re-attach path");
|
||||
@@ -179,7 +181,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
|
||||
listen_pg_port: m.postgres_port,
|
||||
listen_http_addr: m.http_host,
|
||||
listen_http_port: m.http_port,
|
||||
listen_https_port: None, // TODO: Support https.
|
||||
listen_https_port: m.https_port,
|
||||
availability_zone_id: az_id.expect("Checked above"),
|
||||
})
|
||||
}
|
||||
@@ -205,7 +207,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
|
||||
register: register.clone(),
|
||||
};
|
||||
|
||||
let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
|
||||
let response: ReAttachResponse = self.retry_http_forever(&url, request).await?;
|
||||
tracing::info!(
|
||||
"Received re-attach response with {} tenants (node {}, register: {:?})",
|
||||
response.tenants.len(),
|
||||
@@ -223,11 +225,12 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
|
||||
}
|
||||
|
||||
/// Block until we get a successful response, or error out if we are shut down
|
||||
#[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context
|
||||
async fn validate(
|
||||
&self,
|
||||
tenants: Vec<(TenantShardId, Generation)>,
|
||||
) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
|
||||
let re_attach_path = self
|
||||
let url = self
|
||||
.base_url
|
||||
.join("validate")
|
||||
.expect("Failed to build validate path");
|
||||
@@ -257,8 +260,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
|
||||
return Err(RetryForeverError::ShuttingDown);
|
||||
}
|
||||
|
||||
let response: ValidateResponse =
|
||||
self.retry_http_forever(&re_attach_path, request).await?;
|
||||
let response: ValidateResponse = self.retry_http_forever(&url, request).await?;
|
||||
for rt in response.tenants {
|
||||
result.insert(rt.id, rt.valid);
|
||||
}
|
||||
|
||||
@@ -842,6 +842,12 @@ paths:
|
||||
required: false
|
||||
schema:
|
||||
type: integer
|
||||
- name: recurse
|
||||
description: When set, will recurse with the downloads into ancestor timelines
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: boolean
|
||||
post:
|
||||
description: |
|
||||
Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter
|
||||
|
||||
@@ -37,7 +37,8 @@ use pageserver_api::models::{
|
||||
TenantShardSplitResponse, TenantSorting, TenantState, TenantWaitLsnRequest,
|
||||
TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateRequestMode,
|
||||
TimelineCreateRequestModeImportPgdata, TimelineGcRequest, TimelineInfo,
|
||||
TimelinesInfoAndOffloaded, TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse,
|
||||
TimelinePatchIndexPartRequest, TimelinesInfoAndOffloaded, TopTenantShardItem,
|
||||
TopTenantShardsRequest, TopTenantShardsResponse,
|
||||
};
|
||||
use pageserver_api::shard::{ShardCount, TenantShardId};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
|
||||
@@ -54,6 +55,7 @@ use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context;
|
||||
use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
@@ -63,6 +65,7 @@ use crate::tenant::mgr::{
|
||||
GetActiveTenantError, GetTenantError, TenantManager, TenantMapError, TenantMapInsertError,
|
||||
TenantSlot, TenantSlotError, TenantSlotUpsertError, TenantStateError, UpsertLocationError,
|
||||
};
|
||||
use crate::tenant::remote_timeline_client::index::GcCompactionState;
|
||||
use crate::tenant::remote_timeline_client::{
|
||||
download_index_part, list_remote_tenant_shards, list_remote_timelines,
|
||||
};
|
||||
@@ -481,6 +484,7 @@ async fn build_timeline_info_common(
|
||||
|
||||
state,
|
||||
is_archived: Some(is_archived),
|
||||
rel_size_migration: Some(timeline.get_rel_size_v2_status()),
|
||||
|
||||
walreceiver_status,
|
||||
};
|
||||
@@ -857,6 +861,75 @@ async fn timeline_archival_config_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// This API is used to patch the index part of a timeline. You must ensure such patches are safe to apply. Use this API as an emergency
|
||||
/// measure only.
|
||||
///
|
||||
/// Some examples of safe patches:
|
||||
/// - Increase the gc_cutoff and gc_compaction_cutoff to a larger value in case of a bug that didn't bump the cutoff and cause read errors.
|
||||
/// - Force set the index part to use reldir v2 (migrating/migrated).
|
||||
///
|
||||
/// Some examples of unsafe patches:
|
||||
/// - Force set the index part from v2 to v1 (legacy). This will cause the code path to ignore anything written to the new keyspace and cause
|
||||
/// errors.
|
||||
/// - Decrease the gc_cutoff without validating the data really exists. It will cause read errors in the background.
|
||||
async fn timeline_patch_index_part_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
|
||||
let request_data: TimelinePatchIndexPartRequest = json_request(&mut request).await?;
|
||||
check_permission(&request, None)?; // require global permission for this request
|
||||
let state = get_state(&request);
|
||||
|
||||
async {
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
if let Some(rel_size_migration) = request_data.rel_size_migration {
|
||||
timeline
|
||||
.update_rel_size_v2_status(rel_size_migration)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
|
||||
if let Some(gc_compaction_last_completed_lsn) =
|
||||
request_data.gc_compaction_last_completed_lsn
|
||||
{
|
||||
timeline
|
||||
.update_gc_compaction_state(GcCompactionState {
|
||||
last_completed_lsn: gc_compaction_last_completed_lsn,
|
||||
})
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
|
||||
if let Some(applied_gc_cutoff_lsn) = request_data.applied_gc_cutoff_lsn {
|
||||
{
|
||||
let guard = timeline.applied_gc_cutoff_lsn.lock_for_write();
|
||||
guard.store_and_unlock(applied_gc_cutoff_lsn);
|
||||
}
|
||||
}
|
||||
|
||||
if request_data.force_index_update {
|
||||
timeline
|
||||
.remote_client
|
||||
.force_schedule_index_upload()
|
||||
.context("force schedule index upload")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
|
||||
Ok::<_, ApiError>(())
|
||||
}
|
||||
.instrument(info_span!("timeline_patch_index_part",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard_id = %tenant_shard_id.shard_slug(),
|
||||
%timeline_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn timeline_detail_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -881,12 +954,13 @@ async fn timeline_detail_handler(
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, false)?;
|
||||
let ctx = &ctx.with_scope_timeline(&timeline);
|
||||
|
||||
let timeline_info = build_timeline_info(
|
||||
&timeline,
|
||||
include_non_incremental_logical_size.unwrap_or(false),
|
||||
force_await_initial_logical_size.unwrap_or(false),
|
||||
&ctx,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.context("get local timeline info")
|
||||
@@ -927,11 +1001,11 @@ async fn get_lsn_by_timestamp_handler(
|
||||
|
||||
let with_lease = parse_query_param(&request, "with_lease")?.unwrap_or(false);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
|
||||
.with_scope_timeline(&timeline);
|
||||
let result = timeline
|
||||
.find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
|
||||
.await?;
|
||||
@@ -1000,10 +1074,11 @@ async fn get_timestamp_of_lsn_handler(
|
||||
.with_context(|| format!("Invalid LSN: {lsn_str:?}"))
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
|
||||
.with_scope_timeline(&timeline);
|
||||
let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
|
||||
|
||||
match result {
|
||||
@@ -1358,7 +1433,8 @@ async fn timeline_layer_scan_disposable_keys(
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
|
||||
.with_scope_timeline(&timeline);
|
||||
|
||||
let guard = timeline.layers.read().await;
|
||||
let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else {
|
||||
@@ -1368,7 +1444,7 @@ async fn timeline_layer_scan_disposable_keys(
|
||||
};
|
||||
|
||||
let resident_layer = layer
|
||||
.download_and_keep_resident()
|
||||
.download_and_keep_resident(&ctx)
|
||||
.await
|
||||
.map_err(|err| match err {
|
||||
tenant::storage_layer::layer::DownloadError::TimelineShutdown
|
||||
@@ -1436,6 +1512,7 @@ async fn timeline_download_heatmap_layers_handler(
|
||||
|
||||
let desired_concurrency =
|
||||
parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY);
|
||||
let recurse = parse_query_param(&request, "recurse")?.unwrap_or(false);
|
||||
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
@@ -1443,6 +1520,8 @@ async fn timeline_download_heatmap_layers_handler(
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
|
||||
.with_scope_timeline(&timeline);
|
||||
|
||||
let max_concurrency = get_config(&request)
|
||||
.remote_storage_config
|
||||
@@ -1451,7 +1530,7 @@ async fn timeline_download_heatmap_layers_handler(
|
||||
.unwrap_or(DEFAULT_MAX_CONCURRENCY);
|
||||
let concurrency = std::cmp::min(max_concurrency, desired_concurrency);
|
||||
|
||||
timeline.start_heatmap_layers_download(concurrency).await?;
|
||||
timeline.start_heatmap_layers_download(concurrency, recurse, &ctx)?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
@@ -1490,8 +1569,10 @@ async fn layer_download_handler(
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
|
||||
.with_scope_timeline(&timeline);
|
||||
let downloaded = timeline
|
||||
.download_layer(&layer_name)
|
||||
.download_layer(&layer_name, &ctx)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
tenant::storage_layer::layer::DownloadError::TimelineShutdown
|
||||
@@ -2225,8 +2306,8 @@ async fn timeline_compact_handler(
|
||||
.unwrap_or(false);
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline);
|
||||
if scheduled {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
@@ -2333,8 +2414,8 @@ async fn timeline_checkpoint_handler(
|
||||
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline);
|
||||
if wait_until_flushed {
|
||||
timeline.freeze_and_flush().await
|
||||
} else {
|
||||
@@ -2389,7 +2470,9 @@ async fn timeline_download_remote_layers_handler_post(
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
match timeline.spawn_download_all_remote_layers(body).await {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
|
||||
.with_scope_timeline(&timeline);
|
||||
match timeline.spawn_download_all_remote_layers(body, &ctx).await {
|
||||
Ok(st) => json_response(StatusCode::ACCEPTED, st),
|
||||
Err(st) => json_response(StatusCode::CONFLICT, st),
|
||||
}
|
||||
@@ -2471,6 +2554,7 @@ async fn timeline_detach_ancestor_handler(
|
||||
tracing::info!("all timeline upload queues are drained");
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
let ctx = &ctx.with_scope_timeline(&timeline);
|
||||
|
||||
let progress = timeline
|
||||
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
|
||||
@@ -2577,8 +2661,9 @@ async fn getpage_at_lsn_handler_inner(
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
// Enable read path debugging
|
||||
let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true).build();
|
||||
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
|
||||
let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true)
|
||||
.scope(context::Scope::new_timeline(&timeline)).build();
|
||||
|
||||
// Use last_record_lsn if no lsn is provided
|
||||
let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
|
||||
@@ -2612,8 +2697,8 @@ async fn timeline_collect_keyspace(
|
||||
let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline);
|
||||
let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
|
||||
let (dense_ks, sparse_ks) = timeline
|
||||
.collect_keyspace(at_lsn, &ctx)
|
||||
@@ -3250,7 +3335,7 @@ async fn put_tenant_timeline_import_basebackup(
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let timeline = tenant
|
||||
let (timeline, timeline_ctx) = tenant
|
||||
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
|
||||
.map_err(ApiError::InternalServerError)
|
||||
.await?;
|
||||
@@ -3269,7 +3354,13 @@ async fn put_tenant_timeline_import_basebackup(
|
||||
info!("importing basebackup");
|
||||
|
||||
timeline
|
||||
.import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx)
|
||||
.import_basebackup_from_tar(
|
||||
tenant.clone(),
|
||||
&mut body,
|
||||
base_lsn,
|
||||
broker_client,
|
||||
&timeline_ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -3309,6 +3400,7 @@ async fn put_tenant_timeline_import_wal(
|
||||
let state = get_state(&request);
|
||||
|
||||
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?;
|
||||
let ctx = RequestContextBuilder::extend(&ctx).scope(context::Scope::new_timeline(&timeline)).build();
|
||||
|
||||
let mut body = StreamReader::new(request.into_body().map(|res| {
|
||||
res.map_err(|error| {
|
||||
@@ -3625,6 +3717,10 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
|
||||
|r| api_handler(r, get_timestamp_of_lsn_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/patch_index_part",
|
||||
|r| api_handler(r, timeline_patch_index_part_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
|
||||
|r| api_handler(r, lsn_lease_handler),
|
||||
|
||||
@@ -64,6 +64,7 @@ pub struct CancellableTask {
|
||||
pub cancel: CancellationToken,
|
||||
}
|
||||
pub struct HttpEndpointListener(pub CancellableTask);
|
||||
pub struct HttpsEndpointListener(pub CancellableTask);
|
||||
pub struct ConsumptionMetricsTasks(pub CancellableTask);
|
||||
pub struct DiskUsageEvictionTask(pub CancellableTask);
|
||||
impl CancellableTask {
|
||||
@@ -77,6 +78,7 @@ impl CancellableTask {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn shutdown_pageserver(
|
||||
http_listener: HttpEndpointListener,
|
||||
https_listener: Option<HttpsEndpointListener>,
|
||||
page_service: page_service::Listener,
|
||||
consumption_metrics_worker: ConsumptionMetricsTasks,
|
||||
disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
|
||||
@@ -213,6 +215,15 @@ pub async fn shutdown_pageserver(
|
||||
)
|
||||
.await;
|
||||
|
||||
if let Some(https_listener) = https_listener {
|
||||
timed(
|
||||
https_listener.0.shutdown(),
|
||||
"shutdown https",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
// Shut down the HTTP endpoint last, so that you can still check the server's
|
||||
// status while it's shutting down.
|
||||
// FIXME: We should probably stop accepting commands like attach/detach earlier.
|
||||
|
||||
@@ -143,6 +143,29 @@ pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static LAYERS_PER_READ_BATCH_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_layers_per_read_batch_global",
|
||||
"Layers visited to serve a single read batch (read amplification), regardless of number of reads.",
|
||||
vec![
|
||||
1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0
|
||||
],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static LAYERS_PER_READ_AMORTIZED_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_layers_per_read_amortized_global",
|
||||
"Layers visited to serve a single read (read amplification). Amortized across a batch: \
|
||||
all visited layers are divided by number of reads.",
|
||||
vec![
|
||||
1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0
|
||||
],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
|
||||
// We expect this to be low because of Postgres checkpoints. Let's see if that holds.
|
||||
register_histogram!(
|
||||
@@ -1204,11 +1227,24 @@ impl StorageIoTime {
|
||||
|
||||
pub(crate) static STORAGE_IO_TIME_METRIC: Lazy<StorageIoTime> = Lazy::new(StorageIoTime::new);
|
||||
|
||||
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
|
||||
#[derive(Clone, Copy)]
|
||||
#[repr(usize)]
|
||||
enum StorageIoSizeOperation {
|
||||
Read,
|
||||
Write,
|
||||
}
|
||||
|
||||
impl StorageIoSizeOperation {
|
||||
const VARIANTS: &'static [&'static str] = &["read", "write"];
|
||||
|
||||
fn as_str(&self) -> &'static str {
|
||||
Self::VARIANTS[*self as usize]
|
||||
}
|
||||
}
|
||||
|
||||
// Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
|
||||
pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
static STORAGE_IO_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_io_operations_bytes_total",
|
||||
"Total amount of bytes read/written in IO operations",
|
||||
&["operation", "tenant_id", "shard_id", "timeline_id"]
|
||||
@@ -1216,6 +1252,34 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) struct StorageIoSizeMetrics {
|
||||
pub read: UIntGauge,
|
||||
pub write: UIntGauge,
|
||||
}
|
||||
|
||||
impl StorageIoSizeMetrics {
|
||||
pub(crate) fn new(tenant_id: &str, shard_id: &str, timeline_id: &str) -> Self {
|
||||
let read = STORAGE_IO_SIZE
|
||||
.get_metric_with_label_values(&[
|
||||
StorageIoSizeOperation::Read.as_str(),
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
])
|
||||
.unwrap();
|
||||
let write = STORAGE_IO_SIZE
|
||||
.get_metric_with_label_values(&[
|
||||
StorageIoSizeOperation::Write.as_str(),
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
])
|
||||
.unwrap();
|
||||
Self { read, write }
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(test))]
|
||||
pub(crate) mod virtual_file_descriptor_cache {
|
||||
use super::*;
|
||||
@@ -2798,6 +2862,7 @@ pub(crate) struct TimelineMetrics {
|
||||
/// Number of valid LSN leases.
|
||||
pub valid_lsn_lease_count_gauge: UIntGauge,
|
||||
pub wal_records_received: IntCounter,
|
||||
pub storage_io_size: StorageIoSizeMetrics,
|
||||
shutdown: std::sync::atomic::AtomicBool,
|
||||
}
|
||||
|
||||
@@ -2933,6 +2998,8 @@ impl TimelineMetrics {
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let storage_io_size = StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id);
|
||||
|
||||
TimelineMetrics {
|
||||
tenant_id,
|
||||
shard_id,
|
||||
@@ -2962,6 +3029,7 @@ impl TimelineMetrics {
|
||||
evictions_with_low_residence_duration: std::sync::RwLock::new(
|
||||
evictions_with_low_residence_duration,
|
||||
),
|
||||
storage_io_size,
|
||||
valid_lsn_lease_count_gauge,
|
||||
wal_records_received,
|
||||
shutdown: std::sync::atomic::AtomicBool::default(),
|
||||
@@ -3152,7 +3220,7 @@ impl TimelineMetrics {
|
||||
]);
|
||||
}
|
||||
|
||||
for op in STORAGE_IO_SIZE_OPERATIONS {
|
||||
for op in StorageIoSizeOperation::VARIANTS {
|
||||
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
|
||||
}
|
||||
|
||||
@@ -4074,6 +4142,8 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
|
||||
// histograms
|
||||
[
|
||||
&LAYERS_PER_READ_GLOBAL,
|
||||
&LAYERS_PER_READ_BATCH_GLOBAL,
|
||||
&LAYERS_PER_READ_AMORTIZED_GLOBAL,
|
||||
&DELTAS_PER_READ_GLOBAL,
|
||||
&WAIT_LSN_TIME,
|
||||
&WAL_REDO_TIME,
|
||||
|
||||
@@ -56,6 +56,7 @@ use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::metrics::{
|
||||
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer,
|
||||
TimelineMetrics,
|
||||
};
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
use crate::span::{
|
||||
@@ -392,10 +393,6 @@ impl TimelineHandles {
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
timeline::handle::GetError::TenantManager(e) => e,
|
||||
timeline::handle::GetError::TimelineGateClosed => {
|
||||
trace!("timeline gate closed");
|
||||
GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
|
||||
}
|
||||
timeline::handle::GetError::PerTimelineStateShutDown => {
|
||||
trace!("per-timeline state shut down");
|
||||
GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
|
||||
@@ -422,24 +419,36 @@ pub(crate) struct TenantManagerTypes;
|
||||
impl timeline::handle::Types for TenantManagerTypes {
|
||||
type TenantManagerError = GetActiveTimelineError;
|
||||
type TenantManager = TenantManagerWrapper;
|
||||
type Timeline = Arc<Timeline>;
|
||||
type Timeline = TenantManagerCacheItem;
|
||||
}
|
||||
|
||||
impl timeline::handle::ArcTimeline<TenantManagerTypes> for Arc<Timeline> {
|
||||
fn gate(&self) -> &utils::sync::gate::Gate {
|
||||
&self.gate
|
||||
}
|
||||
pub(crate) struct TenantManagerCacheItem {
|
||||
pub(crate) timeline: Arc<Timeline>,
|
||||
// allow() for cheap propagation through RequestContext inside a task
|
||||
#[allow(clippy::redundant_allocation)]
|
||||
pub(crate) metrics: Arc<Arc<TimelineMetrics>>,
|
||||
#[allow(dead_code)] // we store it to keep the gate open
|
||||
pub(crate) gate_guard: GateGuard,
|
||||
}
|
||||
|
||||
impl std::ops::Deref for TenantManagerCacheItem {
|
||||
type Target = Arc<Timeline>;
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.timeline
|
||||
}
|
||||
}
|
||||
|
||||
impl timeline::handle::Timeline<TenantManagerTypes> for TenantManagerCacheItem {
|
||||
fn shard_timeline_id(&self) -> timeline::handle::ShardTimelineId {
|
||||
Timeline::shard_timeline_id(self)
|
||||
Timeline::shard_timeline_id(&self.timeline)
|
||||
}
|
||||
|
||||
fn per_timeline_state(&self) -> &timeline::handle::PerTimelineState<TenantManagerTypes> {
|
||||
&self.handles
|
||||
&self.timeline.handles
|
||||
}
|
||||
|
||||
fn get_shard_identity(&self) -> &pageserver_api::shard::ShardIdentity {
|
||||
Timeline::get_shard_identity(self)
|
||||
Timeline::get_shard_identity(&self.timeline)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -448,7 +457,7 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
|
||||
) -> Result<TenantManagerCacheItem, GetActiveTimelineError> {
|
||||
let tenant_id = self.tenant_id.get().expect("we set this in get()");
|
||||
let timeout = ACTIVE_TENANT_TIMEOUT;
|
||||
let wait_start = Instant::now();
|
||||
@@ -491,7 +500,23 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
|
||||
let timeline = tenant_shard
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(GetActiveTimelineError::Timeline)?;
|
||||
Ok(timeline)
|
||||
|
||||
let gate_guard = match timeline.gate.enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(_) => {
|
||||
return Err(GetActiveTimelineError::Timeline(
|
||||
GetTimelineError::ShuttingDown,
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let metrics = Arc::new(Arc::clone(&timeline.metrics));
|
||||
|
||||
Ok(TenantManagerCacheItem {
|
||||
timeline,
|
||||
metrics,
|
||||
gate_guard,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1220,6 +1245,14 @@ impl PageServerHandler {
|
||||
),
|
||||
QueryError,
|
||||
> {
|
||||
macro_rules! upgrade_handle_and_set_context {
|
||||
($shard:ident) => {{
|
||||
let weak_handle = &$shard;
|
||||
let handle = weak_handle.upgrade()?;
|
||||
let ctx = ctx.with_scope_page_service_pagestream(&handle);
|
||||
(handle, ctx)
|
||||
}};
|
||||
}
|
||||
Ok(match batch {
|
||||
BatchedFeMessage::Exists {
|
||||
span,
|
||||
@@ -1228,9 +1261,10 @@ impl PageServerHandler {
|
||||
req,
|
||||
} => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::exists");
|
||||
let (shard, ctx) = upgrade_handle_and_set_context!(shard);
|
||||
(
|
||||
vec![
|
||||
self.handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx)
|
||||
self.handle_get_rel_exists_request(&shard, &req, &ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))
|
||||
@@ -1246,9 +1280,10 @@ impl PageServerHandler {
|
||||
req,
|
||||
} => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::nblocks");
|
||||
let (shard, ctx) = upgrade_handle_and_set_context!(shard);
|
||||
(
|
||||
vec![
|
||||
self.handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx)
|
||||
self.handle_get_nblocks_request(&shard, &req, &ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))
|
||||
@@ -1264,17 +1299,18 @@ impl PageServerHandler {
|
||||
pages,
|
||||
} => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::getpage");
|
||||
let (shard, ctx) = upgrade_handle_and_set_context!(shard);
|
||||
(
|
||||
{
|
||||
let npages = pages.len();
|
||||
trace!(npages, "handling getpage request");
|
||||
let res = self
|
||||
.handle_get_page_at_lsn_request_batched(
|
||||
&*shard.upgrade()?,
|
||||
&shard,
|
||||
effective_request_lsn,
|
||||
pages,
|
||||
io_concurrency,
|
||||
ctx,
|
||||
&ctx,
|
||||
)
|
||||
.instrument(span.clone())
|
||||
.await;
|
||||
@@ -1291,9 +1327,10 @@ impl PageServerHandler {
|
||||
req,
|
||||
} => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::dbsize");
|
||||
let (shard, ctx) = upgrade_handle_and_set_context!(shard);
|
||||
(
|
||||
vec![
|
||||
self.handle_db_size_request(&*shard.upgrade()?, &req, ctx)
|
||||
self.handle_db_size_request(&shard, &req, &ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))
|
||||
@@ -1309,9 +1346,10 @@ impl PageServerHandler {
|
||||
req,
|
||||
} => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
|
||||
let (shard, ctx) = upgrade_handle_and_set_context!(shard);
|
||||
(
|
||||
vec![
|
||||
self.handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx)
|
||||
self.handle_get_slru_segment_request(&shard, &req, &ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))
|
||||
@@ -1327,12 +1365,13 @@ impl PageServerHandler {
|
||||
requests,
|
||||
} => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::test");
|
||||
let (shard, ctx) = upgrade_handle_and_set_context!(shard);
|
||||
(
|
||||
{
|
||||
let npages = requests.len();
|
||||
trace!(npages, "handling getpage request");
|
||||
let res = self
|
||||
.handle_test_request_batch(&*shard.upgrade()?, requests, ctx)
|
||||
.handle_test_request_batch(&shard, requests, &ctx)
|
||||
.instrument(span.clone())
|
||||
.await;
|
||||
assert_eq!(res.len(), npages);
|
||||
@@ -2095,6 +2134,7 @@ impl PageServerHandler {
|
||||
// TODO: passthrough the error site to the final error message?
|
||||
BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)),
|
||||
BasebackupError::Server(e) => QueryError::Other(e),
|
||||
BasebackupError::Shutdown => QueryError::Shutdown,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2107,6 +2147,7 @@ impl PageServerHandler {
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.await?;
|
||||
set_tracing_field_shard_id(&timeline);
|
||||
let ctx = ctx.with_scope_timeline(&timeline);
|
||||
|
||||
if timeline.is_archived() == Some(true) {
|
||||
tracing::info!(
|
||||
@@ -2124,7 +2165,7 @@ impl PageServerHandler {
|
||||
lsn,
|
||||
crate::tenant::timeline::WaitLsnWaiter::PageService,
|
||||
crate::tenant::timeline::WaitLsnTimeout::Default,
|
||||
ctx,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
timeline
|
||||
@@ -2150,7 +2191,7 @@ impl PageServerHandler {
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
ctx,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(map_basebackup_error)?;
|
||||
@@ -2173,7 +2214,7 @@ impl PageServerHandler {
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
ctx,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(map_basebackup_error)?;
|
||||
@@ -2190,7 +2231,7 @@ impl PageServerHandler {
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
ctx,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(map_basebackup_error)?;
|
||||
|
||||
@@ -21,6 +21,7 @@ use pageserver_api::key::{
|
||||
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
|
||||
};
|
||||
use pageserver_api::keyspace::SparseKeySpace;
|
||||
use pageserver_api::models::RelSizeMigration;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
@@ -492,7 +493,9 @@ impl Timeline {
|
||||
// Otherwise, read the old reldir keyspace.
|
||||
// TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2.
|
||||
|
||||
if self.get_rel_size_v2_enabled() {
|
||||
if let RelSizeMigration::Migrated | RelSizeMigration::Migrating =
|
||||
self.get_rel_size_v2_status()
|
||||
{
|
||||
// fetch directory listing (new)
|
||||
let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum);
|
||||
let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?)
|
||||
@@ -544,7 +547,7 @@ impl Timeline {
|
||||
forknum: *forknum,
|
||||
}));
|
||||
|
||||
if !self.get_rel_size_v2_enabled() {
|
||||
if let RelSizeMigration::Legacy = self.get_rel_size_v2_status() {
|
||||
return Ok(rels_v1);
|
||||
}
|
||||
|
||||
@@ -599,28 +602,36 @@ impl Timeline {
|
||||
let n_blocks = self
|
||||
.get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
|
||||
.await?;
|
||||
let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
|
||||
for blkno in 0..n_blocks {
|
||||
let block = self
|
||||
.get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx)
|
||||
.await?;
|
||||
segment.extend_from_slice(&block[..BLCKSZ as usize]);
|
||||
}
|
||||
Ok(segment.freeze())
|
||||
}
|
||||
|
||||
/// Look up given SLRU page version.
|
||||
pub(crate) async fn get_slru_page_at_lsn(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
assert!(self.tenant_shard_id.is_shard_zero());
|
||||
let key = slru_block_to_key(kind, segno, blknum);
|
||||
self.get(key, lsn, ctx).await
|
||||
let keyspace = KeySpace::single(
|
||||
slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, n_blocks),
|
||||
);
|
||||
|
||||
let batches = keyspace.partition(
|
||||
self.get_shard_identity(),
|
||||
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
|
||||
);
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
self.conf,
|
||||
self.gate
|
||||
.enter()
|
||||
.map_err(|_| PageReconstructError::Cancelled)?,
|
||||
);
|
||||
|
||||
let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
|
||||
for batch in batches.parts {
|
||||
let blocks = self
|
||||
.get_vectored(batch, lsn, io_concurrency.clone(), ctx)
|
||||
.await?;
|
||||
|
||||
for (_key, block) in blocks {
|
||||
let block = block?;
|
||||
segment.extend_from_slice(&block[..BLCKSZ as usize]);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(segment.freeze())
|
||||
}
|
||||
|
||||
/// Get size of an SLRU segment
|
||||
@@ -829,19 +840,41 @@ impl Timeline {
|
||||
let nblocks = self
|
||||
.get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
|
||||
.await?;
|
||||
for blknum in (0..nblocks).rev() {
|
||||
let clog_page = self
|
||||
.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx)
|
||||
|
||||
let keyspace = KeySpace::single(
|
||||
slru_block_to_key(SlruKind::Clog, segno, 0)
|
||||
..slru_block_to_key(SlruKind::Clog, segno, nblocks),
|
||||
);
|
||||
|
||||
let batches = keyspace.partition(
|
||||
self.get_shard_identity(),
|
||||
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
|
||||
);
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
self.conf,
|
||||
self.gate
|
||||
.enter()
|
||||
.map_err(|_| PageReconstructError::Cancelled)?,
|
||||
);
|
||||
|
||||
for batch in batches.parts.into_iter().rev() {
|
||||
let blocks = self
|
||||
.get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx)
|
||||
.await?;
|
||||
|
||||
if clog_page.len() == BLCKSZ as usize + 8 {
|
||||
let mut timestamp_bytes = [0u8; 8];
|
||||
timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
|
||||
let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
|
||||
for (_key, clog_page) in blocks.into_iter().rev() {
|
||||
let clog_page = clog_page?;
|
||||
|
||||
match f(timestamp) {
|
||||
ControlFlow::Break(b) => return Ok(b),
|
||||
ControlFlow::Continue(()) => (),
|
||||
if clog_page.len() == BLCKSZ as usize + 8 {
|
||||
let mut timestamp_bytes = [0u8; 8];
|
||||
timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
|
||||
let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
|
||||
|
||||
match f(timestamp) {
|
||||
ControlFlow::Break(b) => return Ok(b),
|
||||
ControlFlow::Continue(()) => (),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1052,6 +1085,8 @@ impl Timeline {
|
||||
) -> Result<u64, CalculateLogicalSizeError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
|
||||
|
||||
fail::fail_point!("skip-logical-size-calculation", |_| { Ok(0) });
|
||||
|
||||
// Fetch list of database dirs and iterate them
|
||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
||||
let dbdir = DbDirectory::des(&buf)?;
|
||||
@@ -1718,6 +1753,35 @@ impl DatadirModification<'_> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns `true` if the rel_size_v2 write path is enabled. If it is the first time that
|
||||
/// we enable it, we also need to persist it in `index_part.json`.
|
||||
pub fn maybe_enable_rel_size_v2(&mut self) -> anyhow::Result<bool> {
|
||||
let status = self.tline.get_rel_size_v2_status();
|
||||
let config = self.tline.get_rel_size_v2_enabled();
|
||||
match (config, status) {
|
||||
(false, RelSizeMigration::Legacy) => {
|
||||
// tenant config didn't enable it and we didn't write any reldir_v2 key yet
|
||||
Ok(false)
|
||||
}
|
||||
(false, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
|
||||
// index_part already persisted that the timeline has enabled rel_size_v2
|
||||
Ok(true)
|
||||
}
|
||||
(true, RelSizeMigration::Legacy) => {
|
||||
// The first time we enable it, we need to persist it in `index_part.json`
|
||||
self.tline
|
||||
.update_rel_size_v2_status(RelSizeMigration::Migrating)?;
|
||||
tracing::info!("enabled rel_size_v2");
|
||||
Ok(true)
|
||||
}
|
||||
(true, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
|
||||
// index_part already persisted that the timeline has enabled rel_size_v2
|
||||
// and we don't need to do anything
|
||||
Ok(true)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Store a relmapper file (pg_filenode.map) in the repository
|
||||
pub async fn put_relmap_file(
|
||||
&mut self,
|
||||
@@ -1726,6 +1790,8 @@ impl DatadirModification<'_> {
|
||||
img: Bytes,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let v2_enabled = self.maybe_enable_rel_size_v2()?;
|
||||
|
||||
// Add it to the directory (if it doesn't exist already)
|
||||
let buf = self.get(DBDIR_KEY, ctx).await?;
|
||||
let mut dbdir = DbDirectory::des(&buf)?;
|
||||
@@ -1746,7 +1812,7 @@ impl DatadirModification<'_> {
|
||||
})?;
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
|
||||
if self.tline.get_rel_size_v2_enabled() {
|
||||
if v2_enabled {
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
|
||||
}
|
||||
@@ -1898,12 +1964,12 @@ impl DatadirModification<'_> {
|
||||
.context("deserialize db")?
|
||||
};
|
||||
|
||||
// Add the new relation to the rel directory entry, and write it back
|
||||
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
|
||||
return Err(RelationError::AlreadyExists);
|
||||
}
|
||||
let v2_enabled = self.maybe_enable_rel_size_v2()?;
|
||||
|
||||
if self.tline.get_rel_size_v2_enabled() {
|
||||
if v2_enabled {
|
||||
if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
|
||||
return Err(RelationError::AlreadyExists);
|
||||
}
|
||||
let sparse_rel_dir_key =
|
||||
rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
|
||||
// check if the rel_dir_key exists in v2
|
||||
@@ -1938,6 +2004,10 @@ impl DatadirModification<'_> {
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::RelV2, MetricsUpdate::Add(1)));
|
||||
} else {
|
||||
// Add the new relation to the rel directory entry, and write it back
|
||||
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
|
||||
return Err(RelationError::AlreadyExists);
|
||||
}
|
||||
if !dbdir_exists {
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)))
|
||||
@@ -1951,6 +2021,7 @@ impl DatadirModification<'_> {
|
||||
)),
|
||||
);
|
||||
}
|
||||
|
||||
// Put size
|
||||
let size_key = rel_size_to_key(rel);
|
||||
let buf = nblocks.to_le_bytes();
|
||||
@@ -2029,6 +2100,7 @@ impl DatadirModification<'_> {
|
||||
drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let v2_enabled = self.maybe_enable_rel_size_v2()?;
|
||||
for ((spc_node, db_node), rel_tags) in drop_relations {
|
||||
let dir_key = rel_dir_to_key(spc_node, db_node);
|
||||
let buf = self.get(dir_key, ctx).await?;
|
||||
@@ -2041,7 +2113,7 @@ impl DatadirModification<'_> {
|
||||
.push((DirectoryKind::Rel, MetricsUpdate::Sub(1)));
|
||||
dirty = true;
|
||||
true
|
||||
} else if self.tline.get_rel_size_v2_enabled() {
|
||||
} else if v2_enabled {
|
||||
// The rel is not found in the old reldir key, so we need to check the new sparse keyspace.
|
||||
// Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion
|
||||
// logic).
|
||||
@@ -2072,7 +2144,7 @@ impl DatadirModification<'_> {
|
||||
// Remove entry from relation size cache
|
||||
self.tline.remove_cached_rel_size(&rel_tag);
|
||||
|
||||
// Delete size entry, as well as all blocks
|
||||
// Delete size entry, as well as all blocks; this is currently a no-op because we haven't implemented tombstones in storage.
|
||||
self.delete(rel_key_range(rel_tag));
|
||||
}
|
||||
}
|
||||
@@ -2686,7 +2758,7 @@ mod tests {
|
||||
TimelineId::from_array(hex!("11223344556677881122334455667788"));
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
let (tline, ctx) = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let tline = tline.raw_timeline().unwrap();
|
||||
|
||||
@@ -31,8 +31,8 @@ use futures::StreamExt;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use itertools::Itertools as _;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::models;
|
||||
pub use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::{self, RelSizeMigration};
|
||||
use pageserver_api::models::{
|
||||
CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem,
|
||||
WalRedoManagerStatus,
|
||||
@@ -77,6 +77,8 @@ use self::timeline::{
|
||||
EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError,
|
||||
};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context;
|
||||
use crate::context::RequestContextBuilder;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
|
||||
use crate::l0_flush::L0FlushGlobalState;
|
||||
@@ -1114,7 +1116,7 @@ impl Tenant {
|
||||
}
|
||||
};
|
||||
|
||||
let timeline = self.create_timeline_struct(
|
||||
let (timeline, timeline_ctx) = self.create_timeline_struct(
|
||||
timeline_id,
|
||||
&metadata,
|
||||
previous_heatmap,
|
||||
@@ -1123,6 +1125,8 @@ impl Tenant {
|
||||
CreateTimelineCause::Load,
|
||||
idempotency.clone(),
|
||||
index_part.gc_compaction.clone(),
|
||||
index_part.rel_size_migration.clone(),
|
||||
ctx,
|
||||
)?;
|
||||
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
|
||||
anyhow::ensure!(
|
||||
@@ -1149,16 +1153,19 @@ impl Tenant {
|
||||
// a previous heatmap which contains all visible layers in the layer map.
|
||||
// This previous heatmap will be used whenever a fresh heatmap is generated
|
||||
// for the timeline.
|
||||
if matches!(cause, LoadTimelineCause::Unoffload) {
|
||||
if self.conf.generate_unarchival_heatmap && matches!(cause, LoadTimelineCause::Unoffload) {
|
||||
let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn()));
|
||||
while let Some((tline, end_lsn)) = tline_ending_at {
|
||||
let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await;
|
||||
if !tline.is_previous_heatmap_active() {
|
||||
// Another unearchived timeline might have generated a heatmap for this ancestor.
|
||||
// If the current branch point greater than the previous one use the the heatmap
|
||||
// we just generated - it should include more layers.
|
||||
if !tline.should_keep_previous_heatmap(end_lsn) {
|
||||
tline
|
||||
.previous_heatmap
|
||||
.store(Some(Arc::new(unarchival_heatmap)));
|
||||
} else {
|
||||
tracing::info!("Previous heatmap still active. Dropping unarchival heatmap.")
|
||||
tracing::info!("Previous heatmap preferred. Dropping unarchival heatmap.")
|
||||
}
|
||||
|
||||
match tline.ancestor_timeline() {
|
||||
@@ -1253,7 +1260,7 @@ impl Tenant {
|
||||
match activate {
|
||||
ActivateTimelineArgs::Yes { broker_client } => {
|
||||
info!("activating timeline after reload from pgdata import task");
|
||||
timeline.activate(self.clone(), broker_client, None, ctx);
|
||||
timeline.activate(self.clone(), broker_client, None, &timeline_ctx);
|
||||
}
|
||||
ActivateTimelineArgs::No => (),
|
||||
}
|
||||
@@ -1578,6 +1585,10 @@ impl Tenant {
|
||||
}
|
||||
|
||||
async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> {
|
||||
if !self.conf.load_previous_heatmap {
|
||||
return None;
|
||||
}
|
||||
|
||||
let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id);
|
||||
match tokio::fs::read_to_string(on_disk_heatmap_path).await {
|
||||
Ok(heatmap) => match serde_json::from_str::<HeatMapTenant>(&heatmap) {
|
||||
@@ -1757,6 +1768,7 @@ impl Tenant {
|
||||
import_pgdata,
|
||||
ActivateTimelineArgs::No,
|
||||
guard,
|
||||
ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -1774,6 +1786,7 @@ impl Tenant {
|
||||
timeline_id,
|
||||
&index_part.metadata,
|
||||
remote_timeline_client,
|
||||
ctx,
|
||||
)
|
||||
.instrument(tracing::info_span!("timeline_delete", %timeline_id))
|
||||
.await
|
||||
@@ -1939,6 +1952,7 @@ impl Tenant {
|
||||
hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active {
|
||||
heatmap: h,
|
||||
read_at: hs.1,
|
||||
end_lsn: None,
|
||||
})
|
||||
});
|
||||
part_downloads.spawn(
|
||||
@@ -2210,7 +2224,7 @@ impl Tenant {
|
||||
self.clone(),
|
||||
broker_client.clone(),
|
||||
background_jobs_can_start,
|
||||
&ctx,
|
||||
&ctx.with_scope_timeline(&timeline),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -2407,8 +2421,8 @@ impl Tenant {
|
||||
new_timeline_id: TimelineId,
|
||||
initdb_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<UninitializedTimeline> {
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<(UninitializedTimeline, RequestContext)> {
|
||||
anyhow::ensure!(
|
||||
self.is_active(),
|
||||
"Cannot create empty timelines on inactive tenant"
|
||||
@@ -2442,6 +2456,8 @@ impl Tenant {
|
||||
create_guard,
|
||||
initdb_lsn,
|
||||
None,
|
||||
None,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -2459,7 +2475,7 @@ impl Tenant {
|
||||
pg_version: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let uninit_tl = self
|
||||
let (uninit_tl, ctx) = self
|
||||
.create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
|
||||
.await?;
|
||||
let tline = uninit_tl.raw_timeline().expect("we just created it");
|
||||
@@ -2471,7 +2487,7 @@ impl Tenant {
|
||||
.init_empty_test_timeline()
|
||||
.context("init_empty_test_timeline")?;
|
||||
modification
|
||||
.commit(ctx)
|
||||
.commit(&ctx)
|
||||
.await
|
||||
.context("commit init_empty_test_timeline modification")?;
|
||||
|
||||
@@ -2497,6 +2513,7 @@ impl Tenant {
|
||||
initdb_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
ctx: &RequestContext,
|
||||
in_memory_layer_desc: Vec<timeline::InMemoryLayerTestDesc>,
|
||||
delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
|
||||
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
|
||||
end_lsn: Lsn,
|
||||
@@ -2518,6 +2535,11 @@ impl Tenant {
|
||||
.force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
|
||||
.await?;
|
||||
}
|
||||
for in_memory in in_memory_layer_desc {
|
||||
tline
|
||||
.force_create_in_memory_layer(in_memory, Some(initdb_lsn), ctx)
|
||||
.await?;
|
||||
}
|
||||
let layer_names = tline
|
||||
.layers
|
||||
.read()
|
||||
@@ -2683,7 +2705,12 @@ impl Tenant {
|
||||
// doing stuff before the IndexPart is durable in S3, which is done by the previous section.
|
||||
let activated_timeline = match result {
|
||||
CreateTimelineResult::Created(timeline) => {
|
||||
timeline.activate(self.clone(), broker_client, None, ctx);
|
||||
timeline.activate(
|
||||
self.clone(),
|
||||
broker_client,
|
||||
None,
|
||||
&ctx.with_scope_timeline(&timeline),
|
||||
);
|
||||
timeline
|
||||
}
|
||||
CreateTimelineResult::Idempotent(timeline) => {
|
||||
@@ -2745,10 +2772,9 @@ impl Tenant {
|
||||
}
|
||||
};
|
||||
|
||||
let mut uninit_timeline = {
|
||||
let (mut uninit_timeline, timeline_ctx) = {
|
||||
let this = &self;
|
||||
let initdb_lsn = Lsn(0);
|
||||
let _ctx = ctx;
|
||||
async move {
|
||||
let new_metadata = TimelineMetadata::new(
|
||||
// Initialize disk_consistent LSN to 0, The caller must import some data to
|
||||
@@ -2767,6 +2793,8 @@ impl Tenant {
|
||||
timeline_create_guard,
|
||||
initdb_lsn,
|
||||
None,
|
||||
None,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -2796,6 +2824,7 @@ impl Tenant {
|
||||
index_part,
|
||||
activate,
|
||||
timeline_create_guard,
|
||||
timeline_ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
|
||||
));
|
||||
|
||||
// NB: the timeline doesn't exist in self.timelines at this point
|
||||
@@ -2809,6 +2838,7 @@ impl Tenant {
|
||||
index_part: import_pgdata::index_part_format::Root,
|
||||
activate: ActivateTimelineArgs,
|
||||
timeline_create_guard: TimelineCreateGuard,
|
||||
ctx: RequestContext,
|
||||
) {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
info!("starting");
|
||||
@@ -2820,6 +2850,7 @@ impl Tenant {
|
||||
index_part,
|
||||
activate,
|
||||
timeline_create_guard,
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
if let Err(err) = &res {
|
||||
@@ -2835,9 +2866,8 @@ impl Tenant {
|
||||
index_part: import_pgdata::index_part_format::Root,
|
||||
activate: ActivateTimelineArgs,
|
||||
timeline_create_guard: TimelineCreateGuard,
|
||||
ctx: RequestContext,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
let ctx = RequestContext::new(TaskKind::ImportPgdata, DownloadBehavior::Warn);
|
||||
|
||||
info!("importing pgdata");
|
||||
import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone())
|
||||
.await
|
||||
@@ -3046,6 +3076,7 @@ impl Tenant {
|
||||
|
||||
let mut has_pending_l0 = false;
|
||||
for timeline in compact_l0 {
|
||||
let ctx = &ctx.with_scope_timeline(&timeline);
|
||||
let outcome = timeline
|
||||
.compact(cancel, CompactFlags::OnlyL0Compaction.into(), ctx)
|
||||
.instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
|
||||
@@ -3079,6 +3110,7 @@ impl Tenant {
|
||||
if !timeline.is_active() {
|
||||
continue;
|
||||
}
|
||||
let ctx = &ctx.with_scope_timeline(&timeline);
|
||||
|
||||
let mut outcome = timeline
|
||||
.compact(cancel, EnumSet::default(), ctx)
|
||||
@@ -3141,11 +3173,13 @@ impl Tenant {
|
||||
/// Trips the compaction circuit breaker if appropriate.
|
||||
pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) {
|
||||
match err {
|
||||
err if err.is_cancel() => {}
|
||||
CompactionError::ShuttingDown => (),
|
||||
// Offload failures don't trip the circuit breaker, since they're cheap to retry and
|
||||
// shouldn't block compaction.
|
||||
CompactionError::Offload(_) => {}
|
||||
CompactionError::CollectKeySpaceError(err) => {
|
||||
// CollectKeySpaceError::Cancelled and PageRead::Cancelled are handled in `err.is_cancel` branch.
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
.unwrap()
|
||||
@@ -3302,7 +3336,7 @@ impl Tenant {
|
||||
self.clone(),
|
||||
broker_client.clone(),
|
||||
background_jobs_can_start,
|
||||
ctx,
|
||||
&ctx.with_scope_timeline(timeline),
|
||||
);
|
||||
activated_timelines += 1;
|
||||
}
|
||||
@@ -4116,7 +4150,9 @@ impl Tenant {
|
||||
cause: CreateTimelineCause,
|
||||
create_idempotency: CreateTimelineIdempotency,
|
||||
gc_compaction_state: Option<GcCompactionState>,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
rel_size_v2_status: Option<RelSizeMigration>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<(Arc<Timeline>, RequestContext)> {
|
||||
let state = match cause {
|
||||
CreateTimelineCause::Load => {
|
||||
let ancestor_id = new_metadata.ancestor_timeline();
|
||||
@@ -4148,10 +4184,15 @@ impl Tenant {
|
||||
self.attach_wal_lag_cooldown.clone(),
|
||||
create_idempotency,
|
||||
gc_compaction_state,
|
||||
rel_size_v2_status,
|
||||
self.cancel.child_token(),
|
||||
);
|
||||
|
||||
Ok(timeline)
|
||||
let timeline_ctx = RequestContextBuilder::extend(ctx)
|
||||
.scope(context::Scope::new_timeline(&timeline))
|
||||
.build();
|
||||
|
||||
Ok((timeline, timeline_ctx))
|
||||
}
|
||||
|
||||
/// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object
|
||||
@@ -4567,6 +4608,7 @@ impl Tenant {
|
||||
// Ensures all timelines use the same start time when computing the time cutoff.
|
||||
let now_ts_for_pitr_calc = SystemTime::now();
|
||||
for timeline in timelines.iter() {
|
||||
let ctx = &ctx.with_scope_timeline(timeline);
|
||||
let cutoff = timeline
|
||||
.get_last_record_lsn()
|
||||
.checked_sub(horizon)
|
||||
@@ -4740,7 +4782,7 @@ impl Tenant {
|
||||
src_timeline: &Arc<Timeline>,
|
||||
dst_id: TimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
_ctx: &RequestContext,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<CreateTimelineResult, CreateTimelineError> {
|
||||
let src_id = src_timeline.timeline_id;
|
||||
|
||||
@@ -4843,13 +4885,15 @@ impl Tenant {
|
||||
src_timeline.pg_version,
|
||||
);
|
||||
|
||||
let uninitialized_timeline = self
|
||||
let (uninitialized_timeline, _timeline_ctx) = self
|
||||
.prepare_new_timeline(
|
||||
dst_id,
|
||||
&metadata,
|
||||
timeline_create_guard,
|
||||
start_lsn + 1,
|
||||
Some(Arc::clone(src_timeline)),
|
||||
Some(src_timeline.get_rel_size_v2_status()),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -5116,13 +5160,15 @@ impl Tenant {
|
||||
pgdata_lsn,
|
||||
pg_version,
|
||||
);
|
||||
let mut raw_timeline = self
|
||||
let (mut raw_timeline, timeline_ctx) = self
|
||||
.prepare_new_timeline(
|
||||
timeline_id,
|
||||
&new_metadata,
|
||||
timeline_create_guard,
|
||||
pgdata_lsn,
|
||||
None,
|
||||
None,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -5133,7 +5179,7 @@ impl Tenant {
|
||||
&unfinished_timeline,
|
||||
&pgdata_path,
|
||||
pgdata_lsn,
|
||||
ctx,
|
||||
&timeline_ctx,
|
||||
)
|
||||
.await
|
||||
.with_context(|| {
|
||||
@@ -5194,6 +5240,7 @@ impl Tenant {
|
||||
/// An empty layer map is initialized, and new data and WAL can be imported starting
|
||||
/// at 'disk_consistent_lsn'. After any initial data has been imported, call
|
||||
/// `finish_creation` to insert the Timeline into the timelines map.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn prepare_new_timeline<'a>(
|
||||
&'a self,
|
||||
new_timeline_id: TimelineId,
|
||||
@@ -5201,15 +5248,17 @@ impl Tenant {
|
||||
create_guard: TimelineCreateGuard,
|
||||
start_lsn: Lsn,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
) -> anyhow::Result<UninitializedTimeline<'a>> {
|
||||
rel_size_v2_status: Option<RelSizeMigration>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<(UninitializedTimeline<'a>, RequestContext)> {
|
||||
let tenant_shard_id = self.tenant_shard_id;
|
||||
|
||||
let resources = self.build_timeline_resources(new_timeline_id);
|
||||
resources
|
||||
.remote_client
|
||||
.init_upload_queue_for_empty_remote(new_metadata)?;
|
||||
.init_upload_queue_for_empty_remote(new_metadata, rel_size_v2_status.clone())?;
|
||||
|
||||
let timeline_struct = self
|
||||
let (timeline_struct, timeline_ctx) = self
|
||||
.create_timeline_struct(
|
||||
new_timeline_id,
|
||||
new_metadata,
|
||||
@@ -5219,6 +5268,8 @@ impl Tenant {
|
||||
CreateTimelineCause::Load,
|
||||
create_guard.idempotency.clone(),
|
||||
None,
|
||||
rel_size_v2_status,
|
||||
ctx,
|
||||
)
|
||||
.context("Failed to create timeline data structure")?;
|
||||
|
||||
@@ -5239,10 +5290,13 @@ impl Tenant {
|
||||
"Successfully created initial files for timeline {tenant_shard_id}/{new_timeline_id}"
|
||||
);
|
||||
|
||||
Ok(UninitializedTimeline::new(
|
||||
self,
|
||||
new_timeline_id,
|
||||
Some((timeline_struct, create_guard)),
|
||||
Ok((
|
||||
UninitializedTimeline::new(
|
||||
self,
|
||||
new_timeline_id,
|
||||
Some((timeline_struct, create_guard)),
|
||||
),
|
||||
timeline_ctx,
|
||||
))
|
||||
}
|
||||
|
||||
@@ -5777,7 +5831,8 @@ pub(crate) mod harness {
|
||||
}
|
||||
|
||||
pub(crate) async fn load(&self) -> (Arc<Tenant>, RequestContext) {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
|
||||
.with_scope_unit_test();
|
||||
(
|
||||
self.do_try_load(&ctx)
|
||||
.await
|
||||
@@ -5907,6 +5962,8 @@ mod tests {
|
||||
#[cfg(feature = "testing")]
|
||||
use timeline::GcInfo;
|
||||
#[cfg(feature = "testing")]
|
||||
use timeline::InMemoryLayerTestDesc;
|
||||
#[cfg(feature = "testing")]
|
||||
use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
|
||||
use timeline::{CompactOptions, DeltaLayerTestDesc};
|
||||
use utils::id::TenantId;
|
||||
@@ -6798,7 +6855,7 @@ mod tests {
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let io_concurrency = IoConcurrency::spawn_for_test();
|
||||
let tline = tenant
|
||||
let (tline, ctx) = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let tline = tline.raw_timeline().unwrap();
|
||||
@@ -7420,7 +7477,7 @@ mod tests {
|
||||
.await;
|
||||
|
||||
let initdb_lsn = Lsn(0x20);
|
||||
let utline = tenant
|
||||
let (utline, ctx) = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let tline = utline.raw_timeline().unwrap();
|
||||
@@ -7487,7 +7544,7 @@ mod tests {
|
||||
let harness = TenantHarness::create(name).await?;
|
||||
{
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
let (tline, _ctx) = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
// Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
|
||||
@@ -7919,6 +7976,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
Vec::new(), // delta layers
|
||||
vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers
|
||||
Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
|
||||
@@ -8006,6 +8064,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
Vec::new(), // delta layers
|
||||
vec![(
|
||||
Lsn(0x20),
|
||||
@@ -8221,6 +8280,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
// delta layers
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
@@ -8301,6 +8361,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
// delta layers
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
@@ -8374,6 +8435,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
// delta layers
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
@@ -8506,6 +8568,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
|
||||
@@ -8699,6 +8762,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
vec![DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x10)..Lsn(0x40),
|
||||
delta1,
|
||||
@@ -8755,6 +8819,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
Vec::new(),
|
||||
image_layers,
|
||||
end_lsn,
|
||||
@@ -8961,6 +9026,7 @@ mod tests {
|
||||
Lsn(0x08),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x08)..Lsn(0x10),
|
||||
@@ -8979,7 +9045,7 @@ mod tests {
|
||||
delta3,
|
||||
),
|
||||
], // delta layers
|
||||
vec![], // image layers
|
||||
vec![], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?
|
||||
@@ -8990,6 +9056,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x10)..Lsn(0x48),
|
||||
@@ -9540,6 +9607,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
|
||||
@@ -9787,6 +9855,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
vec![
|
||||
// delta1 and delta 2 only contain a single key but multiple updates
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
|
||||
@@ -10022,6 +10091,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![], // in-memory layers
|
||||
vec![], // delta layers
|
||||
vec![(Lsn(0x18), img_layer)], // image layers
|
||||
Lsn(0x18),
|
||||
@@ -10268,6 +10338,7 @@ mod tests {
|
||||
baseline_image_layer_lsn,
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![], // in-memory layers
|
||||
vec![DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
delta_layer_start_lsn..delta_layer_end_lsn,
|
||||
delta_layer_spec,
|
||||
@@ -10299,6 +10370,158 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_vectored_read_with_image_layer_inside_inmem() -> anyhow::Result<()> {
|
||||
let harness =
|
||||
TenantHarness::create("test_vectored_read_with_image_layer_inside_inmem").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let will_init_keys = [2, 6];
|
||||
fn get_key(id: u32) -> Key {
|
||||
let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
let mut expected_key_values = HashMap::new();
|
||||
|
||||
let baseline_image_layer_lsn = Lsn(0x10);
|
||||
let mut baseline_img_layer = Vec::new();
|
||||
for i in 0..5 {
|
||||
let key = get_key(i);
|
||||
let value = format!("value {i}@{baseline_image_layer_lsn}");
|
||||
|
||||
let removed = expected_key_values.insert(key, value.clone());
|
||||
assert!(removed.is_none());
|
||||
|
||||
baseline_img_layer.push((key, Bytes::from(value)));
|
||||
}
|
||||
|
||||
let nested_image_layer_lsn = Lsn(0x50);
|
||||
let mut nested_img_layer = Vec::new();
|
||||
for i in 5..10 {
|
||||
let key = get_key(i);
|
||||
let value = format!("value {i}@{nested_image_layer_lsn}");
|
||||
|
||||
let removed = expected_key_values.insert(key, value.clone());
|
||||
assert!(removed.is_none());
|
||||
|
||||
nested_img_layer.push((key, Bytes::from(value)));
|
||||
}
|
||||
|
||||
let frozen_layer = {
|
||||
let lsn_range = Lsn(0x40)..Lsn(0x60);
|
||||
let mut data = Vec::new();
|
||||
for i in 0..10 {
|
||||
let key = get_key(i);
|
||||
let key_in_nested = nested_img_layer
|
||||
.iter()
|
||||
.any(|(key_with_img, _)| *key_with_img == key);
|
||||
let lsn = {
|
||||
if key_in_nested {
|
||||
Lsn(nested_image_layer_lsn.0 + 5)
|
||||
} else {
|
||||
lsn_range.start
|
||||
}
|
||||
};
|
||||
|
||||
let will_init = will_init_keys.contains(&i);
|
||||
if will_init {
|
||||
data.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init(""))));
|
||||
|
||||
expected_key_values.insert(key, "".to_string());
|
||||
} else {
|
||||
let delta = format!("@{lsn}");
|
||||
data.push((
|
||||
key,
|
||||
lsn,
|
||||
Value::WalRecord(NeonWalRecord::wal_append(&delta)),
|
||||
));
|
||||
|
||||
expected_key_values
|
||||
.get_mut(&key)
|
||||
.expect("An image exists for each key")
|
||||
.push_str(delta.as_str());
|
||||
}
|
||||
}
|
||||
|
||||
InMemoryLayerTestDesc {
|
||||
lsn_range,
|
||||
is_open: false,
|
||||
data,
|
||||
}
|
||||
};
|
||||
|
||||
let (open_layer, last_record_lsn) = {
|
||||
let start_lsn = Lsn(0x70);
|
||||
let mut data = Vec::new();
|
||||
let mut end_lsn = Lsn(0);
|
||||
for i in 0..10 {
|
||||
let key = get_key(i);
|
||||
let lsn = Lsn(start_lsn.0 + i as u64);
|
||||
let delta = format!("@{lsn}");
|
||||
data.push((
|
||||
key,
|
||||
lsn,
|
||||
Value::WalRecord(NeonWalRecord::wal_append(&delta)),
|
||||
));
|
||||
|
||||
expected_key_values
|
||||
.get_mut(&key)
|
||||
.expect("An image exists for each key")
|
||||
.push_str(delta.as_str());
|
||||
|
||||
end_lsn = std::cmp::max(end_lsn, lsn);
|
||||
}
|
||||
|
||||
(
|
||||
InMemoryLayerTestDesc {
|
||||
lsn_range: start_lsn..Lsn::MAX,
|
||||
is_open: true,
|
||||
data,
|
||||
},
|
||||
end_lsn,
|
||||
)
|
||||
};
|
||||
|
||||
assert!(
|
||||
nested_image_layer_lsn > frozen_layer.lsn_range.start
|
||||
&& nested_image_layer_lsn < frozen_layer.lsn_range.end
|
||||
);
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
baseline_image_layer_lsn,
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![open_layer, frozen_layer], // in-memory layers
|
||||
Vec::new(), // delta layers
|
||||
vec![
|
||||
(baseline_image_layer_lsn, baseline_img_layer),
|
||||
(nested_image_layer_lsn, nested_img_layer),
|
||||
], // image layers
|
||||
last_record_lsn,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let keyspace = KeySpace::single(get_key(0)..get_key(10));
|
||||
let results = tline
|
||||
.get_vectored(keyspace, last_record_lsn, IoConcurrency::sequential(), &ctx)
|
||||
.await
|
||||
.expect("No vectored errors");
|
||||
for (key, res) in results {
|
||||
let value = res.expect("No key errors");
|
||||
let expected_value = expected_key_values.remove(&key).expect("No unknown keys");
|
||||
assert_eq!(value, Bytes::from(expected_value.clone()));
|
||||
|
||||
tracing::info!("key={key} value={expected_value}");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering {
|
||||
(
|
||||
k1.is_delta,
|
||||
@@ -10414,6 +10637,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![], // in-memory layers
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
|
||||
@@ -10798,6 +11022,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![], // in-memory layers
|
||||
vec![
|
||||
// delta1/2/4 only contain a single key but multiple updates
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
|
||||
@@ -11049,6 +11274,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![], // in-memory layers
|
||||
vec![
|
||||
// delta1/2/4 only contain a single key but multiple updates
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
|
||||
|
||||
@@ -382,7 +382,8 @@ pub(crate) mod tests {
|
||||
}
|
||||
|
||||
async fn round_trip_test_compressed(blobs: &[Vec<u8>], compression: bool) -> Result<(), Error> {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
let ctx =
|
||||
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
|
||||
let (_temp_dir, pathbuf, offsets) =
|
||||
write_maybe_compressed(blobs, compression, &ctx).await?;
|
||||
|
||||
|
||||
@@ -32,8 +32,7 @@ use hex;
|
||||
use thiserror::Error;
|
||||
use tracing::error;
|
||||
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::context::RequestContext;
|
||||
use crate::tenant::block_io::{BlockReader, BlockWriter};
|
||||
use crate::virtual_file::{IoBuffer, IoBufferMut, owned_buffers_io::write::Buffer};
|
||||
|
||||
@@ -478,16 +477,15 @@ where
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub async fn dump(&self) -> Result<()> {
|
||||
pub async fn dump(&self, ctx: &RequestContext) -> Result<()> {
|
||||
let mut stack = Vec::new();
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
|
||||
stack.push((self.root_blk, String::new(), 0, 0, 0));
|
||||
|
||||
let block_cursor = self.reader.block_cursor();
|
||||
|
||||
while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
|
||||
let blk = block_cursor.read_blk(self.start_blk + blknum, &ctx).await?;
|
||||
let blk = block_cursor.read_blk(self.start_blk + blknum, ctx).await?;
|
||||
let buf: &[u8] = blk.as_ref();
|
||||
let node = OnDiskNode::<L>::deparse(buf)?;
|
||||
|
||||
@@ -836,6 +834,8 @@ pub(crate) mod tests {
|
||||
use rand::Rng;
|
||||
|
||||
use super::*;
|
||||
use crate::context::DownloadBehavior;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
@@ -870,7 +870,8 @@ pub(crate) mod tests {
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
let ctx =
|
||||
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
|
||||
|
||||
let all_keys: Vec<&[u8; 6]> = vec![
|
||||
b"xaaaaa", b"xaaaba", b"xaaaca", b"xabaaa", b"xababa", b"xabaca", b"xabada", b"xabadb",
|
||||
@@ -888,7 +889,7 @@ pub(crate) mod tests {
|
||||
|
||||
let reader = DiskBtreeReader::new(0, root_offset, disk);
|
||||
|
||||
reader.dump().await?;
|
||||
reader.dump(&ctx).await?;
|
||||
|
||||
// Test the `get` function on all the keys.
|
||||
for (key, val) in all_data.iter() {
|
||||
@@ -980,7 +981,8 @@ pub(crate) mod tests {
|
||||
async fn lots_of_keys() -> Result<()> {
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
let ctx =
|
||||
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
|
||||
|
||||
const NUM_KEYS: u64 = 1000;
|
||||
|
||||
@@ -998,7 +1000,7 @@ pub(crate) mod tests {
|
||||
|
||||
let reader = DiskBtreeReader::new(0, root_offset, disk);
|
||||
|
||||
reader.dump().await?;
|
||||
reader.dump(&ctx).await?;
|
||||
|
||||
use std::sync::Mutex;
|
||||
|
||||
@@ -1168,7 +1170,8 @@ pub(crate) mod tests {
|
||||
// Build a tree from it
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
let ctx =
|
||||
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
|
||||
|
||||
for (key, val) in disk_btree_test_data::TEST_DATA {
|
||||
writer.append(&key, val)?;
|
||||
@@ -1199,7 +1202,7 @@ pub(crate) mod tests {
|
||||
.await?;
|
||||
assert_eq!(count, disk_btree_test_data::TEST_DATA.len());
|
||||
|
||||
reader.dump().await?;
|
||||
reader.dump(&ctx).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -352,7 +352,8 @@ mod tests {
|
||||
let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
|
||||
fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id))?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
let ctx =
|
||||
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
|
||||
|
||||
Ok((conf, tenant_shard_id, timeline_id, ctx))
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -63,6 +63,8 @@ pub struct HistoricLayerCoverage<Value> {
|
||||
/// The latest state
|
||||
head: LayerCoverageTuple<Value>,
|
||||
|
||||
/// TODO: this could be an ordered vec using binary search.
|
||||
/// We push into this map everytime we add a layer, so might see some benefit
|
||||
/// All previous states
|
||||
historic: BTreeMap<u64, LayerCoverageTuple<Value>>,
|
||||
}
|
||||
@@ -419,6 +421,10 @@ pub struct BufferedHistoricLayerCoverage<Value> {
|
||||
buffer: BTreeMap<LayerKey, Option<Value>>,
|
||||
|
||||
/// All current layers. This is not used for search. Only to make rebuilds easier.
|
||||
// TODO: This map is never cleared. Rebuilds could use the post-trim last entry of
|
||||
// [`Self::historic_coverage`] instead of doubling memory usage.
|
||||
// [`Self::len`]: can require rebuild and serve from latest historic
|
||||
// [`Self::iter`]: already requires rebuild => can serve from latest historic
|
||||
layers: BTreeMap<LayerKey, Value>,
|
||||
}
|
||||
|
||||
|
||||
@@ -194,7 +194,7 @@ pub(crate) use download::{
|
||||
};
|
||||
use index::GcCompactionState;
|
||||
pub(crate) use index::LayerFileMetadata;
|
||||
use pageserver_api::models::TimelineArchivalState;
|
||||
use pageserver_api::models::{RelSizeMigration, TimelineArchivalState};
|
||||
use pageserver_api::shard::{ShardIndex, TenantShardId};
|
||||
use regex::Regex;
|
||||
use remote_storage::{
|
||||
@@ -437,9 +437,13 @@ impl RemoteTimelineClient {
|
||||
|
||||
/// Initialize the upload queue for the case where the remote storage is empty,
|
||||
/// i.e., it doesn't have an `IndexPart`.
|
||||
///
|
||||
/// `rel_size_v2_status` needs to be carried over during branching, and that's why
|
||||
/// it's passed in here.
|
||||
pub fn init_upload_queue_for_empty_remote(
|
||||
&self,
|
||||
local_metadata: &TimelineMetadata,
|
||||
rel_size_v2_status: Option<RelSizeMigration>,
|
||||
) -> anyhow::Result<()> {
|
||||
// Set the maximum number of inprogress tasks to the remote storage concurrency. There's
|
||||
// certainly no point in starting more upload tasks than this.
|
||||
@@ -449,7 +453,9 @@ impl RemoteTimelineClient {
|
||||
.as_ref()
|
||||
.map_or(0, |r| r.concurrency_limit());
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
|
||||
let initialized_queue =
|
||||
upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
|
||||
initialized_queue.dirty.rel_size_migration = rel_size_v2_status;
|
||||
self.update_remote_physical_size_gauge(None);
|
||||
info!("initialized upload queue as empty");
|
||||
Ok(())
|
||||
@@ -900,7 +906,7 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Launch an index-file upload operation in the background, setting `import_pgdata` field.
|
||||
/// Launch an index-file upload operation in the background, setting `gc_compaction_state` field.
|
||||
pub(crate) fn schedule_index_upload_for_gc_compaction_state_update(
|
||||
self: &Arc<Self>,
|
||||
gc_compaction_state: GcCompactionState,
|
||||
@@ -912,6 +918,21 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Launch an index-file upload operation in the background, setting `rel_size_v2_status` field.
|
||||
pub(crate) fn schedule_index_upload_for_rel_size_v2_status_update(
|
||||
self: &Arc<Self>,
|
||||
rel_size_v2_status: RelSizeMigration,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
upload_queue.dirty.rel_size_migration = Some(rel_size_v2_status);
|
||||
// TODO: allow this operation to bypass the validation check because we might upload the index part
|
||||
// with no layers but the flag updated. For now, we just modify the index part in memory and the next
|
||||
// upload will include the flag.
|
||||
// self.schedule_index_upload(upload_queue);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Launch an index-file upload operation in the background, if necessary.
|
||||
///
|
||||
@@ -933,6 +954,14 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Only used in the `patch_index_part` HTTP API to force trigger an index upload.
|
||||
pub fn force_schedule_index_upload(self: &Arc<Self>) -> Result<(), NotInitialized> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
self.schedule_index_upload(upload_queue);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Launch an index-file upload operation in the background (internal function)
|
||||
fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
|
||||
let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn();
|
||||
|
||||
@@ -7,6 +7,7 @@ use std::collections::HashMap;
|
||||
|
||||
use chrono::NaiveDateTime;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::RelSizeMigration;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::TimelineId;
|
||||
@@ -117,21 +118,6 @@ pub struct GcCompactionState {
|
||||
pub(crate) last_completed_lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub enum RelSizeMigration {
|
||||
/// The tenant is using the old rel_size format.
|
||||
/// Note that this enum is persisted as `Option<RelSizeMigration>` in the index part, so
|
||||
/// `None` is the same as `Some(RelSizeMigration::Legacy)`.
|
||||
Legacy,
|
||||
/// The tenant is migrating to the new rel_size format. Both old and new rel_size format are
|
||||
/// persisted in the index part. The read path will read both formats and merge them.
|
||||
Migrating,
|
||||
/// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted
|
||||
/// in the index part, and the read path will not read the old format.
|
||||
Migrated,
|
||||
}
|
||||
|
||||
impl IndexPart {
|
||||
/// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
|
||||
/// used to understand later versions.
|
||||
|
||||
@@ -491,7 +491,10 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
let remote_storage = self.remote_storage.clone();
|
||||
let conf = self.tenant_manager.get_conf();
|
||||
let tenant_shard_id = *secondary_state.get_tenant_shard_id();
|
||||
let download_ctx = self.root_ctx.attached_child();
|
||||
let download_ctx = self
|
||||
.root_ctx
|
||||
.attached_child()
|
||||
.with_scope_secondary_tenant(&tenant_shard_id);
|
||||
(RunningDownload { barrier }, Box::pin(async move {
|
||||
let _completion = completion;
|
||||
|
||||
@@ -771,6 +774,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
|
||||
// Download the layers in the heatmap
|
||||
for timeline in heatmap.timelines {
|
||||
let ctx = &ctx.with_scope_secondary_timeline(tenant_shard_id, &timeline.timeline_id);
|
||||
let timeline_state = timeline_states
|
||||
.remove(&timeline.timeline_id)
|
||||
.expect("Just populated above");
|
||||
@@ -869,8 +873,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap();
|
||||
|
||||
let layers_in_heatmap = heatmap_timeline
|
||||
.layers
|
||||
.iter()
|
||||
.hot_layers()
|
||||
.map(|l| (&l.name, l.metadata.generation))
|
||||
.collect::<HashSet<_>>();
|
||||
let layers_on_disk = timeline_state
|
||||
@@ -1015,7 +1018,8 @@ impl<'a> TenantDownloader<'a> {
|
||||
// Accumulate updates to the state
|
||||
let mut touched = Vec::new();
|
||||
|
||||
for layer in timeline.layers {
|
||||
let timeline_id = timeline.timeline_id;
|
||||
for layer in timeline.into_hot_layers() {
|
||||
if self.secondary_state.cancel.is_cancelled() {
|
||||
tracing::debug!("Cancelled -- dropping out of layer loop");
|
||||
return (Err(UpdateError::Cancelled), touched);
|
||||
@@ -1040,7 +1044,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
|
||||
match self
|
||||
.download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
|
||||
.download_layer(tenant_shard_id, &timeline_id, layer, ctx)
|
||||
.await
|
||||
{
|
||||
Ok(Some(layer)) => touched.push(layer),
|
||||
@@ -1148,7 +1152,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
|
||||
tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.hot_layers().count());
|
||||
|
||||
let (result, touched) = self
|
||||
.download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx)
|
||||
@@ -1316,11 +1320,11 @@ async fn init_timeline_state(
|
||||
// As we iterate through layers found on disk, we will look up their metadata from this map.
|
||||
// Layers not present in metadata will be discarded.
|
||||
let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
|
||||
heatmap.layers.iter().map(|l| (&l.name, l)).collect();
|
||||
heatmap.hot_layers().map(|l| (&l.name, l)).collect();
|
||||
|
||||
let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
|
||||
if let Some(last_heatmap) = last_heatmap {
|
||||
last_heatmap.layers.iter().map(|l| (&l.name, l)).collect()
|
||||
last_heatmap.hot_layers().map(|l| (&l.name, l)).collect()
|
||||
} else {
|
||||
HashMap::new()
|
||||
};
|
||||
|
||||
@@ -42,7 +42,7 @@ pub(crate) struct HeatMapTimeline {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub(crate) timeline_id: TimelineId,
|
||||
|
||||
pub(crate) layers: Vec<HeatMapLayer>,
|
||||
layers: Vec<HeatMapLayer>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
@@ -53,8 +53,10 @@ pub(crate) struct HeatMapLayer {
|
||||
|
||||
#[serde_as(as = "TimestampSeconds<i64>")]
|
||||
pub(crate) access_time: SystemTime,
|
||||
// TODO: an actual 'heat' score that would let secondary locations prioritize downloading
|
||||
// the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
|
||||
|
||||
#[serde(default)]
|
||||
pub(crate) cold: bool, // TODO: an actual 'heat' score that would let secondary locations prioritize downloading
|
||||
// the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
|
||||
}
|
||||
|
||||
impl HeatMapLayer {
|
||||
@@ -62,11 +64,13 @@ impl HeatMapLayer {
|
||||
name: LayerName,
|
||||
metadata: LayerFileMetadata,
|
||||
access_time: SystemTime,
|
||||
cold: bool,
|
||||
) -> Self {
|
||||
Self {
|
||||
name,
|
||||
metadata,
|
||||
access_time,
|
||||
cold,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -78,6 +82,18 @@ impl HeatMapTimeline {
|
||||
layers,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn into_hot_layers(self) -> impl Iterator<Item = HeatMapLayer> {
|
||||
self.layers.into_iter().filter(|l| !l.cold)
|
||||
}
|
||||
|
||||
pub(crate) fn hot_layers(&self) -> impl Iterator<Item = &HeatMapLayer> {
|
||||
self.layers.iter().filter(|l| !l.cold)
|
||||
}
|
||||
|
||||
pub(crate) fn all_layers(&self) -> impl Iterator<Item = &HeatMapLayer> {
|
||||
self.layers.iter()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct HeatMapStats {
|
||||
@@ -92,7 +108,7 @@ impl HeatMapTenant {
|
||||
layers: 0,
|
||||
};
|
||||
for timeline in &self.timelines {
|
||||
for layer in &timeline.layers {
|
||||
for layer in timeline.hot_layers() {
|
||||
stats.layers += 1;
|
||||
stats.bytes += layer.metadata.file_size;
|
||||
}
|
||||
|
||||
@@ -474,7 +474,7 @@ async fn fill_logical_sizes(
|
||||
if cached_size.is_none() {
|
||||
let timeline = Arc::clone(timeline_hash.get(&timeline_id).unwrap());
|
||||
let parallel_size_calcs = Arc::clone(limit);
|
||||
let ctx = ctx.attached_child();
|
||||
let ctx = ctx.attached_child().with_scope_timeline(&timeline);
|
||||
joinset.spawn(
|
||||
calculate_logical_size(parallel_size_calcs, timeline, lsn, cause, ctx)
|
||||
.in_current_span(),
|
||||
|
||||
@@ -40,6 +40,7 @@ use utils::sync::gate::GateGuard;
|
||||
|
||||
use self::inmemory_layer::InMemoryLayerFileId;
|
||||
use super::PageReconstructError;
|
||||
use super::layer_map::InMemoryLayerDesc;
|
||||
use super::timeline::{GetVectoredError, ReadPath};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
@@ -721,6 +722,12 @@ struct LayerToVisitId {
|
||||
lsn_floor: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash)]
|
||||
pub enum ReadableLayerWeak {
|
||||
PersistentLayer(Arc<PersistentLayerDesc>),
|
||||
InMemoryLayer(InMemoryLayerDesc),
|
||||
}
|
||||
|
||||
/// Layer wrapper for the read path. Note that it is valid
|
||||
/// to use these layers even after external operations have
|
||||
/// been performed on them (compaction, freeze, etc.).
|
||||
@@ -873,7 +880,7 @@ impl ReadableLayer {
|
||||
}
|
||||
ReadableLayer::InMemoryLayer(layer) => {
|
||||
layer
|
||||
.get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
|
||||
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1385,7 +1385,7 @@ impl DeltaLayerInner {
|
||||
block_reader,
|
||||
);
|
||||
|
||||
tree_reader.dump().await?;
|
||||
tree_reader.dump(ctx).await?;
|
||||
|
||||
let keys = self.index_entries(ctx).await?;
|
||||
|
||||
@@ -2024,6 +2024,7 @@ pub(crate) mod test {
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let ctx = &ctx.with_scope_timeline(&timeline);
|
||||
|
||||
let initdb_layer = timeline
|
||||
.layers
|
||||
@@ -2136,7 +2137,7 @@ pub(crate) mod test {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let new_layer = new_layer.download_and_keep_resident().await.unwrap();
|
||||
let new_layer = new_layer.download_and_keep_resident(ctx).await.unwrap();
|
||||
|
||||
new_layer
|
||||
.copy_delta_prefix(&mut writer, truncate_at, ctx)
|
||||
|
||||
@@ -208,7 +208,7 @@ impl ImageLayerInner {
|
||||
block_reader,
|
||||
);
|
||||
|
||||
tree_reader.dump().await?;
|
||||
tree_reader.dump(ctx).await?;
|
||||
|
||||
tree_reader
|
||||
.visit(
|
||||
|
||||
@@ -416,7 +416,7 @@ impl InMemoryLayer {
|
||||
pub(crate) async fn get_values_reconstruct_data(
|
||||
self: &Arc<InMemoryLayer>,
|
||||
keyspace: KeySpace,
|
||||
end_lsn: Lsn,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
@@ -433,8 +433,6 @@ impl InMemoryLayer {
|
||||
let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
|
||||
let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();
|
||||
|
||||
let lsn_range = self.start_lsn..end_lsn;
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
for (key, vec_map) in inner
|
||||
.index
|
||||
|
||||
@@ -324,16 +324,16 @@ impl Layer {
|
||||
reconstruct_data: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let downloaded = self
|
||||
.0
|
||||
.get_or_maybe_download(true, Some(ctx))
|
||||
.await
|
||||
.map_err(|err| match err {
|
||||
DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => {
|
||||
GetVectoredError::Cancelled
|
||||
}
|
||||
other => GetVectoredError::Other(anyhow::anyhow!(other)),
|
||||
})?;
|
||||
let downloaded =
|
||||
self.0
|
||||
.get_or_maybe_download(true, ctx)
|
||||
.await
|
||||
.map_err(|err| match err {
|
||||
DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => {
|
||||
GetVectoredError::Cancelled
|
||||
}
|
||||
other => GetVectoredError::Other(anyhow::anyhow!(other)),
|
||||
})?;
|
||||
let this = ResidentLayer {
|
||||
downloaded: downloaded.clone(),
|
||||
owner: self.clone(),
|
||||
@@ -356,8 +356,8 @@ impl Layer {
|
||||
/// Download the layer if evicted.
|
||||
///
|
||||
/// Will not error when the layer is already downloaded.
|
||||
pub(crate) async fn download(&self) -> Result<(), DownloadError> {
|
||||
self.0.get_or_maybe_download(true, None).await?;
|
||||
pub(crate) async fn download(&self, ctx: &RequestContext) -> Result<(), DownloadError> {
|
||||
self.0.get_or_maybe_download(true, ctx).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -392,8 +392,11 @@ impl Layer {
|
||||
}
|
||||
|
||||
/// Downloads if necessary and creates a guard, which will keep this layer from being evicted.
|
||||
pub(crate) async fn download_and_keep_resident(&self) -> Result<ResidentLayer, DownloadError> {
|
||||
let downloaded = self.0.get_or_maybe_download(true, None).await?;
|
||||
pub(crate) async fn download_and_keep_resident(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ResidentLayer, DownloadError> {
|
||||
let downloaded = self.0.get_or_maybe_download(true, ctx).await?;
|
||||
|
||||
Ok(ResidentLayer {
|
||||
downloaded,
|
||||
@@ -446,7 +449,7 @@ impl Layer {
|
||||
|
||||
if verbose {
|
||||
// for now, unconditionally download everything, even if that might not be wanted.
|
||||
let l = self.0.get_or_maybe_download(true, Some(ctx)).await?;
|
||||
let l = self.0.get_or_maybe_download(true, ctx).await?;
|
||||
l.dump(&self.0, ctx).await?
|
||||
}
|
||||
|
||||
@@ -945,7 +948,7 @@ impl LayerInner {
|
||||
async fn get_or_maybe_download(
|
||||
self: &Arc<Self>,
|
||||
allow_download: bool,
|
||||
ctx: Option<&RequestContext>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<DownloadedLayer>, DownloadError> {
|
||||
let (weak, permit) = {
|
||||
// get_or_init_detached can:
|
||||
@@ -1035,21 +1038,14 @@ impl LayerInner {
|
||||
return Err(DownloadError::NotFile(ft));
|
||||
}
|
||||
|
||||
if let Some(ctx) = ctx {
|
||||
self.check_expected_download(ctx)?;
|
||||
}
|
||||
self.check_expected_download(ctx)?;
|
||||
|
||||
if !allow_download {
|
||||
// this is only used from tests, but it is hard to test without the boolean
|
||||
return Err(DownloadError::DownloadRequired);
|
||||
}
|
||||
|
||||
let download_ctx = ctx
|
||||
.map(|ctx| ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download))
|
||||
.unwrap_or(RequestContext::new(
|
||||
TaskKind::LayerDownload,
|
||||
DownloadBehavior::Download,
|
||||
));
|
||||
let download_ctx = ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download);
|
||||
|
||||
async move {
|
||||
tracing::info!(%reason, "downloading on-demand");
|
||||
@@ -1567,10 +1563,10 @@ impl LayerInner {
|
||||
|
||||
self.access_stats.record_residence_event();
|
||||
|
||||
self.status.as_ref().unwrap().send_replace(Status::Evicted);
|
||||
|
||||
*self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());
|
||||
|
||||
self.status.as_ref().unwrap().send_replace(Status::Evicted);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -8,7 +8,6 @@ use utils::id::TimelineId;
|
||||
use super::failpoints::{Failpoint, FailpointKind};
|
||||
use super::*;
|
||||
use crate::context::DownloadBehavior;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::harness::{TenantHarness, test_img};
|
||||
use crate::tenant::storage_layer::{IoConcurrency, LayerVisibilityHint};
|
||||
|
||||
@@ -27,11 +26,9 @@ async fn smoke_test() {
|
||||
let h = TenantHarness::create("smoke_test").await.unwrap();
|
||||
let span = h.span();
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
let (tenant, _) = h.load().await;
|
||||
let (tenant, ctx) = h.load().await;
|
||||
let io_concurrency = IoConcurrency::spawn_for_test();
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
|
||||
|
||||
let image_layers = vec![(
|
||||
Lsn(0x40),
|
||||
vec![(
|
||||
@@ -49,12 +46,14 @@ async fn smoke_test() {
|
||||
Lsn(0x10),
|
||||
14,
|
||||
&ctx,
|
||||
Default::default(), // in-memory layers
|
||||
Default::default(),
|
||||
image_layers,
|
||||
Lsn(0x100),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let ctx = &ctx.with_scope_timeline(&timeline);
|
||||
|
||||
// Grab one of the timeline's layers to exercise in the test, and the other layer that is just
|
||||
// there to avoid the timeline being illegally empty
|
||||
@@ -93,7 +92,7 @@ async fn smoke_test() {
|
||||
controlfile_keyspace.clone(),
|
||||
Lsn(0x10)..Lsn(0x11),
|
||||
&mut data,
|
||||
&ctx,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -128,7 +127,7 @@ async fn smoke_test() {
|
||||
controlfile_keyspace.clone(),
|
||||
Lsn(0x10)..Lsn(0x11),
|
||||
&mut data,
|
||||
&ctx,
|
||||
ctx,
|
||||
)
|
||||
.instrument(download_span.clone())
|
||||
.await
|
||||
@@ -178,7 +177,7 @@ async fn smoke_test() {
|
||||
|
||||
// plain downloading is rarely needed
|
||||
layer
|
||||
.download_and_keep_resident()
|
||||
.download_and_keep_resident(ctx)
|
||||
.instrument(download_span)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -340,6 +339,7 @@ fn read_wins_pending_eviction() {
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let ctx = ctx.with_scope_timeline(&timeline);
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
@@ -379,7 +379,7 @@ fn read_wins_pending_eviction() {
|
||||
// because no actual eviction happened, we get to just reinitialize the DownloadedLayer
|
||||
layer
|
||||
.0
|
||||
.get_or_maybe_download(false, None)
|
||||
.get_or_maybe_download(false, &ctx)
|
||||
.instrument(download_span)
|
||||
.await
|
||||
.expect("should had reinitialized without downloading");
|
||||
@@ -472,6 +472,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let ctx = ctx.with_scope_timeline(&timeline);
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
@@ -514,7 +515,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
|
||||
// because no actual eviction happened, we get to just reinitialize the DownloadedLayer
|
||||
layer
|
||||
.0
|
||||
.get_or_maybe_download(false, None)
|
||||
.get_or_maybe_download(false, &ctx)
|
||||
.instrument(download_span)
|
||||
.await
|
||||
.expect("should had reinitialized without downloading");
|
||||
@@ -641,7 +642,12 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let ctx = ctx.with_scope_timeline(&timeline);
|
||||
|
||||
// This test does downloads
|
||||
let ctx = RequestContextBuilder::extend(&ctx)
|
||||
.download_behavior(DownloadBehavior::Download)
|
||||
.build();
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
@@ -674,7 +680,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
|
||||
// simulate a cancelled read which is cancelled before it gets to re-initialize
|
||||
let e = layer
|
||||
.0
|
||||
.get_or_maybe_download(false, None)
|
||||
.get_or_maybe_download(false, &ctx)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert!(
|
||||
@@ -698,7 +704,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
|
||||
// failpoint is still enabled, but it is not hit
|
||||
let e = layer
|
||||
.0
|
||||
.get_or_maybe_download(false, None)
|
||||
.get_or_maybe_download(false, &ctx)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}");
|
||||
@@ -721,6 +727,12 @@ async fn evict_and_wait_does_not_wait_for_download() {
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let ctx = ctx.with_scope_timeline(&timeline);
|
||||
|
||||
// This test does downloads
|
||||
let ctx = RequestContextBuilder::extend(&ctx)
|
||||
.download_behavior(DownloadBehavior::Download)
|
||||
.build();
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
@@ -768,7 +780,7 @@ async fn evict_and_wait_does_not_wait_for_download() {
|
||||
let mut download = std::pin::pin!(
|
||||
layer
|
||||
.0
|
||||
.get_or_maybe_download(true, None)
|
||||
.get_or_maybe_download(true, &ctx)
|
||||
.instrument(download_span)
|
||||
);
|
||||
|
||||
|
||||
@@ -289,15 +289,14 @@ fn log_compaction_error(
|
||||
) {
|
||||
use CompactionError::*;
|
||||
|
||||
use crate::pgdatadir_mapping::CollectKeySpaceError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::upload_queue::NotInitialized;
|
||||
|
||||
let level = match err {
|
||||
e if e.is_cancel() => return,
|
||||
ShuttingDown => return,
|
||||
Offload(_) => Level::ERROR,
|
||||
AlreadyRunning(_) => Level::ERROR,
|
||||
CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO,
|
||||
CollectKeySpaceError(_) => Level::ERROR,
|
||||
_ if task_cancelled => Level::INFO,
|
||||
Other(err) => {
|
||||
@@ -474,21 +473,15 @@ async fn wait_for_active_tenant(
|
||||
}
|
||||
|
||||
let mut update_rx = tenant.subscribe_for_state_updates();
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => return ControlFlow::Break(()),
|
||||
result = update_rx.changed() => if result.is_err() {
|
||||
tokio::select! {
|
||||
result = update_rx.wait_for(|s| s == &TenantState::Active) => {
|
||||
if result.is_err() {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
}
|
||||
|
||||
match &*update_rx.borrow() {
|
||||
TenantState::Active => {
|
||||
debug!("Tenant state changed to active, continuing the task loop");
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
state => debug!("Not running the task loop, tenant is not active: {state:?}"),
|
||||
}
|
||||
debug!("Tenant state changed to active, continuing the task loop");
|
||||
ControlFlow::Continue(())
|
||||
},
|
||||
_ = cancel.cancelled() => ControlFlow::Break(()),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@ use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPart
|
||||
use pageserver_api::models::{
|
||||
CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings,
|
||||
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
|
||||
InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, TimelineState,
|
||||
InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, RelSizeMigration, TimelineState,
|
||||
};
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag};
|
||||
use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId};
|
||||
@@ -99,7 +99,8 @@ use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate,
|
||||
use crate::keyspace::{KeyPartitioning, KeySpace};
|
||||
use crate::l0_flush::{self, L0FlushGlobalState};
|
||||
use crate::metrics::{
|
||||
DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics,
|
||||
DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_AMORTIZED_GLOBAL, LAYERS_PER_READ_BATCH_GLOBAL,
|
||||
LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics,
|
||||
};
|
||||
use crate::page_service::TenantManagerTypes;
|
||||
use crate::pgdatadir_mapping::{
|
||||
@@ -286,7 +287,7 @@ pub struct Timeline {
|
||||
// The LSN of gc-compaction that was last applied to this timeline.
|
||||
gc_compaction_state: ArcSwap<Option<GcCompactionState>>,
|
||||
|
||||
pub(super) metrics: TimelineMetrics,
|
||||
pub(crate) metrics: Arc<TimelineMetrics>,
|
||||
|
||||
// `Timeline` doesn't write these metrics itself, but it manages the lifetime. Code
|
||||
// in `crate::page_service` writes these metrics.
|
||||
@@ -436,12 +437,16 @@ pub struct Timeline {
|
||||
/// May host a background Tokio task which downloads all the layers from the current
|
||||
/// heatmap on demand.
|
||||
heatmap_layers_downloader: Mutex<Option<heatmap_layers_downloader::HeatmapLayersDownloader>>,
|
||||
|
||||
pub(crate) rel_size_v2_status: ArcSwapOption<RelSizeMigration>,
|
||||
}
|
||||
|
||||
pub(crate) enum PreviousHeatmap {
|
||||
Active {
|
||||
heatmap: HeatMapTimeline,
|
||||
read_at: std::time::Instant,
|
||||
// End LSN covered by the heatmap if known
|
||||
end_lsn: Option<Lsn>,
|
||||
},
|
||||
Obsolete,
|
||||
}
|
||||
@@ -1326,10 +1331,6 @@ impl Timeline {
|
||||
// (this is a requirement, not a bug). Skip updating the metric in these cases
|
||||
// to avoid infinite results.
|
||||
if !results.is_empty() {
|
||||
// Record the total number of layers visited towards each key in the batch. While some
|
||||
// layers may not intersect with a given read, and the cost of layer visits are
|
||||
// amortized across the batch, each visited layer contributes directly to the observed
|
||||
// latency for every read in the batch, which is what we care about.
|
||||
if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
|
||||
static LOG_PACER: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
|
||||
@@ -1344,9 +1345,23 @@ impl Timeline {
|
||||
});
|
||||
}
|
||||
|
||||
// Records the number of layers visited in a few different ways:
|
||||
//
|
||||
// * LAYERS_PER_READ: all layers count towards every read in the batch, because each
|
||||
// layer directly affects its observed latency.
|
||||
//
|
||||
// * LAYERS_PER_READ_BATCH: all layers count towards each batch, to get the per-batch
|
||||
// layer visits and access cost.
|
||||
//
|
||||
// * LAYERS_PER_READ_AMORTIZED: the average layer count per read, to get the amortized
|
||||
// read amplification after batching.
|
||||
let layers_visited = layers_visited as f64;
|
||||
let avg_layers_visited = layers_visited / results.len() as f64;
|
||||
LAYERS_PER_READ_BATCH_GLOBAL.observe(layers_visited);
|
||||
for _ in &results {
|
||||
self.metrics.layers_per_read.observe(layers_visited as f64);
|
||||
LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64);
|
||||
self.metrics.layers_per_read.observe(layers_visited);
|
||||
LAYERS_PER_READ_GLOBAL.observe(layers_visited);
|
||||
LAYERS_PER_READ_AMORTIZED_GLOBAL.observe(avg_layers_visited);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1864,16 +1879,25 @@ impl Timeline {
|
||||
};
|
||||
|
||||
// Signal compaction failure to avoid L0 flush stalls when it's broken.
|
||||
match result {
|
||||
match &result {
|
||||
Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed),
|
||||
Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => {
|
||||
Err(e) if e.is_cancel() => {}
|
||||
Err(CompactionError::ShuttingDown) => {
|
||||
// Covered by the `Err(e) if e.is_cancel()` branch.
|
||||
}
|
||||
Err(CompactionError::AlreadyRunning(_)) => {
|
||||
// Covered by the `Err(e) if e.is_cancel()` branch.
|
||||
}
|
||||
Err(CompactionError::Other(_)) => {
|
||||
self.compaction_failed.store(true, AtomicOrdering::Relaxed)
|
||||
}
|
||||
Err(CompactionError::CollectKeySpaceError(_)) => {
|
||||
// Cancelled errors are covered by the `Err(e) if e.is_cancel()` branch.
|
||||
self.compaction_failed.store(true, AtomicOrdering::Relaxed)
|
||||
}
|
||||
// Don't change the current value on offload failure or shutdown. We don't want to
|
||||
// abruptly stall nor resume L0 flushes in these cases.
|
||||
Err(CompactionError::Offload(_)) => {}
|
||||
Err(CompactionError::ShuttingDown) => {}
|
||||
Err(CompactionError::AlreadyRunning(_)) => {}
|
||||
};
|
||||
|
||||
result
|
||||
@@ -2188,6 +2212,7 @@ impl Timeline {
|
||||
pub(crate) async fn download_layer(
|
||||
&self,
|
||||
layer_file_name: &LayerName,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<bool>, super::storage_layer::layer::DownloadError> {
|
||||
let Some(layer) = self
|
||||
.find_layer(layer_file_name)
|
||||
@@ -2201,7 +2226,7 @@ impl Timeline {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
layer.download().await?;
|
||||
layer.download(ctx).await?;
|
||||
|
||||
Ok(Some(true))
|
||||
}
|
||||
@@ -2356,6 +2381,9 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
|
||||
}
|
||||
|
||||
/// Returns `true` if the rel_size_v2 config is enabled. NOTE: the write path and read path
|
||||
/// should look at `get_rel_size_v2_status()` to get the actual status of the timeline. It is
|
||||
/// possible that the index part persists the state while the config doesn't get persisted.
|
||||
pub(crate) fn get_rel_size_v2_enabled(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
@@ -2364,6 +2392,14 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
|
||||
}
|
||||
|
||||
pub(crate) fn get_rel_size_v2_status(&self) -> RelSizeMigration {
|
||||
self.rel_size_v2_status
|
||||
.load()
|
||||
.as_ref()
|
||||
.map(|s| s.as_ref().clone())
|
||||
.unwrap_or(RelSizeMigration::Legacy)
|
||||
}
|
||||
|
||||
fn get_compaction_upper_limit(&self) -> usize {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
@@ -2624,6 +2660,7 @@ impl Timeline {
|
||||
attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
|
||||
create_idempotency: crate::tenant::CreateTimelineIdempotency,
|
||||
gc_compaction_state: Option<GcCompactionState>,
|
||||
rel_size_v2_status: Option<RelSizeMigration>,
|
||||
cancel: CancellationToken,
|
||||
) -> Arc<Self> {
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
@@ -2648,14 +2685,14 @@ impl Timeline {
|
||||
}
|
||||
|
||||
Arc::new_cyclic(|myself| {
|
||||
let metrics = TimelineMetrics::new(
|
||||
let metrics = Arc::new(TimelineMetrics::new(
|
||||
&tenant_shard_id,
|
||||
&timeline_id,
|
||||
crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
|
||||
"mtime",
|
||||
evictions_low_residence_duration_metric_threshold,
|
||||
),
|
||||
);
|
||||
));
|
||||
let aux_file_metrics = metrics.aux_file_size_gauge.clone();
|
||||
|
||||
let mut result = Timeline {
|
||||
@@ -2782,6 +2819,8 @@ impl Timeline {
|
||||
previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap),
|
||||
|
||||
heatmap_layers_downloader: Mutex::new(None),
|
||||
|
||||
rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status),
|
||||
};
|
||||
|
||||
result.repartition_threshold =
|
||||
@@ -2837,7 +2876,7 @@ impl Timeline {
|
||||
"layer flush task",
|
||||
async move {
|
||||
let _guard = guard;
|
||||
let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
|
||||
let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error).with_scope_timeline(&self_clone);
|
||||
self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await;
|
||||
let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
|
||||
assert!(matches!(*flush_loop_state, FlushLoopState::Running{..}));
|
||||
@@ -2858,6 +2897,16 @@ impl Timeline {
|
||||
.schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state)
|
||||
}
|
||||
|
||||
pub(crate) fn update_rel_size_v2_status(
|
||||
&self,
|
||||
rel_size_v2_status: RelSizeMigration,
|
||||
) -> anyhow::Result<()> {
|
||||
self.rel_size_v2_status
|
||||
.store(Some(Arc::new(rel_size_v2_status.clone())));
|
||||
self.remote_client
|
||||
.schedule_index_upload_for_rel_size_v2_status_update(rel_size_v2_status)
|
||||
}
|
||||
|
||||
pub(crate) fn get_gc_compaction_state(&self) -> Option<GcCompactionState> {
|
||||
self.gc_compaction_state.load_full().as_ref().clone()
|
||||
}
|
||||
@@ -3560,12 +3609,16 @@ impl Timeline {
|
||||
Ok(layer)
|
||||
}
|
||||
|
||||
pub(super) fn is_previous_heatmap_active(&self) -> bool {
|
||||
self.previous_heatmap
|
||||
.load()
|
||||
.as_ref()
|
||||
.map(|prev| matches!(**prev, PreviousHeatmap::Active { .. }))
|
||||
.unwrap_or(false)
|
||||
pub(super) fn should_keep_previous_heatmap(&self, new_heatmap_end_lsn: Lsn) -> bool {
|
||||
let crnt = self.previous_heatmap.load();
|
||||
match crnt.as_deref() {
|
||||
Some(PreviousHeatmap::Active { end_lsn, .. }) => match end_lsn {
|
||||
Some(crnt_end_lsn) => *crnt_end_lsn > new_heatmap_end_lsn,
|
||||
None => true,
|
||||
},
|
||||
Some(PreviousHeatmap::Obsolete) => false,
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// The timeline heatmap is a hint to secondary locations from the primary location,
|
||||
@@ -3593,26 +3646,26 @@ impl Timeline {
|
||||
// heatamp.
|
||||
let previous_heatmap = self.previous_heatmap.load();
|
||||
let visible_non_resident = match previous_heatmap.as_deref() {
|
||||
Some(PreviousHeatmap::Active { heatmap, read_at }) => {
|
||||
Some(heatmap.layers.iter().filter_map(|hl| {
|
||||
let desc: PersistentLayerDesc = hl.name.clone().into();
|
||||
let layer = guard.try_get_from_key(&desc.key())?;
|
||||
Some(PreviousHeatmap::Active {
|
||||
heatmap, read_at, ..
|
||||
}) => Some(heatmap.all_layers().filter_map(|hl| {
|
||||
let desc: PersistentLayerDesc = hl.name.clone().into();
|
||||
let layer = guard.try_get_from_key(&desc.key())?;
|
||||
|
||||
if layer.visibility() == LayerVisibilityHint::Covered {
|
||||
return None;
|
||||
}
|
||||
if layer.visibility() == LayerVisibilityHint::Covered {
|
||||
return None;
|
||||
}
|
||||
|
||||
if layer.is_likely_resident() {
|
||||
return None;
|
||||
}
|
||||
if layer.is_likely_resident() {
|
||||
return None;
|
||||
}
|
||||
|
||||
if layer.last_evicted_at().happened_after(*read_at) {
|
||||
return None;
|
||||
}
|
||||
if layer.last_evicted_at().happened_after(*read_at) {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some((desc, hl.metadata.clone(), hl.access_time))
|
||||
}))
|
||||
}
|
||||
Some((desc, hl.metadata.clone(), hl.access_time, hl.cold))
|
||||
})),
|
||||
Some(PreviousHeatmap::Obsolete) => None,
|
||||
None => None,
|
||||
};
|
||||
@@ -3627,6 +3680,7 @@ impl Timeline {
|
||||
layer.layer_desc().clone(),
|
||||
layer.metadata(),
|
||||
last_activity_ts,
|
||||
false, // these layers are not cold
|
||||
))
|
||||
}
|
||||
LayerVisibilityHint::Covered => {
|
||||
@@ -3653,12 +3707,14 @@ impl Timeline {
|
||||
// Sort layers in order of which to download first. For a large set of layers to download, we
|
||||
// want to prioritize those layers which are most likely to still be in the resident many minutes
|
||||
// or hours later:
|
||||
// - Cold layers go last for convenience when a human inspects the heatmap.
|
||||
// - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might
|
||||
// only exist for a few minutes before being compacted into L1s.
|
||||
// - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
|
||||
// the layer is likely to be covered by an image layer during compaction.
|
||||
layers.sort_by_key(|(desc, _meta, _atime)| {
|
||||
layers.sort_by_key(|(desc, _meta, _atime, cold)| {
|
||||
std::cmp::Reverse((
|
||||
*cold,
|
||||
!LayerMap::is_l0(&desc.key_range, desc.is_delta),
|
||||
desc.lsn_range.end,
|
||||
))
|
||||
@@ -3666,7 +3722,9 @@ impl Timeline {
|
||||
|
||||
let layers = layers
|
||||
.into_iter()
|
||||
.map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime))
|
||||
.map(|(desc, meta, atime, cold)| {
|
||||
HeatMapLayer::new(desc.layer_name(), meta, atime, cold)
|
||||
})
|
||||
.collect();
|
||||
|
||||
Some(HeatMapTimeline::new(self.timeline_id, layers))
|
||||
@@ -3686,6 +3744,7 @@ impl Timeline {
|
||||
name: vl.layer_desc().layer_name(),
|
||||
metadata: vl.metadata(),
|
||||
access_time: now,
|
||||
cold: true,
|
||||
};
|
||||
heatmap_layers.push(hl);
|
||||
}
|
||||
@@ -3699,6 +3758,7 @@ impl Timeline {
|
||||
PreviousHeatmap::Active {
|
||||
heatmap,
|
||||
read_at: Instant::now(),
|
||||
end_lsn: Some(end_lsn),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3897,39 +3957,22 @@ impl Timeline {
|
||||
let guard = timeline.layers.read().await;
|
||||
let layers = guard.layer_map()?;
|
||||
|
||||
let in_memory_layer = layers.find_in_memory_layer(|l| {
|
||||
let start_lsn = l.get_lsn_range().start;
|
||||
cont_lsn > start_lsn
|
||||
});
|
||||
for range in unmapped_keyspace.ranges.iter() {
|
||||
let results = layers.range_search(range.clone(), cont_lsn);
|
||||
|
||||
match in_memory_layer {
|
||||
Some(l) => {
|
||||
let lsn_range = l.get_lsn_range().start..cont_lsn;
|
||||
fringe.update(
|
||||
ReadableLayer::InMemoryLayer(l),
|
||||
unmapped_keyspace.clone(),
|
||||
lsn_range,
|
||||
);
|
||||
}
|
||||
None => {
|
||||
for range in unmapped_keyspace.ranges.iter() {
|
||||
let results = layers.range_search(range.clone(), cont_lsn);
|
||||
|
||||
results
|
||||
.found
|
||||
.into_iter()
|
||||
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
|
||||
(
|
||||
ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
|
||||
keyspace_accum.to_keyspace(),
|
||||
lsn_floor..cont_lsn,
|
||||
)
|
||||
})
|
||||
.for_each(|(layer, keyspace, lsn_range)| {
|
||||
fringe.update(layer, keyspace, lsn_range)
|
||||
});
|
||||
}
|
||||
}
|
||||
results
|
||||
.found
|
||||
.into_iter()
|
||||
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
|
||||
(
|
||||
guard.upgrade(layer),
|
||||
keyspace_accum.to_keyspace(),
|
||||
lsn_floor..cont_lsn,
|
||||
)
|
||||
})
|
||||
.for_each(|(layer, keyspace, lsn_range)| {
|
||||
fringe.update(layer, keyspace, lsn_range)
|
||||
});
|
||||
}
|
||||
|
||||
// It's safe to drop the layer map lock after planning the next round of reads.
|
||||
@@ -4202,10 +4245,6 @@ impl Timeline {
|
||||
|
||||
// Stall flushes to backpressure if compaction can't keep up. This is propagated up
|
||||
// to WAL ingestion by having ephemeral layer rolls wait for flushes.
|
||||
//
|
||||
// NB: the compaction loop only checks `compaction_threshold` every 20 seconds, so
|
||||
// we can end up stalling before compaction even starts. Consider making it more
|
||||
// responsive (e.g. via `watch_level0_deltas`).
|
||||
if let Some(stall_threshold) = self.get_l0_flush_stall_threshold() {
|
||||
if l0_count >= stall_threshold {
|
||||
warn!(
|
||||
@@ -4693,10 +4732,7 @@ impl Timeline {
|
||||
));
|
||||
}
|
||||
|
||||
let (dense_ks, sparse_ks) = self
|
||||
.collect_keyspace(lsn, ctx)
|
||||
.await
|
||||
.map_err(CompactionError::CollectKeySpaceError)?;
|
||||
let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
|
||||
let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
|
||||
let sparse_partitioning = SparseKeyPartitioning {
|
||||
parts: vec![sparse_ks],
|
||||
@@ -5423,13 +5459,42 @@ pub(crate) enum CompactionError {
|
||||
Offload(OffloadError),
|
||||
/// Compaction cannot be done right now; page reconstruction and so on.
|
||||
#[error("Failed to collect keyspace: {0}")]
|
||||
CollectKeySpaceError(CollectKeySpaceError),
|
||||
CollectKeySpaceError(#[from] CollectKeySpaceError),
|
||||
#[error(transparent)]
|
||||
Other(anyhow::Error),
|
||||
#[error("Compaction already running: {0}")]
|
||||
AlreadyRunning(&'static str),
|
||||
}
|
||||
|
||||
impl CompactionError {
|
||||
/// Errors that can be ignored, i.e., cancel and shutdown.
|
||||
pub fn is_cancel(&self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
Self::ShuttingDown
|
||||
| Self::AlreadyRunning(_)
|
||||
| Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled)
|
||||
| Self::CollectKeySpaceError(CollectKeySpaceError::PageRead(
|
||||
PageReconstructError::Cancelled
|
||||
))
|
||||
| Self::Offload(OffloadError::Cancelled)
|
||||
)
|
||||
}
|
||||
|
||||
/// Critical errors that indicate data corruption.
|
||||
pub fn is_critical(&self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
Self::CollectKeySpaceError(
|
||||
CollectKeySpaceError::Decode(_)
|
||||
| CollectKeySpaceError::PageRead(
|
||||
PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
|
||||
)
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<OffloadError> for CompactionError {
|
||||
fn from(e: OffloadError) -> Self {
|
||||
match e {
|
||||
@@ -5439,18 +5504,6 @@ impl From<OffloadError> for CompactionError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<CollectKeySpaceError> for CompactionError {
|
||||
fn from(err: CollectKeySpaceError) -> Self {
|
||||
match err {
|
||||
CollectKeySpaceError::Cancelled
|
||||
| CollectKeySpaceError::PageRead(PageReconstructError::Cancelled) => {
|
||||
CompactionError::ShuttingDown
|
||||
}
|
||||
e => CompactionError::Other(e.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<super::upload_queue::NotInitialized> for CompactionError {
|
||||
fn from(value: super::upload_queue::NotInitialized) -> Self {
|
||||
match value {
|
||||
@@ -5534,6 +5587,14 @@ pub struct DeltaLayerTestDesc {
|
||||
pub data: Vec<(Key, Lsn, Value)>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[derive(Clone)]
|
||||
pub struct InMemoryLayerTestDesc {
|
||||
pub lsn_range: Range<Lsn>,
|
||||
pub data: Vec<(Key, Lsn, Value)>,
|
||||
pub is_open: bool,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl DeltaLayerTestDesc {
|
||||
pub fn new(lsn_range: Range<Lsn>, key_range: Range<Key>, data: Vec<(Key, Lsn, Value)>) -> Self {
|
||||
@@ -6193,6 +6254,7 @@ impl Timeline {
|
||||
pub(crate) async fn spawn_download_all_remote_layers(
|
||||
self: Arc<Self>,
|
||||
request: DownloadRemoteLayersTaskSpawnRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskInfo> {
|
||||
use pageserver_api::models::DownloadRemoteLayersTaskState;
|
||||
|
||||
@@ -6213,6 +6275,10 @@ impl Timeline {
|
||||
}
|
||||
|
||||
let self_clone = Arc::clone(&self);
|
||||
let task_ctx = ctx.detached_child(
|
||||
TaskKind::DownloadAllRemoteLayers,
|
||||
DownloadBehavior::Download,
|
||||
);
|
||||
let task_id = task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
task_mgr::TaskKind::DownloadAllRemoteLayers,
|
||||
@@ -6220,7 +6286,7 @@ impl Timeline {
|
||||
Some(self.timeline_id),
|
||||
"download all remote layers task",
|
||||
async move {
|
||||
self_clone.download_all_remote_layers(request).await;
|
||||
self_clone.download_all_remote_layers(request, &task_ctx).await;
|
||||
let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap();
|
||||
match &mut *status_guard {
|
||||
None => {
|
||||
@@ -6255,6 +6321,7 @@ impl Timeline {
|
||||
async fn download_all_remote_layers(
|
||||
self: &Arc<Self>,
|
||||
request: DownloadRemoteLayersTaskSpawnRequest,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
use pageserver_api::models::DownloadRemoteLayersTaskState;
|
||||
|
||||
@@ -6311,9 +6378,10 @@ impl Timeline {
|
||||
|
||||
let span = tracing::info_span!("download", layer = %next);
|
||||
|
||||
let ctx = ctx.attached_child();
|
||||
js.spawn(
|
||||
async move {
|
||||
let res = next.download().await;
|
||||
let res = next.download(&ctx).await;
|
||||
(next, res)
|
||||
}
|
||||
.instrument(span),
|
||||
@@ -6541,6 +6609,92 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Force create an in-memory layer and place them into the layer map.
|
||||
#[cfg(test)]
|
||||
pub(super) async fn force_create_in_memory_layer(
|
||||
self: &Arc<Timeline>,
|
||||
mut in_memory: InMemoryLayerTestDesc,
|
||||
check_start_lsn: Option<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
use utils::bin_ser::BeSer;
|
||||
|
||||
// Validate LSNs
|
||||
if let Some(check_start_lsn) = check_start_lsn {
|
||||
assert!(in_memory.lsn_range.start >= check_start_lsn);
|
||||
}
|
||||
|
||||
let last_record_lsn = self.get_last_record_lsn();
|
||||
let layer_end_lsn = if in_memory.is_open {
|
||||
in_memory
|
||||
.data
|
||||
.iter()
|
||||
.map(|(_key, lsn, _value)| lsn)
|
||||
.max()
|
||||
.cloned()
|
||||
} else {
|
||||
Some(in_memory.lsn_range.end)
|
||||
};
|
||||
|
||||
if let Some(end) = layer_end_lsn {
|
||||
assert!(
|
||||
end <= last_record_lsn,
|
||||
"advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}",
|
||||
end,
|
||||
last_record_lsn,
|
||||
);
|
||||
}
|
||||
|
||||
in_memory.data.iter().for_each(|(_key, lsn, _value)| {
|
||||
assert!(*lsn >= in_memory.lsn_range.start);
|
||||
assert!(*lsn < in_memory.lsn_range.end);
|
||||
});
|
||||
|
||||
// Build the batch
|
||||
in_memory
|
||||
.data
|
||||
.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
|
||||
|
||||
let data = in_memory
|
||||
.data
|
||||
.into_iter()
|
||||
.map(|(key, lsn, value)| {
|
||||
let value_size = value.serialized_size().unwrap() as usize;
|
||||
(key.to_compact(), lsn, value_size, value)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let batch = SerializedValueBatch::from_values(data);
|
||||
|
||||
// Create the in-memory layer and write the batch into it
|
||||
let layer = InMemoryLayer::create(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
in_memory.lsn_range.start,
|
||||
&self.gate,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
layer.put_batch(batch, ctx).await.unwrap();
|
||||
if !in_memory.is_open {
|
||||
layer.freeze(in_memory.lsn_range.end).await;
|
||||
}
|
||||
|
||||
info!("force created in-memory layer {:?}", in_memory.lsn_range);
|
||||
|
||||
// Link the layer to the layer map
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
let layer_map = guard.open_mut().unwrap();
|
||||
layer_map.force_insert_in_memory_layer(Arc::new(layer));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Return all keys at the LSN in the image layers
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn inspect_image_layers(
|
||||
@@ -6900,11 +7054,13 @@ mod tests {
|
||||
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::value::Value;
|
||||
use std::iter::Iterator;
|
||||
use tracing::Instrument;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::HeatMapTimeline;
|
||||
use crate::context::RequestContextBuilder;
|
||||
use crate::tenant::harness::{TenantHarness, test_img};
|
||||
use crate::tenant::layer_map::LayerMap;
|
||||
use crate::tenant::storage_layer::{Layer, LayerName, LayerVisibilityHint};
|
||||
@@ -6912,8 +7068,8 @@ mod tests {
|
||||
use crate::tenant::{PreviousHeatmap, Timeline};
|
||||
|
||||
fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) {
|
||||
assert_eq!(lhs.layers.len(), rhs.layers.len());
|
||||
let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter());
|
||||
assert_eq!(lhs.all_layers().count(), rhs.all_layers().count());
|
||||
let lhs_rhs = lhs.all_layers().zip(rhs.all_layers());
|
||||
for (l, r) in lhs_rhs {
|
||||
assert_eq!(l.name, r.name);
|
||||
assert_eq!(l.metadata, r.metadata);
|
||||
@@ -6972,12 +7128,14 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
14,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
delta_layers,
|
||||
image_layers,
|
||||
Lsn(0x100),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let ctx = &ctx.with_scope_timeline(&timeline);
|
||||
|
||||
// Layer visibility is an input to heatmap generation, so refresh it first
|
||||
timeline.update_layer_visibility().await.unwrap();
|
||||
@@ -6990,10 +7148,11 @@ mod tests {
|
||||
assert_eq!(heatmap.timeline_id, timeline.timeline_id);
|
||||
|
||||
// L0 should come last
|
||||
assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
|
||||
let heatmap_layers = heatmap.all_layers().collect::<Vec<_>>();
|
||||
assert_eq!(heatmap_layers.last().unwrap().name, l0_delta.layer_name());
|
||||
|
||||
let mut last_lsn = Lsn::MAX;
|
||||
for layer in &heatmap.layers {
|
||||
for layer in heatmap_layers {
|
||||
// Covered layer should be omitted
|
||||
assert!(layer.name != covered_delta.layer_name());
|
||||
|
||||
@@ -7026,6 +7185,7 @@ mod tests {
|
||||
.store(Some(Arc::new(PreviousHeatmap::Active {
|
||||
heatmap: heatmap.clone(),
|
||||
read_at: std::time::Instant::now(),
|
||||
end_lsn: None,
|
||||
})));
|
||||
|
||||
// Generate a new heatmap and assert that it contains the same layers as the old one.
|
||||
@@ -7041,8 +7201,12 @@ mod tests {
|
||||
|
||||
eprintln!("Downloading {layer} and re-generating heatmap");
|
||||
|
||||
let ctx = &RequestContextBuilder::extend(ctx)
|
||||
.download_behavior(crate::context::DownloadBehavior::Download)
|
||||
.build();
|
||||
|
||||
let _resident = layer
|
||||
.download_and_keep_resident()
|
||||
.download_and_keep_resident(ctx)
|
||||
.instrument(tracing::info_span!(
|
||||
parent: None,
|
||||
"download_layer",
|
||||
@@ -7100,6 +7264,7 @@ mod tests {
|
||||
Lsn(0x10),
|
||||
14,
|
||||
&ctx,
|
||||
Vec::new(), // in-memory layers
|
||||
delta_layers,
|
||||
image_layers,
|
||||
Lsn(0x100),
|
||||
@@ -7116,7 +7281,7 @@ mod tests {
|
||||
.expect("Infallible while timeline is not shut down");
|
||||
|
||||
// Both layers should be in the heatmap
|
||||
assert!(!heatmap.layers.is_empty());
|
||||
assert!(heatmap.all_layers().count() > 0);
|
||||
|
||||
// Now simulate a migration.
|
||||
timeline
|
||||
@@ -7124,6 +7289,7 @@ mod tests {
|
||||
.store(Some(Arc::new(PreviousHeatmap::Active {
|
||||
heatmap: heatmap.clone(),
|
||||
read_at: std::time::Instant::now(),
|
||||
end_lsn: None,
|
||||
})));
|
||||
|
||||
// Evict all the layers in the previous heatmap
|
||||
@@ -7141,7 +7307,7 @@ mod tests {
|
||||
.await
|
||||
.expect("Infallible while timeline is not shut down");
|
||||
|
||||
assert!(post_eviction_heatmap.layers.is_empty());
|
||||
assert_eq!(post_eviction_heatmap.all_layers().count(), 0);
|
||||
assert!(matches!(
|
||||
timeline.previous_heatmap.load().as_deref(),
|
||||
Some(PreviousHeatmap::Obsolete)
|
||||
|
||||
@@ -7,11 +7,20 @@
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
|
||||
use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{Context, anyhow, bail};
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::{
|
||||
CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder,
|
||||
GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration,
|
||||
Timeline,
|
||||
};
|
||||
|
||||
use anyhow::{Context, anyhow};
|
||||
use bytes::Bytes;
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use futures::FutureExt;
|
||||
use itertools::Itertools;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE;
|
||||
@@ -31,15 +40,8 @@ use utils::critical;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::{
|
||||
CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder,
|
||||
GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError,
|
||||
RecordedDuration, Timeline,
|
||||
};
|
||||
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache;
|
||||
use crate::pgdatadir_mapping::CollectKeySpaceError;
|
||||
use crate::statvfs::Statvfs;
|
||||
use crate::tenant::checks::check_valid_layermap;
|
||||
use crate::tenant::gc_block::GcBlock;
|
||||
@@ -213,30 +215,39 @@ impl GcCompactionQueue {
|
||||
}
|
||||
|
||||
/// Trigger an auto compaction.
|
||||
pub async fn trigger_auto_compaction(&self, timeline: &Arc<Timeline>) {
|
||||
pub async fn trigger_auto_compaction(
|
||||
&self,
|
||||
timeline: &Arc<Timeline>,
|
||||
) -> Result<(), CompactionError> {
|
||||
let GcCompactionCombinedSettings {
|
||||
gc_compaction_enabled,
|
||||
gc_compaction_initial_threshold_kb,
|
||||
gc_compaction_ratio_percent,
|
||||
} = timeline.get_gc_compaction_settings();
|
||||
if !gc_compaction_enabled {
|
||||
return;
|
||||
return Ok(());
|
||||
}
|
||||
if self.remaining_jobs_num() > 0 {
|
||||
// Only schedule auto compaction when the queue is empty
|
||||
return;
|
||||
return Ok(());
|
||||
}
|
||||
if timeline.ancestor_timeline().is_some() {
|
||||
// Do not trigger auto compaction for child timelines. We haven't tested
|
||||
// it enough in staging yet.
|
||||
return;
|
||||
return Ok(());
|
||||
}
|
||||
if timeline.get_gc_compaction_watermark() == Lsn::INVALID {
|
||||
// If the gc watermark is not set, we don't need to trigger auto compaction.
|
||||
// This check is the same as in `gc_compaction_split_jobs` but we don't log
|
||||
// here and we can also skip the computation of the trigger condition earlier.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let Ok(permit) = CONCURRENT_GC_COMPACTION_TASKS.clone().try_acquire_owned() else {
|
||||
// Only allow one compaction run at a time. TODO: As we do `try_acquire_owned`, we cannot ensure
|
||||
// the fairness of the lock across timelines. We should listen for both `acquire` and `l0_compaction_trigger`
|
||||
// to ensure the fairness while avoid starving other tasks.
|
||||
return;
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let gc_compaction_state = timeline.get_gc_compaction_state();
|
||||
@@ -246,7 +257,7 @@ impl GcCompactionQueue {
|
||||
|
||||
let layers = {
|
||||
let guard = timeline.layers.read().await;
|
||||
let layer_map = guard.layer_map().unwrap();
|
||||
let layer_map = guard.layer_map()?;
|
||||
layer_map.iter_historic_layers().collect_vec()
|
||||
};
|
||||
let mut l2_size: u64 = 0;
|
||||
@@ -318,11 +329,12 @@ impl GcCompactionQueue {
|
||||
l1_size, l2_size, l2_lsn, gc_cutoff
|
||||
);
|
||||
} else {
|
||||
info!(
|
||||
debug!(
|
||||
"did not trigger auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}",
|
||||
l1_size, l2_size, l2_lsn, gc_cutoff
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Notify the caller the job has finished and unblock GC.
|
||||
@@ -353,8 +365,7 @@ impl GcCompactionQueue {
|
||||
GcCompactJob::from_compact_options(options.clone()),
|
||||
options.sub_compaction_max_job_size_mb,
|
||||
)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
.await?;
|
||||
if jobs.is_empty() {
|
||||
info!("no jobs to run, skipping scheduled compaction task");
|
||||
self.notify_and_unblock(id);
|
||||
@@ -433,6 +444,7 @@ impl GcCompactionQueue {
|
||||
));
|
||||
};
|
||||
let has_pending_tasks;
|
||||
let mut yield_for_l0 = false;
|
||||
let Some((id, item)) = ({
|
||||
let mut guard = self.inner.lock().unwrap();
|
||||
if let Some((id, item)) = guard.queued.pop_front() {
|
||||
@@ -444,7 +456,7 @@ impl GcCompactionQueue {
|
||||
None
|
||||
}
|
||||
}) else {
|
||||
self.trigger_auto_compaction(timeline).await;
|
||||
self.trigger_auto_compaction(timeline).await?;
|
||||
// Always yield after triggering auto-compaction. Gc-compaction is a low-priority task and we
|
||||
// have not implemented preemption mechanism yet. We always want to yield it to more important
|
||||
// tasks if there is one.
|
||||
@@ -482,13 +494,23 @@ impl GcCompactionQueue {
|
||||
let mut guard = self.inner.lock().unwrap();
|
||||
guard.guards.entry(id).or_default().gc_guard = Some(gc_guard);
|
||||
}
|
||||
let _ = timeline.compact_with_options(cancel, options, ctx).await?;
|
||||
let compaction_result =
|
||||
timeline.compact_with_options(cancel, options, ctx).await?;
|
||||
self.notify_and_unblock(id);
|
||||
if compaction_result == CompactionOutcome::YieldForL0 {
|
||||
yield_for_l0 = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
GcCompactionQueueItem::SubCompactionJob(options) => {
|
||||
// TODO: error handling, clear the queue if any task fails?
|
||||
let _ = timeline.compact_with_options(cancel, options, ctx).await?;
|
||||
let compaction_result = timeline.compact_with_options(cancel, options, ctx).await?;
|
||||
if compaction_result == CompactionOutcome::YieldForL0 {
|
||||
// We will permenantly give up a task if we yield for L0 compaction: the preempted subcompaction job won't be running
|
||||
// again. This ensures that we don't keep doing duplicated work within gc-compaction. Not directly returning here because
|
||||
// we need to clean things up before returning from the function.
|
||||
yield_for_l0 = true;
|
||||
}
|
||||
}
|
||||
GcCompactionQueueItem::Notify(id, l2_lsn) => {
|
||||
self.notify_and_unblock(id);
|
||||
@@ -517,7 +539,10 @@ impl GcCompactionQueue {
|
||||
let mut guard = self.inner.lock().unwrap();
|
||||
guard.running = None;
|
||||
}
|
||||
Ok(if has_pending_tasks {
|
||||
Ok(if yield_for_l0 {
|
||||
tracing::info!("give up gc-compaction: yield for L0 compaction");
|
||||
CompactionOutcome::YieldForL0
|
||||
} else if has_pending_tasks {
|
||||
CompactionOutcome::Pending
|
||||
} else {
|
||||
CompactionOutcome::Done
|
||||
@@ -716,17 +741,41 @@ struct CompactionStatisticsNumSize {
|
||||
|
||||
#[derive(Debug, Serialize, Default)]
|
||||
pub struct CompactionStatistics {
|
||||
/// Delta layer visited (maybe compressed, physical size)
|
||||
delta_layer_visited: CompactionStatisticsNumSize,
|
||||
/// Image layer visited (maybe compressed, physical size)
|
||||
image_layer_visited: CompactionStatisticsNumSize,
|
||||
/// Delta layer produced (maybe compressed, physical size)
|
||||
delta_layer_produced: CompactionStatisticsNumSize,
|
||||
/// Image layer produced (maybe compressed, physical size)
|
||||
image_layer_produced: CompactionStatisticsNumSize,
|
||||
num_delta_layer_discarded: usize,
|
||||
num_image_layer_discarded: usize,
|
||||
/// Delta layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer)
|
||||
delta_layer_discarded: CompactionStatisticsNumSize,
|
||||
/// Image layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer)
|
||||
image_layer_discarded: CompactionStatisticsNumSize,
|
||||
num_unique_keys_visited: usize,
|
||||
/// Delta visited (uncompressed, original size)
|
||||
wal_keys_visited: CompactionStatisticsNumSize,
|
||||
/// Image visited (uncompressed, original size)
|
||||
image_keys_visited: CompactionStatisticsNumSize,
|
||||
/// Delta produced (uncompressed, original size)
|
||||
wal_produced: CompactionStatisticsNumSize,
|
||||
/// Image produced (uncompressed, original size)
|
||||
image_produced: CompactionStatisticsNumSize,
|
||||
|
||||
// Time spent in each phase
|
||||
time_acquire_lock_secs: f64,
|
||||
time_analyze_secs: f64,
|
||||
time_download_layer_secs: f64,
|
||||
time_main_loop_secs: f64,
|
||||
time_final_phase_secs: f64,
|
||||
time_total_secs: f64,
|
||||
|
||||
// Summary
|
||||
/// Ratio of the key-value size before/after gc-compaction.
|
||||
uncompressed_size_ratio: f64,
|
||||
/// Ratio of the physical size before/after gc-compaction.
|
||||
physical_size_ratio: f64,
|
||||
}
|
||||
|
||||
impl CompactionStatistics {
|
||||
@@ -776,11 +825,13 @@ impl CompactionStatistics {
|
||||
self.image_produced.num += 1;
|
||||
self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
|
||||
}
|
||||
fn discard_delta_layer(&mut self) {
|
||||
self.num_delta_layer_discarded += 1;
|
||||
fn discard_delta_layer(&mut self, original_size: u64) {
|
||||
self.delta_layer_discarded.num += 1;
|
||||
self.delta_layer_discarded.size += original_size;
|
||||
}
|
||||
fn discard_image_layer(&mut self) {
|
||||
self.num_image_layer_discarded += 1;
|
||||
fn discard_image_layer(&mut self, original_size: u64) {
|
||||
self.image_layer_discarded.num += 1;
|
||||
self.image_layer_discarded.size += original_size;
|
||||
}
|
||||
fn produce_delta_layer(&mut self, size: u64) {
|
||||
self.delta_layer_produced.num += 1;
|
||||
@@ -790,6 +841,19 @@ impl CompactionStatistics {
|
||||
self.image_layer_produced.num += 1;
|
||||
self.image_layer_produced.size += size;
|
||||
}
|
||||
fn finalize(&mut self) {
|
||||
let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size;
|
||||
let produced_key_value_size = self.image_produced.size + self.wal_produced.size;
|
||||
self.uncompressed_size_ratio =
|
||||
original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0); // avoid div by 0
|
||||
let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size;
|
||||
let produced_physical_size = self.image_layer_produced.size
|
||||
+ self.delta_layer_produced.size
|
||||
+ self.image_layer_discarded.size
|
||||
+ self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate
|
||||
self.physical_size_ratio =
|
||||
original_physical_size as f64 / (produced_physical_size as f64 + 1.0); // avoid div by 0
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
|
||||
@@ -822,9 +886,7 @@ impl Timeline {
|
||||
.flags
|
||||
.contains(CompactFlags::EnhancedGcBottomMostCompaction)
|
||||
{
|
||||
self.compact_with_gc(cancel, options, ctx)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
self.compact_with_gc(cancel, options, ctx).await?;
|
||||
return Ok(CompactionOutcome::Done);
|
||||
}
|
||||
|
||||
@@ -976,18 +1038,12 @@ impl Timeline {
|
||||
|
||||
// Suppress errors when cancelled.
|
||||
Err(_) if self.cancel.is_cancelled() => {}
|
||||
Err(CompactionError::ShuttingDown) => {}
|
||||
Err(CompactionError::CollectKeySpaceError(CollectKeySpaceError::Cancelled)) => {}
|
||||
Err(err) if err.is_cancel() => {}
|
||||
|
||||
// Alert on critical errors that indicate data corruption.
|
||||
Err(
|
||||
err @ CompactionError::CollectKeySpaceError(
|
||||
CollectKeySpaceError::Decode(_)
|
||||
| CollectKeySpaceError::PageRead(
|
||||
PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
|
||||
),
|
||||
),
|
||||
) => critical!("could not compact, repartitioning keyspace failed: {err:?}"),
|
||||
Err(err) if err.is_critical() => {
|
||||
critical!("could not compact, repartitioning keyspace failed: {err:?}");
|
||||
}
|
||||
|
||||
// Log other errors. No partitioning? This is normal, if the timeline was just created
|
||||
// as an empty timeline. Also in unit tests, when we use the timeline as a simple
|
||||
@@ -1161,7 +1217,7 @@ impl Timeline {
|
||||
// - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are:
|
||||
// - GC, which at worst witnesses us "undelete" a layer that they just deleted.
|
||||
// - ingestion, which only inserts layers, therefore cannot collide with us.
|
||||
let resident = layer.download_and_keep_resident().await?;
|
||||
let resident = layer.download_and_keep_resident(ctx).await?;
|
||||
|
||||
let keys_written = resident
|
||||
.filter(&self.shard_identity, &mut image_layer_writer, ctx)
|
||||
@@ -1389,14 +1445,14 @@ impl Timeline {
|
||||
|
||||
let mut fully_compacted = true;
|
||||
|
||||
deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
|
||||
deltas_to_compact.push(first_level0_delta.download_and_keep_resident(ctx).await?);
|
||||
for l in level0_deltas_iter {
|
||||
let lsn_range = &l.layer_desc().lsn_range;
|
||||
|
||||
if lsn_range.start != prev_lsn_end {
|
||||
break;
|
||||
}
|
||||
deltas_to_compact.push(l.download_and_keep_resident().await?);
|
||||
deltas_to_compact.push(l.download_and_keep_resident(ctx).await?);
|
||||
deltas_to_compact_bytes += l.metadata().file_size;
|
||||
prev_lsn_end = lsn_range.end;
|
||||
|
||||
@@ -2350,12 +2406,19 @@ impl Timeline {
|
||||
async fn check_compaction_space(
|
||||
self: &Arc<Self>,
|
||||
layer_selection: &[Layer],
|
||||
) -> anyhow::Result<()> {
|
||||
let available_space = self.check_available_space().await?;
|
||||
) -> Result<(), CompactionError> {
|
||||
let available_space = self
|
||||
.check_available_space()
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
let mut remote_layer_size = 0;
|
||||
let mut all_layer_size = 0;
|
||||
for layer in layer_selection {
|
||||
let needs_download = layer.needs_download().await?;
|
||||
let needs_download = layer
|
||||
.needs_download()
|
||||
.await
|
||||
.context("failed to check if layer needs download")
|
||||
.map_err(CompactionError::Other)?;
|
||||
if needs_download.is_some() {
|
||||
remote_layer_size += layer.layer_desc().file_size;
|
||||
}
|
||||
@@ -2364,14 +2427,14 @@ impl Timeline {
|
||||
let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */
|
||||
if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space
|
||||
{
|
||||
return Err(anyhow!(
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}",
|
||||
available_space,
|
||||
allocated_space,
|
||||
all_layer_size,
|
||||
remote_layer_size,
|
||||
all_layer_size + remote_layer_size
|
||||
));
|
||||
)));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -2402,7 +2465,7 @@ impl Timeline {
|
||||
self: &Arc<Self>,
|
||||
job: GcCompactJob,
|
||||
sub_compaction_max_job_size_mb: Option<u64>,
|
||||
) -> anyhow::Result<Vec<GcCompactJob>> {
|
||||
) -> Result<Vec<GcCompactJob>, CompactionError> {
|
||||
let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX {
|
||||
job.compact_lsn_range.end
|
||||
} else {
|
||||
@@ -2553,7 +2616,7 @@ impl Timeline {
|
||||
cancel: &CancellationToken,
|
||||
options: CompactOptions,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<CompactionOutcome, CompactionError> {
|
||||
let sub_compaction = options.sub_compaction;
|
||||
let job = GcCompactJob::from_compact_options(options.clone());
|
||||
if sub_compaction {
|
||||
@@ -2575,7 +2638,7 @@ impl Timeline {
|
||||
if jobs_len == 0 {
|
||||
info!("no jobs to run, skipping gc bottom-most compaction");
|
||||
}
|
||||
return Ok(());
|
||||
return Ok(CompactionOutcome::Done);
|
||||
}
|
||||
self.compact_with_gc_inner(cancel, job, ctx).await
|
||||
}
|
||||
@@ -2585,19 +2648,24 @@ impl Timeline {
|
||||
cancel: &CancellationToken,
|
||||
job: GcCompactJob,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<CompactionOutcome, CompactionError> {
|
||||
// Block other compaction/GC tasks from running for now. GC-compaction could run along
|
||||
// with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
|
||||
// Note that we already acquired the compaction lock when the outer `compact` function gets called.
|
||||
|
||||
let timer = Instant::now();
|
||||
let begin_timer = timer;
|
||||
|
||||
let gc_lock = async {
|
||||
tokio::select! {
|
||||
guard = self.gc_lock.lock() => Ok(guard),
|
||||
// TODO: refactor to CompactionError to correctly pass cancelled error
|
||||
_ = cancel.cancelled() => Err(anyhow!("cancelled")),
|
||||
_ = cancel.cancelled() => Err(CompactionError::ShuttingDown),
|
||||
}
|
||||
};
|
||||
|
||||
let time_acquire_lock = timer.elapsed();
|
||||
let timer = Instant::now();
|
||||
|
||||
let gc_lock = crate::timed(
|
||||
gc_lock,
|
||||
"acquires gc lock",
|
||||
@@ -2649,7 +2717,7 @@ impl Timeline {
|
||||
tracing::warn!(
|
||||
"no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"
|
||||
);
|
||||
return Ok(());
|
||||
return Ok(CompactionOutcome::Skipped);
|
||||
}
|
||||
real_gc_cutoff
|
||||
} else {
|
||||
@@ -2687,7 +2755,7 @@ impl Timeline {
|
||||
"no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}",
|
||||
gc_cutoff
|
||||
);
|
||||
return Ok(());
|
||||
return Ok(CompactionOutcome::Done);
|
||||
};
|
||||
// Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below
|
||||
// the min_layer_lsn computed as below will be filtered out and the data will be accessed using the normal read path, as if
|
||||
@@ -2708,7 +2776,7 @@ impl Timeline {
|
||||
"no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}",
|
||||
compact_lsn_range.end
|
||||
);
|
||||
return Ok(());
|
||||
return Ok(CompactionOutcome::Done);
|
||||
};
|
||||
// Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
|
||||
// layers to compact.
|
||||
@@ -2734,7 +2802,7 @@ impl Timeline {
|
||||
"no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}",
|
||||
gc_cutoff, compact_key_range.start, compact_key_range.end
|
||||
);
|
||||
return Ok(());
|
||||
return Ok(CompactionOutcome::Done);
|
||||
}
|
||||
retain_lsns_below_horizon.sort();
|
||||
GcCompactionJobDescription {
|
||||
@@ -2787,6 +2855,9 @@ impl Timeline {
|
||||
has_data_below,
|
||||
);
|
||||
|
||||
let time_analyze = timer.elapsed();
|
||||
let timer = Instant::now();
|
||||
|
||||
for layer in &job_desc.selected_layers {
|
||||
debug!("read layer: {}", layer.layer_desc().key());
|
||||
}
|
||||
@@ -2815,10 +2886,10 @@ impl Timeline {
|
||||
.map(|layer| layer.layer_desc().layer_name())
|
||||
.collect_vec();
|
||||
if let Some(err) = check_valid_layermap(&layer_names) {
|
||||
bail!(
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss",
|
||||
err
|
||||
);
|
||||
)));
|
||||
}
|
||||
// The maximum LSN we are processing in this compaction loop
|
||||
let end_lsn = job_desc
|
||||
@@ -2833,11 +2904,33 @@ impl Timeline {
|
||||
let mut total_downloaded_size = 0;
|
||||
let mut total_layer_size = 0;
|
||||
for layer in &job_desc.selected_layers {
|
||||
if layer.needs_download().await?.is_some() {
|
||||
if layer
|
||||
.needs_download()
|
||||
.await
|
||||
.context("failed to check if layer needs download")
|
||||
.map_err(CompactionError::Other)?
|
||||
.is_some()
|
||||
{
|
||||
total_downloaded_size += layer.layer_desc().file_size;
|
||||
}
|
||||
total_layer_size += layer.layer_desc().file_size;
|
||||
let resident_layer = layer.download_and_keep_resident().await?;
|
||||
if cancel.is_cancelled() {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
let should_yield = self
|
||||
.l0_compaction_trigger
|
||||
.notified()
|
||||
.now_or_never()
|
||||
.is_some();
|
||||
if should_yield {
|
||||
tracing::info!("preempt gc-compaction when downloading layers: too many L0 layers");
|
||||
return Ok(CompactionOutcome::YieldForL0);
|
||||
}
|
||||
let resident_layer = layer
|
||||
.download_and_keep_resident(ctx)
|
||||
.await
|
||||
.context("failed to download and keep resident layer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
downloaded_layers.push(resident_layer);
|
||||
}
|
||||
info!(
|
||||
@@ -2848,19 +2941,36 @@ impl Timeline {
|
||||
);
|
||||
for resident_layer in &downloaded_layers {
|
||||
if resident_layer.layer_desc().is_delta() {
|
||||
let layer = resident_layer.get_as_delta(ctx).await?;
|
||||
let layer = resident_layer
|
||||
.get_as_delta(ctx)
|
||||
.await
|
||||
.context("failed to get delta layer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
delta_layers.push(layer);
|
||||
} else {
|
||||
let layer = resident_layer.get_as_image(ctx).await?;
|
||||
let layer = resident_layer
|
||||
.get_as_image(ctx)
|
||||
.await
|
||||
.context("failed to get image layer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
image_layers.push(layer);
|
||||
}
|
||||
}
|
||||
let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?;
|
||||
let (dense_ks, sparse_ks) = self
|
||||
.collect_gc_compaction_keyspace()
|
||||
.await
|
||||
.context("failed to collect gc compaction keyspace")
|
||||
.map_err(CompactionError::Other)?;
|
||||
let mut merge_iter = FilterIterator::create(
|
||||
MergeIterator::create(&delta_layers, &image_layers, ctx),
|
||||
dense_ks,
|
||||
sparse_ks,
|
||||
)?;
|
||||
)
|
||||
.context("failed to create filter iterator")
|
||||
.map_err(CompactionError::Other)?;
|
||||
|
||||
let time_download_layer = timer.elapsed();
|
||||
let timer = Instant::now();
|
||||
|
||||
// Step 2: Produce images+deltas.
|
||||
let mut accumulated_values = Vec::new();
|
||||
@@ -2880,7 +2990,9 @@ impl Timeline {
|
||||
&self.gate,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
.await
|
||||
.context("failed to create image layer writer")
|
||||
.map_err(CompactionError::Other)?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
@@ -2893,7 +3005,9 @@ impl Timeline {
|
||||
lowest_retain_lsn..end_lsn,
|
||||
self.get_compaction_target_size(),
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.context("failed to create delta layer writer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
|
||||
#[derive(Default)]
|
||||
struct RewritingLayers {
|
||||
@@ -2933,9 +3047,28 @@ impl Timeline {
|
||||
// the key and LSN range are determined. However, to keep things simple here, we still
|
||||
// create this writer, and discard the writer in the end.
|
||||
|
||||
while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? {
|
||||
let mut keys_processed = 0;
|
||||
|
||||
while let Some(((key, lsn, val), desc)) = merge_iter
|
||||
.next_with_trace()
|
||||
.await
|
||||
.context("failed to get next key-value pair")
|
||||
.map_err(CompactionError::Other)?
|
||||
{
|
||||
if cancel.is_cancelled() {
|
||||
return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
keys_processed += 1;
|
||||
if keys_processed % 1000 == 0 {
|
||||
let should_yield = self
|
||||
.l0_compaction_trigger
|
||||
.notified()
|
||||
.now_or_never()
|
||||
.is_some();
|
||||
if should_yield {
|
||||
tracing::info!("preempt gc-compaction in the main loop: too many L0 layers");
|
||||
return Ok(CompactionOutcome::YieldForL0);
|
||||
}
|
||||
}
|
||||
if self.shard_identity.is_key_disposable(&key) {
|
||||
// If this shard does not need to store this key, simply skip it.
|
||||
@@ -2967,7 +3100,9 @@ impl Timeline {
|
||||
&self.gate,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
.await
|
||||
.context("failed to create delta layer writer")
|
||||
.map_err(CompactionError::Other)?,
|
||||
);
|
||||
}
|
||||
rewriter.before.as_mut().unwrap()
|
||||
@@ -2983,14 +3118,20 @@ impl Timeline {
|
||||
&self.gate,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
.await
|
||||
.context("failed to create delta layer writer")
|
||||
.map_err(CompactionError::Other)?,
|
||||
);
|
||||
}
|
||||
rewriter.after.as_mut().unwrap()
|
||||
} else {
|
||||
unreachable!()
|
||||
};
|
||||
rewriter.put_value(key, lsn, val, ctx).await?;
|
||||
rewriter
|
||||
.put_value(key, lsn, val, ctx)
|
||||
.await
|
||||
.context("failed to put value")
|
||||
.map_err(CompactionError::Other)?;
|
||||
continue;
|
||||
}
|
||||
match val {
|
||||
@@ -3013,9 +3154,13 @@ impl Timeline {
|
||||
&job_desc.retain_lsns_below_horizon,
|
||||
COMPACTION_DELTA_THRESHOLD,
|
||||
get_ancestor_image(self, *last_key, ctx, has_data_below, lowest_retain_lsn)
|
||||
.await?,
|
||||
.await
|
||||
.context("failed to get ancestor image")
|
||||
.map_err(CompactionError::Other)?,
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.context("failed to generate key retention")
|
||||
.map_err(CompactionError::Other)?;
|
||||
retention
|
||||
.pipe_to(
|
||||
*last_key,
|
||||
@@ -3025,7 +3170,9 @@ impl Timeline {
|
||||
&self.gate,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.context("failed to pipe to delta layer writer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
accumulated_values.clear();
|
||||
*last_key = key;
|
||||
accumulated_values.push((key, lsn, val));
|
||||
@@ -3043,9 +3190,14 @@ impl Timeline {
|
||||
job_desc.gc_cutoff,
|
||||
&job_desc.retain_lsns_below_horizon,
|
||||
COMPACTION_DELTA_THRESHOLD,
|
||||
get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn).await?,
|
||||
get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn)
|
||||
.await
|
||||
.context("failed to get ancestor image")
|
||||
.map_err(CompactionError::Other)?,
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.context("failed to generate key retention")
|
||||
.map_err(CompactionError::Other)?;
|
||||
retention
|
||||
.pipe_to(
|
||||
last_key,
|
||||
@@ -3055,21 +3207,36 @@ impl Timeline {
|
||||
&self.gate,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.context("failed to pipe to delta layer writer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
// end: move the above part to the loop body
|
||||
|
||||
let time_main_loop = timer.elapsed();
|
||||
let timer = Instant::now();
|
||||
|
||||
let mut rewrote_delta_layers = Vec::new();
|
||||
for (key, writers) in delta_layer_rewriters {
|
||||
if let Some(delta_writer_before) = writers.before {
|
||||
let (desc, path) = delta_writer_before
|
||||
.finish(job_desc.compaction_key_range.start, ctx)
|
||||
.await?;
|
||||
let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
|
||||
.await
|
||||
.context("failed to finish delta layer writer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
let layer = Layer::finish_creating(self.conf, self, desc, &path)
|
||||
.context("failed to finish creating delta layer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
rewrote_delta_layers.push(layer);
|
||||
}
|
||||
if let Some(delta_writer_after) = writers.after {
|
||||
let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?;
|
||||
let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
|
||||
let (desc, path) = delta_writer_after
|
||||
.finish(key.key_range.end, ctx)
|
||||
.await
|
||||
.context("failed to finish delta layer writer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
let layer = Layer::finish_creating(self.conf, self, desc, &path)
|
||||
.context("failed to finish creating delta layer")
|
||||
.map_err(CompactionError::Other)?;
|
||||
rewrote_delta_layers.push(layer);
|
||||
}
|
||||
}
|
||||
@@ -3084,7 +3251,9 @@ impl Timeline {
|
||||
let end_key = job_desc.compaction_key_range.end;
|
||||
writer
|
||||
.finish_with_discard_fn(self, ctx, end_key, discard)
|
||||
.await?
|
||||
.await
|
||||
.context("failed to finish image layer writer")
|
||||
.map_err(CompactionError::Other)?
|
||||
} else {
|
||||
drop(writer);
|
||||
Vec::new()
|
||||
@@ -3096,7 +3265,9 @@ impl Timeline {
|
||||
let produced_delta_layers = if !dry_run {
|
||||
delta_layer_writer
|
||||
.finish_with_discard_fn(self, ctx, discard)
|
||||
.await?
|
||||
.await
|
||||
.context("failed to finish delta layer writer")
|
||||
.map_err(CompactionError::Other)?
|
||||
} else {
|
||||
drop(delta_layer_writer);
|
||||
Vec::new()
|
||||
@@ -3108,6 +3279,13 @@ impl Timeline {
|
||||
let mut keep_layers = HashSet::new();
|
||||
let produced_delta_layers_len = produced_delta_layers.len();
|
||||
let produced_image_layers_len = produced_image_layers.len();
|
||||
|
||||
let layer_selection_by_key = job_desc
|
||||
.selected_layers
|
||||
.iter()
|
||||
.map(|l| (l.layer_desc().key(), l.layer_desc().clone()))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
for action in produced_delta_layers {
|
||||
match action {
|
||||
BatchWriterResult::Produced(layer) => {
|
||||
@@ -3121,8 +3299,16 @@ impl Timeline {
|
||||
if cfg!(debug_assertions) {
|
||||
info!("discarded delta layer: {}", l);
|
||||
}
|
||||
if let Some(layer_desc) = layer_selection_by_key.get(&l) {
|
||||
stat.discard_delta_layer(layer_desc.file_size());
|
||||
} else {
|
||||
tracing::warn!(
|
||||
"discarded delta layer not in layer_selection: {}, produced a layer outside of the compaction key range?",
|
||||
l
|
||||
);
|
||||
stat.discard_delta_layer(0);
|
||||
}
|
||||
keep_layers.insert(l);
|
||||
stat.discard_delta_layer();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3131,6 +3317,9 @@ impl Timeline {
|
||||
"produced rewritten delta layer: {}",
|
||||
layer.layer_desc().key()
|
||||
);
|
||||
// For now, we include rewritten delta layer size in the "produce_delta_layer". We could
|
||||
// make it a separate statistics in the future.
|
||||
stat.produce_delta_layer(layer.layer_desc().file_size());
|
||||
}
|
||||
compact_to.extend(rewrote_delta_layers);
|
||||
for action in produced_image_layers {
|
||||
@@ -3142,8 +3331,16 @@ impl Timeline {
|
||||
}
|
||||
BatchWriterResult::Discarded(l) => {
|
||||
debug!("discarded image layer: {}", l);
|
||||
if let Some(layer_desc) = layer_selection_by_key.get(&l) {
|
||||
stat.discard_image_layer(layer_desc.file_size());
|
||||
} else {
|
||||
tracing::warn!(
|
||||
"discarded image layer not in layer_selection: {}, produced a layer outside of the compaction key range?",
|
||||
l
|
||||
);
|
||||
stat.discard_image_layer(0);
|
||||
}
|
||||
keep_layers.insert(l);
|
||||
stat.discard_image_layer();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3176,7 +3373,9 @@ impl Timeline {
|
||||
&layer.layer_desc().key_range,
|
||||
&job_desc.compaction_key_range,
|
||||
) {
|
||||
bail!("violated constraint: image layer outside of compaction key range");
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"violated constraint: image layer outside of compaction key range"
|
||||
)));
|
||||
}
|
||||
if !fully_contains(
|
||||
&job_desc.compaction_key_range,
|
||||
@@ -3189,13 +3388,25 @@ impl Timeline {
|
||||
|
||||
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
|
||||
|
||||
let time_final_phase = timer.elapsed();
|
||||
|
||||
stat.time_final_phase_secs = time_final_phase.as_secs_f64();
|
||||
stat.time_main_loop_secs = time_main_loop.as_secs_f64();
|
||||
stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64();
|
||||
stat.time_download_layer_secs = time_download_layer.as_secs_f64();
|
||||
stat.time_analyze_secs = time_analyze.as_secs_f64();
|
||||
stat.time_total_secs = begin_timer.elapsed().as_secs_f64();
|
||||
stat.finalize();
|
||||
|
||||
info!(
|
||||
"gc-compaction statistics: {}",
|
||||
serde_json::to_string(&stat)?
|
||||
serde_json::to_string(&stat)
|
||||
.context("failed to serialize gc-compaction statistics")
|
||||
.map_err(CompactionError::Other)?
|
||||
);
|
||||
|
||||
if dry_run {
|
||||
return Ok(());
|
||||
return Ok(CompactionOutcome::Done);
|
||||
}
|
||||
|
||||
info!(
|
||||
@@ -3230,10 +3441,10 @@ impl Timeline {
|
||||
// the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are
|
||||
// in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails.
|
||||
if let Some(err) = check_valid_layermap(&final_layers) {
|
||||
bail!(
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss",
|
||||
err
|
||||
);
|
||||
)));
|
||||
}
|
||||
|
||||
// Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only
|
||||
@@ -3285,7 +3496,9 @@ impl Timeline {
|
||||
// find_gc_cutoffs will try accessing things below the cutoff. TODO: ideally, this should
|
||||
// be batched into `schedule_compaction_update`.
|
||||
let disk_consistent_lsn = self.disk_consistent_lsn.load();
|
||||
self.schedule_uploads(disk_consistent_lsn, None)?;
|
||||
self.schedule_uploads(disk_consistent_lsn, None)
|
||||
.context("failed to schedule uploads")
|
||||
.map_err(CompactionError::Other)?;
|
||||
// If a layer gets rewritten throughout gc-compaction, we need to keep that layer only in `compact_to` instead
|
||||
// of `compact_from`.
|
||||
let compact_from = {
|
||||
@@ -3312,7 +3525,7 @@ impl Timeline {
|
||||
|
||||
drop(gc_lock);
|
||||
|
||||
Ok(())
|
||||
Ok(CompactionOutcome::Done)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3418,6 +3631,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
|
||||
async fn downcast_delta_layer(
|
||||
&self,
|
||||
layer: &OwnArc<PersistentLayerDesc>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Option<ResidentDeltaLayer>> {
|
||||
// this is a lot more complex than a simple downcast...
|
||||
if layer.is_delta() {
|
||||
@@ -3425,7 +3639,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
|
||||
let guard = self.timeline.layers.read().await;
|
||||
guard.get_from_desc(layer)
|
||||
};
|
||||
let result = l.download_and_keep_resident().await?;
|
||||
let result = l.download_and_keep_resident(ctx).await?;
|
||||
|
||||
Ok(Some(ResidentDeltaLayer(result)))
|
||||
} else {
|
||||
|
||||
@@ -11,6 +11,7 @@ use utils::id::TimelineId;
|
||||
use utils::{crashsafe, fs_ext, pausable_failpoint};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::remote_timeline_client::{
|
||||
@@ -291,10 +292,11 @@ impl DeleteTimelineFlow {
|
||||
timeline_id: TimelineId,
|
||||
local_metadata: &TimelineMetadata,
|
||||
remote_client: RemoteTimelineClient,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
|
||||
// RemoteTimelineClient is the only functioning part.
|
||||
let timeline = tenant
|
||||
let (timeline, _timeline_ctx) = tenant
|
||||
.create_timeline_struct(
|
||||
timeline_id,
|
||||
local_metadata,
|
||||
@@ -306,6 +308,8 @@ impl DeleteTimelineFlow {
|
||||
CreateTimelineCause::Delete,
|
||||
crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here
|
||||
None, // doesn't matter what we put here
|
||||
None, // doesn't matter what we put here
|
||||
ctx,
|
||||
)
|
||||
.context("create_timeline_struct")?;
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ use utils::completion;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::sync::gate::GateError;
|
||||
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::{FlushLayerError, Timeline};
|
||||
@@ -363,14 +364,25 @@ pub(super) async fn prepare(
|
||||
|
||||
let mut tasks = tokio::task::JoinSet::new();
|
||||
let limiter = Arc::new(Semaphore::new(options.copy_concurrency.get()));
|
||||
let cancel_eval = CancellationToken::new();
|
||||
|
||||
for adopted in rest_of_historic {
|
||||
let limiter = limiter.clone();
|
||||
let timeline = detached.clone();
|
||||
let cancel_eval = cancel_eval.clone();
|
||||
|
||||
tasks.spawn(
|
||||
async move {
|
||||
let _permit = limiter.acquire().await;
|
||||
let _permit = tokio::select! {
|
||||
permit = limiter.acquire() => {
|
||||
permit
|
||||
}
|
||||
// Wait for the cancellation here instead of letting the entire task be cancelled.
|
||||
// Cancellations are racy in that they might leave layers on disk.
|
||||
_ = cancel_eval.cancelled() => {
|
||||
Err(Error::ShuttingDown)?
|
||||
}
|
||||
};
|
||||
let (owned, did_hardlink) = remote_copy(
|
||||
&adopted,
|
||||
&timeline,
|
||||
@@ -386,7 +398,22 @@ pub(super) async fn prepare(
|
||||
);
|
||||
}
|
||||
|
||||
fn delete_layers(timeline: &Timeline, layers: Vec<Layer>) -> Result<(), Error> {
|
||||
// We are deleting layers, so we must hold the gate
|
||||
let _gate = timeline.gate.enter().map_err(|e| match e {
|
||||
GateError::GateClosed => Error::ShuttingDown,
|
||||
})?;
|
||||
{
|
||||
layers.into_iter().for_each(|l: Layer| {
|
||||
l.delete_on_drop();
|
||||
std::mem::drop(l);
|
||||
});
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
let mut should_fsync = false;
|
||||
let mut first_err = None;
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
match res {
|
||||
Ok(Ok((owned, did_hardlink))) => {
|
||||
@@ -395,13 +422,24 @@ pub(super) async fn prepare(
|
||||
}
|
||||
new_layers.push(owned);
|
||||
}
|
||||
|
||||
// Don't stop the evaluation on errors, so that we get the full set of hardlinked layers to delete.
|
||||
Ok(Err(failed)) => {
|
||||
return Err(failed);
|
||||
cancel_eval.cancel();
|
||||
first_err.get_or_insert(failed);
|
||||
}
|
||||
Err(je) => {
|
||||
cancel_eval.cancel();
|
||||
first_err.get_or_insert(Error::Prepare(je.into()));
|
||||
}
|
||||
Err(je) => return Err(Error::Prepare(je.into())),
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(failed) = first_err {
|
||||
delete_layers(detached, new_layers)?;
|
||||
return Err(failed);
|
||||
}
|
||||
|
||||
// fsync directory again if we hardlinked something
|
||||
if should_fsync {
|
||||
fsync_timeline_dir(detached, ctx).await;
|
||||
@@ -592,7 +630,7 @@ async fn copy_lsn_prefix(
|
||||
.with_context(|| format!("prepare to copy lsn prefix of ancestors {layer}"))
|
||||
.map_err(Error::Prepare)?;
|
||||
|
||||
let resident = layer.download_and_keep_resident().await.map_err(|e| {
|
||||
let resident = layer.download_and_keep_resident(ctx).await.map_err(|e| {
|
||||
if e.is_cancelled() {
|
||||
Error::ShuttingDown
|
||||
} else {
|
||||
@@ -650,6 +688,11 @@ async fn remote_copy(
|
||||
let conf = adoptee.conf;
|
||||
let file_name = adopted.layer_desc().layer_name();
|
||||
|
||||
// We don't want to shut the timeline down during this operation because we do `delete_on_drop` below
|
||||
let _gate = adoptee.gate.enter().map_err(|e| match e {
|
||||
GateError::GateClosed => Error::ShuttingDown,
|
||||
})?;
|
||||
|
||||
// depending if Layer::keep_resident, do a hardlink
|
||||
let did_hardlink;
|
||||
let owned = if let Some(adopted_resident) = adopted.keep_resident().await {
|
||||
@@ -661,8 +704,32 @@ async fn remote_copy(
|
||||
&file_name,
|
||||
&metadata.generation,
|
||||
);
|
||||
std::fs::hard_link(adopted_path, &adoptee_path)
|
||||
.map_err(|e| Error::launder(e.into(), Error::Prepare))?;
|
||||
|
||||
match std::fs::hard_link(adopted_path, &adoptee_path) {
|
||||
Ok(()) => {}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
|
||||
// In theory we should not get into this situation as we are doing cleanups of the layer file after errors.
|
||||
// However, we don't do cleanups for errors past `prepare`, so there is the slight chance to get to this branch.
|
||||
|
||||
// Double check that the file is orphan (probably from an earlier attempt), then delete it
|
||||
let key = file_name.clone().into();
|
||||
if adoptee.layers.read().await.contains_key(&key) {
|
||||
// We are supposed to filter out such cases before coming to this function
|
||||
return Err(Error::Prepare(anyhow::anyhow!(
|
||||
"layer file {file_name} already present and inside layer map"
|
||||
)));
|
||||
}
|
||||
tracing::info!("Deleting orphan layer file to make way for hard linking");
|
||||
// Delete orphan layer file and try again, to ensure this layer has a well understood source
|
||||
std::fs::remove_file(adopted_path)
|
||||
.map_err(|e| Error::launder(e.into(), Error::Prepare))?;
|
||||
std::fs::hard_link(adopted_path, &adoptee_path)
|
||||
.map_err(|e| Error::launder(e.into(), Error::Prepare))?;
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(Error::launder(e.into(), Error::Prepare));
|
||||
}
|
||||
};
|
||||
did_hardlink = true;
|
||||
Layer::for_resident(conf, adoptee, adoptee_path, file_name, metadata).drop_eviction_guard()
|
||||
} else {
|
||||
@@ -670,12 +737,21 @@ async fn remote_copy(
|
||||
Layer::for_evicted(conf, adoptee, file_name, metadata)
|
||||
};
|
||||
|
||||
let layer = adoptee
|
||||
let layer = match adoptee
|
||||
.remote_client
|
||||
.copy_timeline_layer(adopted, &owned, cancel)
|
||||
.await
|
||||
.map(move |()| owned)
|
||||
.map_err(|e| Error::launder(e, Error::Prepare))?;
|
||||
{
|
||||
Ok(()) => owned,
|
||||
Err(e) => {
|
||||
{
|
||||
// Clean up the layer so that on a retry we don't get errors that the file already exists
|
||||
owned.delete_on_drop();
|
||||
std::mem::drop(owned);
|
||||
}
|
||||
return Err(Error::launder(e, Error::Prepare));
|
||||
}
|
||||
};
|
||||
|
||||
Ok((layer, did_hardlink))
|
||||
}
|
||||
|
||||
@@ -93,7 +93,8 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn);
|
||||
let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn)
|
||||
.with_scope_timeline(&self);
|
||||
loop {
|
||||
let policy = self.get_eviction_policy();
|
||||
let cf = self
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//! An efficient way to keep the timeline gate open without preventing
|
||||
//! timeline shutdown for longer than a single call to a timeline method.
|
||||
//! A cache for [`crate::tenant::mgr`]+`Tenant::get_timeline`+`Timeline::gate.enter()`.
|
||||
//!
|
||||
//! # Motivation
|
||||
//!
|
||||
@@ -19,27 +18,32 @@
|
||||
//! we hold the Timeline gate open while we're invoking the method on the
|
||||
//! Timeline object.
|
||||
//!
|
||||
//! However, we want to avoid the overhead of entering the gate for every
|
||||
//! method invocation.
|
||||
//!
|
||||
//! Further, for shard routing, we want to avoid calling the tenant manager to
|
||||
//! resolve the shard for every request. Instead, we want to cache the
|
||||
//! routing result so we can bypass the tenant manager for all subsequent requests
|
||||
//! that get routed to that shard.
|
||||
//! We want to avoid the overhead of doing, for each incoming request,
|
||||
//! - tenant manager lookup (global rwlock + btreemap lookup for shard routing)
|
||||
//! - cloning the `Arc<Timeline>` out of the tenant manager so we can
|
||||
//! release the mgr rwlock before doing any request processing work
|
||||
//! - re-entering the Timeline gate for each Timeline method invocation.
|
||||
//!
|
||||
//! Regardless of how we accomplish the above, it should not
|
||||
//! prevent the Timeline from shutting down promptly.
|
||||
//!
|
||||
//!
|
||||
//! # Design
|
||||
//!
|
||||
//! ## Data Structures
|
||||
//!
|
||||
//! There are three user-facing data structures:
|
||||
//! There are two concepts expressed as associated types in the `Types` trait:
|
||||
//! - `TenantManager`: the thing that performs the expensive work. It produces
|
||||
//! a `Timeline` object, which is the other associated type.
|
||||
//! - `Timeline`: the item that we cache for fast (TenantTimelineId,ShardSelector) lookup.
|
||||
//!
|
||||
//! There are three user-facing data structures exposed by this module:
|
||||
//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
|
||||
//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
|
||||
//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
|
||||
//! - `Handle`: a smart pointer that derefs to the Types::Timeline.
|
||||
//! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows
|
||||
//! trying to ugprade back to a `Handle`, guaranteeing it's the same `Timeline` *object*.
|
||||
//! trying to ugprade back to a `Handle`. If successful, a re-upgraded Handle will always
|
||||
//! point to the same cached `Types::Timeline`. Upgrades never invoke the `TenantManager`.
|
||||
//!
|
||||
//! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`.
|
||||
//! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`.
|
||||
@@ -64,11 +68,14 @@
|
||||
//!
|
||||
//! To dispatch a request, the page service connection calls `Cache::get`.
|
||||
//!
|
||||
//! A cache miss means we consult the tenant manager for shard routing,
|
||||
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and store it in the the
|
||||
//! `Arc<Mutex<HandleInner>>>`. A weak ref is stored in the `Cache`
|
||||
//! A cache miss means we call Types::TenantManager::resolve for shard routing,
|
||||
//! cloning the `Arc<Timeline>` out of it, and entering the gate. The result of
|
||||
//! resolve() is the object we want to cache, and return `Handle`s to for subseqent `Cache::get` calls.
|
||||
//!
|
||||
//! We wrap the object returned from resolve() in an `Arc` and store that inside the
|
||||
//! `Arc<Mutex<HandleInner>>>`. A weak ref to the HandleInner is stored in the `Cache`
|
||||
//! and a strong ref in the `PerTimelineState`.
|
||||
//! A strong ref is returned wrapped in a `Handle`.
|
||||
//! Another strong ref is returned wrapped in a `Handle`.
|
||||
//!
|
||||
//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
|
||||
//! and find the weak ref in the cache.
|
||||
@@ -78,51 +85,51 @@
|
||||
//! While a request is batching, the `Handle` is downgraded to a `WeakHandle`.
|
||||
//! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle`
|
||||
//! and the request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
|
||||
//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
|
||||
//! It then drops the `Handle`, and thus the `Arc<Mutex<HandleInner>>` inside it.
|
||||
//!
|
||||
//! # Performance
|
||||
//!
|
||||
//! Remember from the introductory section:
|
||||
//!
|
||||
//! > However, we want to avoid the overhead of entering the gate for every
|
||||
//! > method invocation.
|
||||
//! > We want to avoid the overhead of doing, for each incoming request,
|
||||
//! > - tenant manager lookup (global rwlock + btreemap lookup for shard routing)
|
||||
//! > - cloning the `Arc<Timeline>` out of the tenant manager so we can
|
||||
//! > release the mgr rwlock before doing any request processing work
|
||||
//! > - re-entering the Timeline gate for each Timeline method invocation.
|
||||
//!
|
||||
//! Why do we want to avoid that?
|
||||
//! Because the gate is a shared location in memory and entering it involves
|
||||
//! bumping refcounts, which leads to cache contention if done frequently
|
||||
//! from multiple cores in parallel.
|
||||
//! All of these boil down to some state that is either globally shared among all shards
|
||||
//! or state shared among all tasks that serve a particular timeline.
|
||||
//! It is either protected by RwLock or manipulated via atomics.
|
||||
//! Even atomics are costly when shared across multiple cores.
|
||||
//! So, we want to avoid any permanent need for coordination between page_service tasks.
|
||||
//!
|
||||
//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`.
|
||||
//! That `Arc` is private to the `HandleInner` and hence to the connection.
|
||||
//! The solution is to add indirection: we wrap the Types::Timeline object that is
|
||||
//! returned by Types::TenantManager into an Arc that is rivate to the `HandleInner`
|
||||
//! and hence to the single Cache / page_service connection.
|
||||
//! (Review the "Data Structures" section if that is unclear to you.)
|
||||
//!
|
||||
//! A `WeakHandle` is a weak ref to the `HandleInner`.
|
||||
//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and
|
||||
//! further acquire an additional strong ref to the `Arc<GateGuard>` inside it.
|
||||
//! Again, this manipulation of ref counts is is cheap because `Arc` is private to the connection.
|
||||
//!
|
||||
//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc<GateGuard>`.
|
||||
//! Again, this is cheap because the `Arc` is private to the connection.
|
||||
//! When upgrading a `WeakHandle`, we upgrade its weak to a strong ref (of the `Mutex<HandleInner>`),
|
||||
//! lock the mutex, take out a clone of the `Arc<Types::Timeline>`, and drop the Mutex.
|
||||
//! The Mutex is not contended because it is private to the connection.
|
||||
//! And again, the `Arc<Types::Timeline>` clone is cheap because that wrapper
|
||||
//! Arc's refcounts are private to the connection.
|
||||
//!
|
||||
//! Downgrading drops these two Arcs, which again, manipulates refcounts that are private to the connection.
|
||||
//!
|
||||
//! In addition to the GateGuard, we need to provide `Deref<Target=Timeline>` impl.
|
||||
//! For this, both `Handle` need infallible access to an `Arc<Timeline>`.
|
||||
//! We could clone the `Arc<Timeline>` when upgrading a `WeakHandle`, but that would cause contention
|
||||
//! on the shared memory location that trakcs the refcount of the `Arc<Timeline>`.
|
||||
//! Instead, we wrap the `Arc<Timeline>` into another `Arc`.
|
||||
//! so that we can clone it cheaply when upgrading a `WeakHandle`.
|
||||
//!
|
||||
//! # Shutdown
|
||||
//!
|
||||
//! The attentive reader may have noticed the following reference cycle around the `Arc<Timeline>`:
|
||||
//!
|
||||
//! ```text
|
||||
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline
|
||||
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> Timeline
|
||||
//! ```
|
||||
//!
|
||||
//! Further, there is this cycle:
|
||||
//!
|
||||
//! ```text
|
||||
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline
|
||||
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> GateGuard --keepalive--> Timeline
|
||||
//! ```
|
||||
//!
|
||||
//! The former cycle is a memory leak if not broken.
|
||||
@@ -135,9 +142,12 @@
|
||||
//! - Timeline shutdown (=> `PerTimelineState::shutdown`)
|
||||
//! - Connection shutdown (=> dropping the `Cache`).
|
||||
//!
|
||||
//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to
|
||||
//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the
|
||||
//! `Arc<GateGuard>`.
|
||||
//! Both transition the `HandleInner` from [`HandleInner::Open`] to
|
||||
//! [`HandleInner::ShutDown`], which drops the only long-lived
|
||||
//! `Arc<Types::Timeline>`. Once the last short-lived Arc<Types::Timeline>
|
||||
//! is dropped, the `Types::Timeline` gets dropped and thereby
|
||||
//! the `GateGuard` and the `Arc<Timeline>` that it stores,
|
||||
//! thereby breaking both cycles.
|
||||
//!
|
||||
//! `PerTimelineState::shutdown` drops all the `HandleInners` it contains,
|
||||
//! thereby breaking the cycle.
|
||||
@@ -216,7 +226,7 @@ use crate::tenant::mgr::ShardSelector;
|
||||
pub(crate) trait Types: Sized + std::fmt::Debug {
|
||||
type TenantManagerError: Sized + std::fmt::Debug;
|
||||
type TenantManager: TenantManager<Self> + Sized;
|
||||
type Timeline: ArcTimeline<Self> + Sized;
|
||||
type Timeline: Timeline<Self> + Sized;
|
||||
}
|
||||
|
||||
/// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
|
||||
@@ -261,20 +271,15 @@ pub(crate) struct ShardTimelineId {
|
||||
|
||||
/// See module-level comment.
|
||||
pub(crate) struct Handle<T: Types> {
|
||||
timeline: Arc<T::Timeline>,
|
||||
#[allow(dead_code)] // the field exists to keep the gate open
|
||||
gate_guard: Arc<utils::sync::gate::GateGuard>,
|
||||
inner: Arc<Mutex<HandleInner<T>>>,
|
||||
open: Arc<T::Timeline>,
|
||||
}
|
||||
pub(crate) struct WeakHandle<T: Types> {
|
||||
inner: Weak<Mutex<HandleInner<T>>>,
|
||||
}
|
||||
|
||||
enum HandleInner<T: Types> {
|
||||
KeepingTimelineGateOpen {
|
||||
#[allow(dead_code)]
|
||||
gate_guard: Arc<utils::sync::gate::GateGuard>,
|
||||
timeline: Arc<T::Timeline>,
|
||||
},
|
||||
Open(Arc<T::Timeline>),
|
||||
ShutDown,
|
||||
}
|
||||
|
||||
@@ -307,8 +312,7 @@ pub(crate) trait TenantManager<T: Types> {
|
||||
}
|
||||
|
||||
/// Abstract view of an [`Arc<Timeline>`], for testability.
|
||||
pub(crate) trait ArcTimeline<T: Types>: Clone {
|
||||
fn gate(&self) -> &utils::sync::gate::Gate;
|
||||
pub(crate) trait Timeline<T: Types> {
|
||||
fn shard_timeline_id(&self) -> ShardTimelineId;
|
||||
fn get_shard_identity(&self) -> &ShardIdentity;
|
||||
fn per_timeline_state(&self) -> &PerTimelineState<T>;
|
||||
@@ -318,7 +322,6 @@ pub(crate) trait ArcTimeline<T: Types>: Clone {
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum GetError<T: Types> {
|
||||
TenantManager(T::TenantManagerError),
|
||||
TimelineGateClosed,
|
||||
PerTimelineStateShutDown,
|
||||
}
|
||||
|
||||
@@ -434,21 +437,9 @@ impl<T: Types> Cache<T> {
|
||||
}
|
||||
|
||||
trace!("creating new HandleInner");
|
||||
let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen {
|
||||
gate_guard: Arc::new(
|
||||
// this enter() is expensive in production code because
|
||||
// it hits the global Arc<Timeline>::gate refcounts
|
||||
match timeline.gate().enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(_) => {
|
||||
return Err(GetError::TimelineGateClosed);
|
||||
}
|
||||
},
|
||||
),
|
||||
// this clone is expensive in production code because
|
||||
// it hits the global Arc<Timeline>::clone refcounts
|
||||
timeline: Arc::new(timeline.clone()),
|
||||
}));
|
||||
let timeline = Arc::new(timeline);
|
||||
let handle_inner_arc =
|
||||
Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline))));
|
||||
let handle_weak = WeakHandle {
|
||||
inner: Arc::downgrade(&handle_inner_arc),
|
||||
};
|
||||
@@ -503,18 +494,10 @@ impl<T: Types> WeakHandle<T> {
|
||||
};
|
||||
let lock_guard = inner.lock().expect("poisoned");
|
||||
match &*lock_guard {
|
||||
HandleInner::KeepingTimelineGateOpen {
|
||||
timeline,
|
||||
gate_guard,
|
||||
} => {
|
||||
let gate_guard = Arc::clone(gate_guard);
|
||||
let timeline = Arc::clone(timeline);
|
||||
HandleInner::Open(open) => {
|
||||
let open = Arc::clone(open);
|
||||
drop(lock_guard);
|
||||
Ok(Handle {
|
||||
timeline,
|
||||
gate_guard,
|
||||
inner,
|
||||
})
|
||||
Ok(Handle { open, inner })
|
||||
}
|
||||
HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown),
|
||||
}
|
||||
@@ -528,7 +511,7 @@ impl<T: Types> WeakHandle<T> {
|
||||
impl<T: Types> std::ops::Deref for Handle<T> {
|
||||
type Target = T::Timeline;
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.timeline
|
||||
&self.open
|
||||
}
|
||||
}
|
||||
|
||||
@@ -545,7 +528,7 @@ impl<T: Types> PerTimelineState<T> {
|
||||
/// to the [`Types::Timeline`] that embeds this per-timeline state.
|
||||
/// Even if [`TenantManager::resolve`] would still resolve to it.
|
||||
///
|
||||
/// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive.
|
||||
/// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`Types::Timeline`] alive.
|
||||
/// That's ok because they're short-lived. See module-level comment for details.
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
pub(super) fn shutdown(&self) {
|
||||
@@ -611,7 +594,7 @@ impl<T: Types> Drop for Cache<T> {
|
||||
impl<T: Types> HandleInner<T> {
|
||||
fn shutdown(&mut self) -> Option<Arc<T::Timeline>> {
|
||||
match std::mem::replace(self, HandleInner::ShutDown) {
|
||||
HandleInner::KeepingTimelineGateOpen { timeline, .. } => Some(timeline),
|
||||
HandleInner::Open(timeline) => Some(timeline),
|
||||
HandleInner::ShutDown => {
|
||||
// Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown
|
||||
// may do it concurrently, but locking rules disallow holding per-timeline-state lock and
|
||||
@@ -631,6 +614,7 @@ mod tests {
|
||||
use pageserver_api::reltag::RelTag;
|
||||
use pageserver_api::shard::ShardStripeSize;
|
||||
use utils::shard::ShardCount;
|
||||
use utils::sync::gate::GateGuard;
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -641,7 +625,7 @@ mod tests {
|
||||
impl Types for TestTypes {
|
||||
type TenantManagerError = anyhow::Error;
|
||||
type TenantManager = StubManager;
|
||||
type Timeline = Arc<StubTimeline>;
|
||||
type Timeline = Entered;
|
||||
}
|
||||
|
||||
struct StubManager {
|
||||
@@ -656,17 +640,19 @@ mod tests {
|
||||
myself: Weak<StubTimeline>,
|
||||
}
|
||||
|
||||
struct Entered {
|
||||
timeline: Arc<StubTimeline>,
|
||||
#[allow(dead_code)] // it's stored here to keep the gate open
|
||||
gate_guard: Arc<GateGuard>,
|
||||
}
|
||||
|
||||
impl StubTimeline {
|
||||
fn getpage(&self) {
|
||||
// do nothing
|
||||
}
|
||||
}
|
||||
|
||||
impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
|
||||
fn gate(&self) -> &utils::sync::gate::Gate {
|
||||
&self.gate
|
||||
}
|
||||
|
||||
impl Timeline<TestTypes> for Entered {
|
||||
fn shard_timeline_id(&self) -> ShardTimelineId {
|
||||
ShardTimelineId {
|
||||
shard_index: self.shard.shard_index(),
|
||||
@@ -688,20 +674,34 @@ mod tests {
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
) -> anyhow::Result<Arc<StubTimeline>> {
|
||||
) -> anyhow::Result<Entered> {
|
||||
for timeline in &self.shards {
|
||||
if timeline.id == timeline_id {
|
||||
let enter_gate = || {
|
||||
let gate_guard = timeline.gate.enter()?;
|
||||
let gate_guard = Arc::new(gate_guard);
|
||||
anyhow::Ok(gate_guard)
|
||||
};
|
||||
match &shard_selector {
|
||||
ShardSelector::Zero if timeline.shard.is_shard_zero() => {
|
||||
return Ok(Arc::clone(timeline));
|
||||
return Ok(Entered {
|
||||
timeline: Arc::clone(timeline),
|
||||
gate_guard: enter_gate()?,
|
||||
});
|
||||
}
|
||||
ShardSelector::Zero => continue,
|
||||
ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
|
||||
return Ok(Arc::clone(timeline));
|
||||
return Ok(Entered {
|
||||
timeline: Arc::clone(timeline),
|
||||
gate_guard: enter_gate()?,
|
||||
});
|
||||
}
|
||||
ShardSelector::Page(_) => continue,
|
||||
ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
|
||||
return Ok(Arc::clone(timeline));
|
||||
return Ok(Entered {
|
||||
timeline: Arc::clone(timeline),
|
||||
gate_guard: enter_gate()?,
|
||||
});
|
||||
}
|
||||
ShardSelector::Known(_) => continue,
|
||||
}
|
||||
@@ -711,6 +711,13 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Deref for Entered {
|
||||
type Target = StubTimeline;
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.timeline
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn test_timeline_shutdown() {
|
||||
crate::tenant::harness::setup_logging();
|
||||
@@ -1038,7 +1045,6 @@ mod tests {
|
||||
let key = DBDIR_KEY;
|
||||
|
||||
// Simulate 10 connections that's opened, used, and closed
|
||||
let mut used_handles = vec![];
|
||||
for _ in 0..10 {
|
||||
let mut cache = Cache::<TestTypes>::default();
|
||||
let handle = {
|
||||
@@ -1050,7 +1056,6 @@ mod tests {
|
||||
handle
|
||||
};
|
||||
handle.getpage();
|
||||
used_handles.push(Arc::downgrade(&handle.timeline));
|
||||
}
|
||||
|
||||
// No handles exist, thus gates are closed and don't require shutdown.
|
||||
|
||||
@@ -10,6 +10,8 @@ use http_utils::error::ApiError;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::sync::gate::Gate;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
|
||||
use super::Timeline;
|
||||
|
||||
// This status is not strictly necessary now, but gives us a nice place
|
||||
@@ -30,6 +32,8 @@ impl HeatmapLayersDownloader {
|
||||
fn new(
|
||||
timeline: Arc<Timeline>,
|
||||
concurrency: usize,
|
||||
recurse: bool,
|
||||
ctx: RequestContext,
|
||||
) -> Result<HeatmapLayersDownloader, ApiError> {
|
||||
let tl_guard = timeline.gate.enter().map_err(|_| ApiError::Cancelled)?;
|
||||
|
||||
@@ -57,12 +61,13 @@ impl HeatmapLayersDownloader {
|
||||
|
||||
tracing::info!(
|
||||
resident_size=%timeline.resident_physical_size(),
|
||||
heatmap_layers=%heatmap.layers.len(),
|
||||
heatmap_layers=%heatmap.all_layers().count(),
|
||||
"Starting heatmap layers download"
|
||||
);
|
||||
|
||||
let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map(
|
||||
let stream = futures::stream::iter(heatmap.all_layers().cloned().filter_map(
|
||||
|layer| {
|
||||
let ctx = ctx.attached_child();
|
||||
let tl = timeline.clone();
|
||||
let dl_guard = match downloads_guard.enter() {
|
||||
Ok(g) => g,
|
||||
@@ -75,7 +80,7 @@ impl HeatmapLayersDownloader {
|
||||
Some(async move {
|
||||
let _dl_guard = dl_guard;
|
||||
|
||||
let res = tl.download_layer(&layer.name).await;
|
||||
let res = tl.download_layer(&layer.name, &ctx).await;
|
||||
if let Err(err) = res {
|
||||
if !err.is_cancelled() {
|
||||
tracing::warn!(layer=%layer.name,"Failed to download heatmap layer: {err}")
|
||||
@@ -94,6 +99,20 @@ impl HeatmapLayersDownloader {
|
||||
},
|
||||
_ = cancel.cancelled() => {
|
||||
tracing::info!("Heatmap layers download cancelled");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if recurse {
|
||||
if let Some(ancestor) = timeline.ancestor_timeline() {
|
||||
let ctx = ctx.attached_child();
|
||||
let res =
|
||||
ancestor.start_heatmap_layers_download(concurrency, recurse, &ctx);
|
||||
if let Err(err) = res {
|
||||
tracing::info!(
|
||||
"Failed to start heatmap layers download for ancestor: {err}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -136,13 +155,20 @@ impl HeatmapLayersDownloader {
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
pub(crate) async fn start_heatmap_layers_download(
|
||||
pub(crate) fn start_heatmap_layers_download(
|
||||
self: &Arc<Self>,
|
||||
concurrency: usize,
|
||||
recurse: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), ApiError> {
|
||||
let mut locked = self.heatmap_layers_downloader.lock().unwrap();
|
||||
if locked.as_ref().map(|dl| dl.is_complete()).unwrap_or(true) {
|
||||
let dl = HeatmapLayersDownloader::new(self.clone(), concurrency)?;
|
||||
let dl = HeatmapLayersDownloader::new(
|
||||
self.clone(),
|
||||
concurrency,
|
||||
recurse,
|
||||
ctx.attached_child(),
|
||||
)?;
|
||||
*locked = Some(dl);
|
||||
Ok(())
|
||||
} else {
|
||||
|
||||
@@ -8,14 +8,14 @@ use tracing::trace;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::{AtomicLsn, Lsn};
|
||||
|
||||
use super::TimelineWriterState;
|
||||
use super::{ReadableLayer, TimelineWriterState};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::TimelineMetrics;
|
||||
use crate::tenant::layer_map::{BatchedUpdates, LayerMap};
|
||||
use crate::tenant::storage_layer::{
|
||||
AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc,
|
||||
PersistentLayerKey, ResidentLayer,
|
||||
PersistentLayerKey, ReadableLayerWeak, ResidentLayer,
|
||||
};
|
||||
|
||||
/// Provides semantic APIs to manipulate the layer map.
|
||||
@@ -37,6 +37,21 @@ impl Default for LayerManager {
|
||||
}
|
||||
|
||||
impl LayerManager {
|
||||
pub(crate) fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer {
|
||||
match weak {
|
||||
ReadableLayerWeak::PersistentLayer(desc) => {
|
||||
ReadableLayer::PersistentLayer(self.get_from_desc(&desc))
|
||||
}
|
||||
ReadableLayerWeak::InMemoryLayer(desc) => {
|
||||
let inmem = self
|
||||
.layer_map()
|
||||
.expect("no concurrent shutdown")
|
||||
.in_memory_layer(&desc);
|
||||
ReadableLayer::InMemoryLayer(inmem)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
|
||||
// The assumption for the `expect()` is that all code maintains the following invariant:
|
||||
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
|
||||
@@ -470,6 +485,25 @@ impl OpenLayerManager {
|
||||
mapping.remove(layer);
|
||||
layer.delete_on_drop();
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn force_insert_in_memory_layer(&mut self, layer: Arc<InMemoryLayer>) {
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
|
||||
match layer.info() {
|
||||
InMemoryLayerInfo::Open { .. } => {
|
||||
assert!(self.layer_map.open_layer.is_none());
|
||||
self.layer_map.open_layer = Some(layer);
|
||||
}
|
||||
InMemoryLayerInfo::Frozen { lsn_start, .. } => {
|
||||
if let Some(last) = self.layer_map.frozen_layers.back() {
|
||||
assert!(last.get_lsn_range().end <= lsn_start);
|
||||
}
|
||||
|
||||
self.layer_map.frozen_layers.push_back(layer);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
|
||||
|
||||
@@ -961,7 +961,8 @@ mod tests {
|
||||
}
|
||||
|
||||
async fn round_trip_test_compressed(blobs: &[Vec<u8>], compression: bool) -> Result<(), Error> {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
let ctx =
|
||||
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
|
||||
let (_temp_dir, pathbuf, offsets) =
|
||||
write_maybe_compressed(blobs, compression, &ctx).await?;
|
||||
|
||||
|
||||
@@ -26,15 +26,14 @@ use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut};
|
||||
use owned_buffers_io::io_buf_ext::FullSlice;
|
||||
use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
||||
pub use pageserver_api::models::virtual_file as api;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
use tokio::time::Instant;
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};
|
||||
|
||||
use crate::assert_u64_eq_usize::UsizeIsU64;
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC, StorageIoOperation};
|
||||
use crate::metrics::{STORAGE_IO_TIME_METRIC, StorageIoOperation};
|
||||
use crate::page_cache::{PAGE_SZ, PageWriteGuard};
|
||||
use crate::tenant::TENANTS_SEGMENT_NAME;
|
||||
pub(crate) mod io_engine;
|
||||
pub use io_engine::{
|
||||
FeatureTestResult as IoEngineFeatureTestResult, feature_test as io_engine_feature_test,
|
||||
@@ -121,7 +120,7 @@ impl VirtualFile {
|
||||
pub async fn open_with_options<P: AsRef<Utf8Path>>(
|
||||
path: P,
|
||||
open_options: &OpenOptions,
|
||||
ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Self, std::io::Error> {
|
||||
let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?;
|
||||
Ok(VirtualFile {
|
||||
@@ -133,7 +132,7 @@ impl VirtualFile {
|
||||
pub async fn open_with_options_v2<P: AsRef<Utf8Path>>(
|
||||
path: P,
|
||||
open_options: &OpenOptions,
|
||||
ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Self, std::io::Error> {
|
||||
let file = match get_io_mode() {
|
||||
IoMode::Buffered => {
|
||||
@@ -300,13 +299,6 @@ pub struct VirtualFileInner {
|
||||
/// storing it here.
|
||||
pub path: Utf8PathBuf,
|
||||
open_options: OpenOptions,
|
||||
|
||||
// These are strings becase we only use them for metrics, and those expect strings.
|
||||
// It makes no sense for us to constantly turn the `TimelineId` and `TenantId` into
|
||||
// strings.
|
||||
tenant_id: String,
|
||||
shard_id: String,
|
||||
timeline_id: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Clone, Copy)]
|
||||
@@ -588,36 +580,16 @@ impl VirtualFileInner {
|
||||
pub async fn open_with_options<P: AsRef<Utf8Path>>(
|
||||
path: P,
|
||||
open_options: &OpenOptions,
|
||||
_ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<VirtualFileInner, std::io::Error> {
|
||||
let path_ref = path.as_ref();
|
||||
let path_str = path_ref.to_string();
|
||||
let parts = path_str.split('/').collect::<Vec<&str>>();
|
||||
let (tenant_id, shard_id, timeline_id) =
|
||||
if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME {
|
||||
let tenant_shard_part = parts[parts.len() - 4];
|
||||
let (tenant_id, shard_id) = match tenant_shard_part.parse::<TenantShardId>() {
|
||||
Ok(tenant_shard_id) => (
|
||||
tenant_shard_id.tenant_id.to_string(),
|
||||
format!("{}", tenant_shard_id.shard_slug()),
|
||||
),
|
||||
Err(_) => {
|
||||
// Malformed path: this ID is just for observability, so tolerate it
|
||||
// and pass through
|
||||
(tenant_shard_part.to_string(), "*".to_string())
|
||||
}
|
||||
};
|
||||
(tenant_id, shard_id, parts[parts.len() - 2].to_string())
|
||||
} else {
|
||||
("*".to_string(), "*".to_string(), "*".to_string())
|
||||
};
|
||||
let path = path.as_ref();
|
||||
let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;
|
||||
|
||||
// NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
|
||||
// where our caller doesn't get to use the returned VirtualFile before its
|
||||
// slot gets re-used by someone else.
|
||||
let file = observe_duration!(StorageIoOperation::Open, {
|
||||
open_options.open(path_ref.as_std_path()).await?
|
||||
open_options.open(path.as_std_path()).await?
|
||||
});
|
||||
|
||||
// Strip all options other than read and write.
|
||||
@@ -633,11 +605,8 @@ impl VirtualFileInner {
|
||||
let vfile = VirtualFileInner {
|
||||
handle: RwLock::new(handle),
|
||||
pos: 0,
|
||||
path: path_ref.to_path_buf(),
|
||||
path: path.to_owned(),
|
||||
open_options: reopen_options,
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
};
|
||||
|
||||
// TODO: Under pressure, it's likely the slot will get re-used and
|
||||
@@ -934,7 +903,7 @@ impl VirtualFileInner {
|
||||
&self,
|
||||
buf: tokio_epoll_uring::Slice<Buf>,
|
||||
offset: u64,
|
||||
_ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
|
||||
ctx: &RequestContext,
|
||||
) -> (tokio_epoll_uring::Slice<Buf>, Result<usize, Error>)
|
||||
where
|
||||
Buf: tokio_epoll_uring::IoBufMut + Send,
|
||||
@@ -952,14 +921,7 @@ impl VirtualFileInner {
|
||||
let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
|
||||
let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at");
|
||||
if let Ok(size) = res {
|
||||
STORAGE_IO_SIZE
|
||||
.with_label_values(&[
|
||||
"read",
|
||||
&self.tenant_id,
|
||||
&self.shard_id,
|
||||
&self.timeline_id,
|
||||
])
|
||||
.add(size as i64);
|
||||
ctx.io_size_metrics().read.add(size.into_u64());
|
||||
}
|
||||
(buf, res)
|
||||
})
|
||||
@@ -970,9 +932,9 @@ impl VirtualFileInner {
|
||||
&self,
|
||||
buf: FullSlice<B>,
|
||||
offset: u64,
|
||||
_ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
|
||||
ctx: &RequestContext,
|
||||
) -> (FullSlice<B>, Result<usize, Error>) {
|
||||
let (slice, result) = self.write_at_inner(buf, offset, _ctx).await;
|
||||
let (slice, result) = self.write_at_inner(buf, offset, ctx).await;
|
||||
let result = result.maybe_fatal_err("write_at");
|
||||
(slice, result)
|
||||
}
|
||||
@@ -981,7 +943,7 @@ impl VirtualFileInner {
|
||||
&self,
|
||||
buf: FullSlice<B>,
|
||||
offset: u64,
|
||||
_ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
|
||||
ctx: &RequestContext,
|
||||
) -> (FullSlice<B>, Result<usize, Error>) {
|
||||
let file_guard = match self.lock_file().await {
|
||||
Ok(file_guard) => file_guard,
|
||||
@@ -991,14 +953,7 @@ impl VirtualFileInner {
|
||||
let ((_file_guard, buf), result) =
|
||||
io_engine::get().write_at(file_guard, offset, buf).await;
|
||||
if let Ok(size) = result {
|
||||
STORAGE_IO_SIZE
|
||||
.with_label_values(&[
|
||||
"write",
|
||||
&self.tenant_id,
|
||||
&self.shard_id,
|
||||
&self.timeline_id,
|
||||
])
|
||||
.add(size as i64);
|
||||
ctx.io_size_metrics().write.add(size.into_u64());
|
||||
}
|
||||
(buf, result)
|
||||
})
|
||||
@@ -1584,7 +1539,8 @@ mod tests {
|
||||
where
|
||||
A: Adapter,
|
||||
{
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
let ctx =
|
||||
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
|
||||
let testdir = crate::config::PageServerConf::test_repo_dir(testname);
|
||||
std::fs::create_dir_all(&testdir)?;
|
||||
|
||||
@@ -1711,7 +1667,8 @@ mod tests {
|
||||
const THREADS: usize = 100;
|
||||
const SAMPLE: [u8; SIZE] = [0xADu8; SIZE];
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
let ctx =
|
||||
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
|
||||
let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency");
|
||||
std::fs::create_dir_all(&testdir)?;
|
||||
|
||||
@@ -1770,7 +1727,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_atomic_overwrite_basic() {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
let ctx =
|
||||
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
|
||||
let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic");
|
||||
std::fs::create_dir_all(&testdir).unwrap();
|
||||
|
||||
@@ -1798,7 +1756,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_atomic_overwrite_preexisting_tmp() {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
let ctx =
|
||||
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
|
||||
let testdir =
|
||||
crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp");
|
||||
std::fs::create_dir_all(&testdir).unwrap();
|
||||
|
||||
@@ -181,7 +181,8 @@ where
|
||||
Err(self
|
||||
.shutdown()
|
||||
.await
|
||||
.expect_err("flush task only disconnects duplex if it exits with an error"))
|
||||
.err()
|
||||
.expect("flush task only disconnects duplex if it exits with an error"))
|
||||
}
|
||||
|
||||
/// Cleans up the channel, join the flush task.
|
||||
|
||||
@@ -136,7 +136,9 @@ impl WalRedoProcess {
|
||||
Ok(0) => break Ok(()), // eof
|
||||
Ok(num_bytes) => {
|
||||
let output = String::from_utf8_lossy(&buf[..num_bytes]);
|
||||
error!(%output, "received output");
|
||||
if !output.contains("LOG:") {
|
||||
error!(%output, "received output");
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
break Err(e);
|
||||
|
||||
Reference in New Issue
Block a user