Merge WITH CONFLICTS 2025-03-11 main commit '7c462b3417ecd3ae3907f3480f3b8a8c99fc6d7b' into yuchen/dire

ct-io-delta-image-layer-write

Conflicts:
	pageserver/src/tenant/blob_io.rs
This commit is contained in:
Christian Schwarz
2025-04-09 19:39:12 +02:00
380 changed files with 13238 additions and 5308 deletions

View File

@@ -33,8 +33,9 @@ use utils::lsn::Lsn;
use crate::context::RequestContext;
use crate::pgdatadir_mapping::Version;
use crate::tenant::Timeline;
use crate::tenant::storage_layer::IoConcurrency;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::{PageReconstructError, Timeline};
#[derive(Debug, thiserror::Error)]
pub enum BasebackupError {
@@ -42,6 +43,26 @@ pub enum BasebackupError {
Server(#[from] anyhow::Error),
#[error("basebackup client error {0:#} when {1}")]
Client(#[source] io::Error, &'static str),
#[error("basebackup during shutdown")]
Shutdown,
}
impl From<PageReconstructError> for BasebackupError {
fn from(value: PageReconstructError) -> Self {
match value {
PageReconstructError::Cancelled => BasebackupError::Shutdown,
err => BasebackupError::Server(err.into()),
}
}
}
impl From<GetVectoredError> for BasebackupError {
fn from(value: GetVectoredError) -> Self {
match value {
GetVectoredError::Cancelled => BasebackupError::Shutdown,
err => BasebackupError::Server(err.into()),
}
}
}
/// Create basebackup with non-rel data in it.
@@ -127,7 +148,7 @@ where
timeline
.gate
.enter()
.map_err(|e| BasebackupError::Server(e.into()))?,
.map_err(|_| BasebackupError::Shutdown)?,
),
};
basebackup
@@ -323,8 +344,7 @@ where
let slru_partitions = self
.timeline
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
.await?
.partition(
self.timeline.get_shard_identity(),
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
@@ -336,11 +356,10 @@ where
let blocks = self
.timeline
.get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
for (key, block) in blocks {
let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
let block = block?;
slru_builder.add_block(&key, block).await?;
}
}
@@ -349,11 +368,8 @@ where
let mut min_restart_lsn: Lsn = Lsn::MAX;
// Create tablespace directories
for ((spcnode, dbnode), has_relmap_file) in self
.timeline
.list_dbdirs(self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
for ((spcnode, dbnode), has_relmap_file) in
self.timeline.list_dbdirs(self.lsn, self.ctx).await?
{
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
@@ -362,8 +378,7 @@ where
let rels = self
.timeline
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
for &rel in rels.iter() {
// Send init fork as main fork to provide well formed empty
// contents of UNLOGGED relations. Postgres copies it in
@@ -391,8 +406,7 @@ where
let aux_files = self
.timeline
.list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone())
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
let aux_scan_time = start_time.elapsed();
let aux_estimated_size = aux_files
.values()
@@ -451,16 +465,14 @@ where
for xid in self
.timeline
.list_twophase_files(self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
.await?
{
self.add_twophase_file(xid).await?;
}
let repl_origins = self
.timeline
.get_replorigins(self.lsn, self.ctx, self.io_concurrency.clone())
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
let n_origins = repl_origins.len();
if n_origins != 0 {
//
@@ -505,8 +517,7 @@ where
let nblocks = self
.timeline
.get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
// If the relation is empty, create an empty file
if nblocks == 0 {
@@ -532,8 +543,7 @@ where
// TODO: investigate using get_vectored for the entire startblk..endblk range.
// But this code path is not on the critical path for most basebackups (?).
.get(rel_block_to_key(src, blknum), self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
segment_data.extend_from_slice(&img[..]);
}
@@ -567,8 +577,7 @@ where
let img = self
.timeline
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
if img.len()
!= dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE)
@@ -622,8 +631,7 @@ where
&& self
.timeline
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?
.await?
.is_empty()
{
return Ok(());
@@ -674,8 +682,7 @@ where
let img = self
.timeline
.get_twophase_file(xid, self.lsn, self.ctx)
.await
.map_err(|e| BasebackupError::Server(e.into()))?;
.await?;
let mut buf = BytesMut::new();
buf.extend_from_slice(&img[..]);

View File

@@ -14,6 +14,7 @@ use camino::Utf8Path;
use clap::{Arg, ArgAction, Command};
use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric};
use metrics::set_build_info_metric;
use nix::sys::socket::{setsockopt, sockopt};
use pageserver::config::{PageServerConf, PageserverIdentity};
use pageserver::controller_upcall_client::ControllerUpcallClient;
use pageserver::deletion_queue::DeletionQueue;
@@ -24,11 +25,12 @@ use pageserver::task_mgr::{
};
use pageserver::tenant::{TenantSharedResources, mgr, secondary};
use pageserver::{
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, http, page_cache, page_service,
task_mgr, virtual_file,
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http,
page_cache, page_service, task_mgr, virtual_file,
};
use postgres_backend::AuthType;
use remote_storage::GenericRemoteStorage;
use rustls_pki_types::{CertificateDer, PrivateKeyDer};
use tokio::signal::unix::SignalKind;
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
@@ -342,11 +344,25 @@ fn start_pageserver(
info!("Starting pageserver http handler on {http_addr}");
let http_listener = tcp_listener::bind(http_addr)?;
let pg_addr = &conf.listen_pg_addr;
let https_listener = match conf.listen_https_addr.as_ref() {
Some(https_addr) => {
info!("Starting pageserver https handler on {https_addr}");
Some(tcp_listener::bind(https_addr)?)
}
None => None,
};
let pg_addr = &conf.listen_pg_addr;
info!("Starting pageserver pg protocol handler on {pg_addr}");
let pageserver_listener = tcp_listener::bind(pg_addr)?;
// Enable SO_KEEPALIVE on the socket, to detect dead connections faster.
// These are configured via net.ipv4.tcp_keepalive_* sysctls.
//
// TODO: also set this on the walreceiver socket, but tokio-postgres doesn't
// support enabling keepalives while using the default OS sysctls.
setsockopt(&pageserver_listener, sockopt::KeepAlive, &true)?;
// Launch broker client
// The storage_broker::connect call needs to happen inside a tokio runtime thread.
let broker_client = WALRECEIVER_RUNTIME
@@ -567,9 +583,8 @@ fn start_pageserver(
// Start up the service to handle HTTP mgmt API request. We created the
// listener earlier already.
let http_endpoint_listener = {
let (http_endpoint_listener, https_endpoint_listener) = {
let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); // for hyper
let cancel = CancellationToken::new();
let router_state = Arc::new(
http::routes::State::new(
@@ -584,22 +599,51 @@ fn start_pageserver(
)
.context("Failed to initialize router state")?,
);
let router = http::make_router(router_state, launch_ts, http_auth.clone())?
.build()
.map_err(|err| anyhow!(err))?;
let service = http_utils::RouterService::new(router).unwrap();
let server = hyper0::Server::from_tcp(http_listener)?
.serve(service)
.with_graceful_shutdown({
let cancel = cancel.clone();
async move { cancel.clone().cancelled().await }
});
let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"http endpoint listener",
server,
));
HttpEndpointListener(CancellableTask { task, cancel })
let service =
Arc::new(http_utils::RequestServiceBuilder::new(router).map_err(|err| anyhow!(err))?);
let http_task = {
let server =
http_utils::server::Server::new(Arc::clone(&service), http_listener, None)?;
let cancel = CancellationToken::new();
let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"http endpoint listener",
server.serve(cancel.clone()),
));
HttpEndpointListener(CancellableTask { task, cancel })
};
let https_task = match https_listener {
Some(https_listener) => {
let certs = load_certs(&conf.ssl_cert_file)?;
let key = load_private_key(&conf.ssl_key_file)?;
let server_config = rustls::ServerConfig::builder()
.with_no_client_auth()
.with_single_cert(certs, key)?;
let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config));
let server =
http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?;
let cancel = CancellationToken::new();
let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"https endpoint listener",
server.serve(cancel.clone()),
));
Some(HttpsEndpointListener(CancellableTask { task, cancel }))
}
None => None,
};
(http_task, https_task)
};
let consumption_metrics_tasks = {
@@ -675,6 +719,7 @@ fn start_pageserver(
shutdown_pageserver.cancel();
pageserver::shutdown_pageserver(
http_endpoint_listener,
https_endpoint_listener,
page_service,
consumption_metrics_tasks,
disk_usage_eviction_task,
@@ -689,6 +734,25 @@ fn start_pageserver(
})
}
fn load_certs(filename: &Utf8Path) -> std::io::Result<Vec<CertificateDer<'static>>> {
let file = std::fs::File::open(filename)?;
let mut reader = std::io::BufReader::new(file);
rustls_pemfile::certs(&mut reader).collect()
}
fn load_private_key(filename: &Utf8Path) -> anyhow::Result<PrivateKeyDer<'static>> {
let file = std::fs::File::open(filename)?;
let mut reader = std::io::BufReader::new(file);
let key = rustls_pemfile::private_key(&mut reader)?;
key.ok_or(anyhow::anyhow!(
"no private key found in {}",
filename.as_str(),
))
}
async fn create_remote_storage_client(
conf: &'static PageServerConf,
) -> anyhow::Result<GenericRemoteStorage> {

View File

@@ -53,6 +53,11 @@ pub struct PageServerConf {
pub listen_pg_addr: String,
/// Example (default): 127.0.0.1:9898
pub listen_http_addr: String,
/// Example: 127.0.0.1:9899
pub listen_https_addr: Option<String>,
pub ssl_key_file: Utf8PathBuf,
pub ssl_cert_file: Utf8PathBuf,
/// Current availability zone. Used for traffic metrics.
pub availability_zone: Option<String>,
@@ -194,6 +199,13 @@ pub struct PageServerConf {
/// Interpreted protocol feature: if enabled, validate that the logical WAL received from
/// safekeepers does not have gaps.
pub validate_wal_contiguity: bool,
/// When set, the previously written to disk heatmap is loaded on tenant attach and used
/// to avoid clobbering the heatmap from new, cold, attached locations.
pub load_previous_heatmap: bool,
/// When set, include visible layers in the next uploaded heatmaps of an unarchived timeline.
pub generate_unarchival_heatmap: bool,
}
/// Token for authentication to safekeepers
@@ -310,6 +322,9 @@ impl PageServerConf {
let pageserver_api::config::ConfigToml {
listen_pg_addr,
listen_http_addr,
listen_https_addr,
ssl_key_file,
ssl_cert_file,
availability_zone,
wait_lsn_timeout,
wal_redo_timeout,
@@ -358,6 +373,8 @@ impl PageServerConf {
get_vectored_concurrent_io,
enable_read_path_debugging,
validate_wal_contiguity,
load_previous_heatmap,
generate_unarchival_heatmap,
} = config_toml;
let mut conf = PageServerConf {
@@ -366,6 +383,9 @@ impl PageServerConf {
// ------------------------------------------------------------
listen_pg_addr,
listen_http_addr,
listen_https_addr,
ssl_key_file,
ssl_cert_file,
availability_zone,
wait_lsn_timeout,
wal_redo_timeout,
@@ -447,6 +467,8 @@ impl PageServerConf {
no_sync: no_sync.unwrap_or(false),
enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false),
validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false),
load_previous_heatmap: load_previous_heatmap.unwrap_or(true),
generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(true),
};
// ------------------------------------------------------------
@@ -480,7 +502,9 @@ impl PageServerConf {
#[cfg(test)]
pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());
Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}"))
let test_id = uuid::Uuid::new_v4();
Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}_{test_id}"))
}
pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
@@ -493,6 +517,8 @@ impl PageServerConf {
metric_collection_interval: Duration::from_secs(60),
synthetic_size_calculation_interval: Duration::from_secs(60),
background_task_maximum_delay: Duration::ZERO,
load_previous_heatmap: Some(true),
generate_unarchival_heatmap: Some(true),
..Default::default()
};
PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()

View File

@@ -89,16 +89,112 @@
//! [`RequestContext`] argument. Functions in the middle of the call chain
//! only need to pass it on.
use crate::task_mgr::TaskKind;
use std::sync::Arc;
use once_cell::sync::Lazy;
use tracing::warn;
use utils::{id::TimelineId, shard::TenantShardId};
use crate::{
metrics::{StorageIoSizeMetrics, TimelineMetrics},
task_mgr::TaskKind,
tenant::Timeline,
};
// The main structure of this module, see module-level comment.
#[derive(Debug)]
pub struct RequestContext {
task_kind: TaskKind,
download_behavior: DownloadBehavior,
access_stats_behavior: AccessStatsBehavior,
page_content_kind: PageContentKind,
read_path_debug: bool,
scope: Scope,
}
#[derive(Clone)]
pub(crate) enum Scope {
Global {
io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
},
SecondaryTenant {
io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
},
SecondaryTimeline {
io_size_metrics: crate::metrics::StorageIoSizeMetrics,
},
Timeline {
// We wrap the `Arc<TimelineMetrics>`s inside another Arc to avoid child
// context creation contending for the ref counters of the Arc<TimelineMetrics>,
// which are shared among all tasks that operate on the timeline, especially
// concurrent page_service connections.
#[allow(clippy::redundant_allocation)]
arc_arc: Arc<Arc<TimelineMetrics>>,
},
#[cfg(test)]
UnitTest {
io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
},
}
static GLOBAL_IO_SIZE_METRICS: Lazy<crate::metrics::StorageIoSizeMetrics> =
Lazy::new(|| crate::metrics::StorageIoSizeMetrics::new("*", "*", "*"));
impl Scope {
pub(crate) fn new_global() -> Self {
Scope::Global {
io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
}
}
/// NB: this allocates, so, use only at relatively long-lived roots, e.g., at start
/// of a compaction iteration.
pub(crate) fn new_timeline(timeline: &Timeline) -> Self {
Scope::Timeline {
arc_arc: Arc::new(Arc::clone(&timeline.metrics)),
}
}
pub(crate) fn new_page_service_pagestream(
timeline_handle: &crate::tenant::timeline::handle::Handle<
crate::page_service::TenantManagerTypes,
>,
) -> Self {
Scope::Timeline {
arc_arc: Arc::clone(&timeline_handle.metrics),
}
}
pub(crate) fn new_secondary_timeline(
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
) -> Self {
// TODO(https://github.com/neondatabase/neon/issues/11156): secondary timelines have no infrastructure for metrics lifecycle.
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_id = tenant_shard_id.shard_slug().to_string();
let timeline_id = timeline_id.to_string();
let io_size_metrics =
crate::metrics::StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id);
Scope::SecondaryTimeline { io_size_metrics }
}
pub(crate) fn new_secondary_tenant(_tenant_shard_id: &TenantShardId) -> Self {
// Before propagating metrics via RequestContext, the labels were inferred from file path.
// The only user of VirtualFile at tenant scope is the heatmap download & read.
// The inferred labels for the path of the heatmap file on local disk were that of the global metric (*,*,*).
// Thus, we do the same here, and extend that for anything secondary-tenant scoped.
//
// If we want to have (tenant_id, shard_id, '*') labels for secondary tenants in the future,
// we will need to think about the metric lifecycle, i.e., remove them during secondary tenant shutdown,
// like we do for attached timelines. (We don't have attached-tenant-scoped usage of VirtualFile
// at this point, so, we were able to completely side-step tenant-scoped stuff there).
Scope::SecondaryTenant {
io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
}
}
#[cfg(test)]
pub(crate) fn new_unit_test() -> Self {
Scope::UnitTest {
io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
}
}
}
/// The kind of access to the page cache.
@@ -157,6 +253,7 @@ impl RequestContextBuilder {
access_stats_behavior: AccessStatsBehavior::Update,
page_content_kind: PageContentKind::Unknown,
read_path_debug: false,
scope: Scope::new_global(),
},
}
}
@@ -171,10 +268,16 @@ impl RequestContextBuilder {
access_stats_behavior: original.access_stats_behavior,
page_content_kind: original.page_content_kind,
read_path_debug: original.read_path_debug,
scope: original.scope.clone(),
},
}
}
pub fn task_kind(mut self, k: TaskKind) -> Self {
self.inner.task_kind = k;
self
}
/// Configure the DownloadBehavior of the context: whether to
/// download missing layers, and/or warn on the download.
pub fn download_behavior(mut self, b: DownloadBehavior) -> Self {
@@ -199,6 +302,11 @@ impl RequestContextBuilder {
self
}
pub(crate) fn scope(mut self, s: Scope) -> Self {
self.inner.scope = s;
self
}
pub fn build(self) -> RequestContext {
self.inner
}
@@ -281,7 +389,50 @@ impl RequestContext {
}
fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
Self::new(task_kind, download_behavior)
RequestContextBuilder::extend(self)
.task_kind(task_kind)
.download_behavior(download_behavior)
.build()
}
pub fn with_scope_timeline(&self, timeline: &Arc<Timeline>) -> Self {
RequestContextBuilder::extend(self)
.scope(Scope::new_timeline(timeline))
.build()
}
pub(crate) fn with_scope_page_service_pagestream(
&self,
timeline_handle: &crate::tenant::timeline::handle::Handle<
crate::page_service::TenantManagerTypes,
>,
) -> Self {
RequestContextBuilder::extend(self)
.scope(Scope::new_page_service_pagestream(timeline_handle))
.build()
}
pub fn with_scope_secondary_timeline(
&self,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
) -> Self {
RequestContextBuilder::extend(self)
.scope(Scope::new_secondary_timeline(tenant_shard_id, timeline_id))
.build()
}
pub fn with_scope_secondary_tenant(&self, tenant_shard_id: &TenantShardId) -> Self {
RequestContextBuilder::extend(self)
.scope(Scope::new_secondary_tenant(tenant_shard_id))
.build()
}
#[cfg(test)]
pub fn with_scope_unit_test(&self) -> Self {
RequestContextBuilder::new(TaskKind::UnitTest)
.scope(Scope::new_unit_test())
.build()
}
pub fn task_kind(&self) -> TaskKind {
@@ -303,4 +454,38 @@ impl RequestContext {
pub(crate) fn read_path_debug(&self) -> bool {
self.read_path_debug
}
pub(crate) fn io_size_metrics(&self) -> &StorageIoSizeMetrics {
match &self.scope {
Scope::Global { io_size_metrics } => {
let is_unit_test = cfg!(test);
let is_regress_test_build = cfg!(feature = "testing");
if is_unit_test || is_regress_test_build {
panic!("all VirtualFile instances are timeline-scoped");
} else {
use once_cell::sync::Lazy;
use std::sync::Mutex;
use std::time::Duration;
use utils::rate_limit::RateLimit;
static LIMIT: Lazy<Mutex<RateLimit>> =
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1))));
let mut guard = LIMIT.lock().unwrap();
guard.call2(|rate_limit_stats| {
warn!(
%rate_limit_stats,
backtrace=%std::backtrace::Backtrace::force_capture(),
"all VirtualFile instances are timeline-scoped",
);
});
io_size_metrics
}
}
Scope::Timeline { arc_arc } => &arc_arc.storage_io_size,
Scope::SecondaryTimeline { io_size_metrics } => io_size_metrics,
Scope::SecondaryTenant { io_size_metrics } => io_size_metrics,
#[cfg(test)]
Scope::UnitTest { io_size_metrics } => io_size_metrics,
}
}
}

View File

@@ -84,6 +84,7 @@ impl ControllerUpcallClient {
})
}
#[tracing::instrument(skip_all)]
async fn retry_http_forever<R, T>(
&self,
url: &url::Url,
@@ -108,7 +109,7 @@ impl ControllerUpcallClient {
|_| false,
3,
u32::MAX,
"calling control plane generation validation API",
"storage controller upcall",
&self.cancel,
)
.await
@@ -125,11 +126,12 @@ impl ControllerUpcallClient {
impl ControlPlaneGenerationsApi for ControllerUpcallClient {
/// Block until we get a successful response, or error out if we are shut down
#[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context
async fn re_attach(
&self,
conf: &PageServerConf,
) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
let re_attach_path = self
let url = self
.base_url
.join("re-attach")
.expect("Failed to build re-attach path");
@@ -179,7 +181,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
listen_pg_port: m.postgres_port,
listen_http_addr: m.http_host,
listen_http_port: m.http_port,
listen_https_port: None, // TODO: Support https.
listen_https_port: m.https_port,
availability_zone_id: az_id.expect("Checked above"),
})
}
@@ -205,7 +207,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
register: register.clone(),
};
let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
let response: ReAttachResponse = self.retry_http_forever(&url, request).await?;
tracing::info!(
"Received re-attach response with {} tenants (node {}, register: {:?})",
response.tenants.len(),
@@ -223,11 +225,12 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
}
/// Block until we get a successful response, or error out if we are shut down
#[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context
async fn validate(
&self,
tenants: Vec<(TenantShardId, Generation)>,
) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
let re_attach_path = self
let url = self
.base_url
.join("validate")
.expect("Failed to build validate path");
@@ -257,8 +260,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
return Err(RetryForeverError::ShuttingDown);
}
let response: ValidateResponse =
self.retry_http_forever(&re_attach_path, request).await?;
let response: ValidateResponse = self.retry_http_forever(&url, request).await?;
for rt in response.tenants {
result.insert(rt.id, rt.valid);
}

View File

@@ -842,6 +842,12 @@ paths:
required: false
schema:
type: integer
- name: recurse
description: When set, will recurse with the downloads into ancestor timelines
in: query
required: false
schema:
type: boolean
post:
description: |
Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter

View File

@@ -37,7 +37,8 @@ use pageserver_api::models::{
TenantShardSplitResponse, TenantSorting, TenantState, TenantWaitLsnRequest,
TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateRequestMode,
TimelineCreateRequestModeImportPgdata, TimelineGcRequest, TimelineInfo,
TimelinesInfoAndOffloaded, TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse,
TimelinePatchIndexPartRequest, TimelinesInfoAndOffloaded, TopTenantShardItem,
TopTenantShardsRequest, TopTenantShardsResponse,
};
use pageserver_api::shard::{ShardCount, TenantShardId};
use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
@@ -54,6 +55,7 @@ use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use crate::config::PageServerConf;
use crate::context;
use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
use crate::deletion_queue::DeletionQueueClient;
use crate::pgdatadir_mapping::LsnForTimestamp;
@@ -63,6 +65,7 @@ use crate::tenant::mgr::{
GetActiveTenantError, GetTenantError, TenantManager, TenantMapError, TenantMapInsertError,
TenantSlot, TenantSlotError, TenantSlotUpsertError, TenantStateError, UpsertLocationError,
};
use crate::tenant::remote_timeline_client::index::GcCompactionState;
use crate::tenant::remote_timeline_client::{
download_index_part, list_remote_tenant_shards, list_remote_timelines,
};
@@ -481,6 +484,7 @@ async fn build_timeline_info_common(
state,
is_archived: Some(is_archived),
rel_size_migration: Some(timeline.get_rel_size_v2_status()),
walreceiver_status,
};
@@ -857,6 +861,75 @@ async fn timeline_archival_config_handler(
json_response(StatusCode::OK, ())
}
/// This API is used to patch the index part of a timeline. You must ensure such patches are safe to apply. Use this API as an emergency
/// measure only.
///
/// Some examples of safe patches:
/// - Increase the gc_cutoff and gc_compaction_cutoff to a larger value in case of a bug that didn't bump the cutoff and cause read errors.
/// - Force set the index part to use reldir v2 (migrating/migrated).
///
/// Some examples of unsafe patches:
/// - Force set the index part from v2 to v1 (legacy). This will cause the code path to ignore anything written to the new keyspace and cause
/// errors.
/// - Decrease the gc_cutoff without validating the data really exists. It will cause read errors in the background.
async fn timeline_patch_index_part_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let request_data: TimelinePatchIndexPartRequest = json_request(&mut request).await?;
check_permission(&request, None)?; // require global permission for this request
let state = get_state(&request);
async {
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
if let Some(rel_size_migration) = request_data.rel_size_migration {
timeline
.update_rel_size_v2_status(rel_size_migration)
.map_err(ApiError::InternalServerError)?;
}
if let Some(gc_compaction_last_completed_lsn) =
request_data.gc_compaction_last_completed_lsn
{
timeline
.update_gc_compaction_state(GcCompactionState {
last_completed_lsn: gc_compaction_last_completed_lsn,
})
.map_err(ApiError::InternalServerError)?;
}
if let Some(applied_gc_cutoff_lsn) = request_data.applied_gc_cutoff_lsn {
{
let guard = timeline.applied_gc_cutoff_lsn.lock_for_write();
guard.store_and_unlock(applied_gc_cutoff_lsn);
}
}
if request_data.force_index_update {
timeline
.remote_client
.force_schedule_index_upload()
.context("force schedule index upload")
.map_err(ApiError::InternalServerError)?;
}
Ok::<_, ApiError>(())
}
.instrument(info_span!("timeline_patch_index_part",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug(),
%timeline_id))
.await?;
json_response(StatusCode::OK, ())
}
async fn timeline_detail_handler(
request: Request<Body>,
_cancel: CancellationToken,
@@ -881,12 +954,13 @@ async fn timeline_detail_handler(
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let timeline = tenant.get_timeline(timeline_id, false)?;
let ctx = &ctx.with_scope_timeline(&timeline);
let timeline_info = build_timeline_info(
&timeline,
include_non_incremental_logical_size.unwrap_or(false),
force_await_initial_logical_size.unwrap_or(false),
&ctx,
ctx,
)
.await
.context("get local timeline info")
@@ -927,11 +1001,11 @@ async fn get_lsn_by_timestamp_handler(
let with_lease = parse_query_param(&request, "with_lease")?.unwrap_or(false);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
.with_scope_timeline(&timeline);
let result = timeline
.find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
.await?;
@@ -1000,10 +1074,11 @@ async fn get_timestamp_of_lsn_handler(
.with_context(|| format!("Invalid LSN: {lsn_str:?}"))
.map_err(ApiError::BadRequest)?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
.with_scope_timeline(&timeline);
let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
match result {
@@ -1358,7 +1433,8 @@ async fn timeline_layer_scan_disposable_keys(
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
.with_scope_timeline(&timeline);
let guard = timeline.layers.read().await;
let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else {
@@ -1368,7 +1444,7 @@ async fn timeline_layer_scan_disposable_keys(
};
let resident_layer = layer
.download_and_keep_resident()
.download_and_keep_resident(&ctx)
.await
.map_err(|err| match err {
tenant::storage_layer::layer::DownloadError::TimelineShutdown
@@ -1436,6 +1512,7 @@ async fn timeline_download_heatmap_layers_handler(
let desired_concurrency =
parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY);
let recurse = parse_query_param(&request, "recurse")?.unwrap_or(false);
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
@@ -1443,6 +1520,8 @@ async fn timeline_download_heatmap_layers_handler(
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
.with_scope_timeline(&timeline);
let max_concurrency = get_config(&request)
.remote_storage_config
@@ -1451,7 +1530,7 @@ async fn timeline_download_heatmap_layers_handler(
.unwrap_or(DEFAULT_MAX_CONCURRENCY);
let concurrency = std::cmp::min(max_concurrency, desired_concurrency);
timeline.start_heatmap_layers_download(concurrency).await?;
timeline.start_heatmap_layers_download(concurrency, recurse, &ctx)?;
json_response(StatusCode::ACCEPTED, ())
}
@@ -1490,8 +1569,10 @@ async fn layer_download_handler(
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
.with_scope_timeline(&timeline);
let downloaded = timeline
.download_layer(&layer_name)
.download_layer(&layer_name, &ctx)
.await
.map_err(|e| match e {
tenant::storage_layer::layer::DownloadError::TimelineShutdown
@@ -2225,8 +2306,8 @@ async fn timeline_compact_handler(
.unwrap_or(false);
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline);
if scheduled {
let tenant = state
.tenant_manager
@@ -2333,8 +2414,8 @@ async fn timeline_checkpoint_handler(
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline);
if wait_until_flushed {
timeline.freeze_and_flush().await
} else {
@@ -2389,7 +2470,9 @@ async fn timeline_download_remote_layers_handler_post(
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
match timeline.spawn_download_all_remote_layers(body).await {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
.with_scope_timeline(&timeline);
match timeline.spawn_download_all_remote_layers(body, &ctx).await {
Ok(st) => json_response(StatusCode::ACCEPTED, st),
Err(st) => json_response(StatusCode::CONFLICT, st),
}
@@ -2471,6 +2554,7 @@ async fn timeline_detach_ancestor_handler(
tracing::info!("all timeline upload queues are drained");
let timeline = tenant.get_timeline(timeline_id, true)?;
let ctx = &ctx.with_scope_timeline(&timeline);
let progress = timeline
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
@@ -2577,8 +2661,9 @@ async fn getpage_at_lsn_handler_inner(
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
// Enable read path debugging
let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true).build();
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true)
.scope(context::Scope::new_timeline(&timeline)).build();
// Use last_record_lsn if no lsn is provided
let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
@@ -2612,8 +2697,8 @@ async fn timeline_collect_keyspace(
let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download).with_scope_timeline(&timeline);
let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
let (dense_ks, sparse_ks) = timeline
.collect_keyspace(at_lsn, &ctx)
@@ -3250,7 +3335,7 @@ async fn put_tenant_timeline_import_basebackup(
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let timeline = tenant
let (timeline, timeline_ctx) = tenant
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
.map_err(ApiError::InternalServerError)
.await?;
@@ -3269,7 +3354,13 @@ async fn put_tenant_timeline_import_basebackup(
info!("importing basebackup");
timeline
.import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx)
.import_basebackup_from_tar(
tenant.clone(),
&mut body,
base_lsn,
broker_client,
&timeline_ctx,
)
.await
.map_err(ApiError::InternalServerError)?;
@@ -3309,6 +3400,7 @@ async fn put_tenant_timeline_import_wal(
let state = get_state(&request);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?;
let ctx = RequestContextBuilder::extend(&ctx).scope(context::Scope::new_timeline(&timeline)).build();
let mut body = StreamReader::new(request.into_body().map(|res| {
res.map_err(|error| {
@@ -3625,6 +3717,10 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
|r| api_handler(r, get_timestamp_of_lsn_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/patch_index_part",
|r| api_handler(r, timeline_patch_index_part_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
|r| api_handler(r, lsn_lease_handler),

View File

@@ -64,6 +64,7 @@ pub struct CancellableTask {
pub cancel: CancellationToken,
}
pub struct HttpEndpointListener(pub CancellableTask);
pub struct HttpsEndpointListener(pub CancellableTask);
pub struct ConsumptionMetricsTasks(pub CancellableTask);
pub struct DiskUsageEvictionTask(pub CancellableTask);
impl CancellableTask {
@@ -77,6 +78,7 @@ impl CancellableTask {
#[allow(clippy::too_many_arguments)]
pub async fn shutdown_pageserver(
http_listener: HttpEndpointListener,
https_listener: Option<HttpsEndpointListener>,
page_service: page_service::Listener,
consumption_metrics_worker: ConsumptionMetricsTasks,
disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
@@ -213,6 +215,15 @@ pub async fn shutdown_pageserver(
)
.await;
if let Some(https_listener) = https_listener {
timed(
https_listener.0.shutdown(),
"shutdown https",
Duration::from_secs(1),
)
.await;
}
// Shut down the HTTP endpoint last, so that you can still check the server's
// status while it's shutting down.
// FIXME: We should probably stop accepting commands like attach/detach earlier.

View File

@@ -143,6 +143,29 @@ pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static LAYERS_PER_READ_BATCH_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_layers_per_read_batch_global",
"Layers visited to serve a single read batch (read amplification), regardless of number of reads.",
vec![
1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0
],
)
.expect("failed to define a metric")
});
pub(crate) static LAYERS_PER_READ_AMORTIZED_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_layers_per_read_amortized_global",
"Layers visited to serve a single read (read amplification). Amortized across a batch: \
all visited layers are divided by number of reads.",
vec![
1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0
],
)
.expect("failed to define a metric")
});
pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
// We expect this to be low because of Postgres checkpoints. Let's see if that holds.
register_histogram!(
@@ -1204,11 +1227,24 @@ impl StorageIoTime {
pub(crate) static STORAGE_IO_TIME_METRIC: Lazy<StorageIoTime> = Lazy::new(StorageIoTime::new);
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
#[derive(Clone, Copy)]
#[repr(usize)]
enum StorageIoSizeOperation {
Read,
Write,
}
impl StorageIoSizeOperation {
const VARIANTS: &'static [&'static str] = &["read", "write"];
fn as_str(&self) -> &'static str {
Self::VARIANTS[*self as usize]
}
}
// Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
static STORAGE_IO_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_io_operations_bytes_total",
"Total amount of bytes read/written in IO operations",
&["operation", "tenant_id", "shard_id", "timeline_id"]
@@ -1216,6 +1252,34 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
#[derive(Clone, Debug)]
pub(crate) struct StorageIoSizeMetrics {
pub read: UIntGauge,
pub write: UIntGauge,
}
impl StorageIoSizeMetrics {
pub(crate) fn new(tenant_id: &str, shard_id: &str, timeline_id: &str) -> Self {
let read = STORAGE_IO_SIZE
.get_metric_with_label_values(&[
StorageIoSizeOperation::Read.as_str(),
tenant_id,
shard_id,
timeline_id,
])
.unwrap();
let write = STORAGE_IO_SIZE
.get_metric_with_label_values(&[
StorageIoSizeOperation::Write.as_str(),
tenant_id,
shard_id,
timeline_id,
])
.unwrap();
Self { read, write }
}
}
#[cfg(not(test))]
pub(crate) mod virtual_file_descriptor_cache {
use super::*;
@@ -2798,6 +2862,7 @@ pub(crate) struct TimelineMetrics {
/// Number of valid LSN leases.
pub valid_lsn_lease_count_gauge: UIntGauge,
pub wal_records_received: IntCounter,
pub storage_io_size: StorageIoSizeMetrics,
shutdown: std::sync::atomic::AtomicBool,
}
@@ -2933,6 +2998,8 @@ impl TimelineMetrics {
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let storage_io_size = StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id);
TimelineMetrics {
tenant_id,
shard_id,
@@ -2962,6 +3029,7 @@ impl TimelineMetrics {
evictions_with_low_residence_duration: std::sync::RwLock::new(
evictions_with_low_residence_duration,
),
storage_io_size,
valid_lsn_lease_count_gauge,
wal_records_received,
shutdown: std::sync::atomic::AtomicBool::default(),
@@ -3152,7 +3220,7 @@ impl TimelineMetrics {
]);
}
for op in STORAGE_IO_SIZE_OPERATIONS {
for op in StorageIoSizeOperation::VARIANTS {
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
}
@@ -4074,6 +4142,8 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
// histograms
[
&LAYERS_PER_READ_GLOBAL,
&LAYERS_PER_READ_BATCH_GLOBAL,
&LAYERS_PER_READ_AMORTIZED_GLOBAL,
&DELTAS_PER_READ_GLOBAL,
&WAIT_LSN_TIME,
&WAL_REDO_TIME,

View File

@@ -56,6 +56,7 @@ use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::{
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer,
TimelineMetrics,
};
use crate::pgdatadir_mapping::Version;
use crate::span::{
@@ -392,10 +393,6 @@ impl TimelineHandles {
.await
.map_err(|e| match e {
timeline::handle::GetError::TenantManager(e) => e,
timeline::handle::GetError::TimelineGateClosed => {
trace!("timeline gate closed");
GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
}
timeline::handle::GetError::PerTimelineStateShutDown => {
trace!("per-timeline state shut down");
GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
@@ -422,24 +419,36 @@ pub(crate) struct TenantManagerTypes;
impl timeline::handle::Types for TenantManagerTypes {
type TenantManagerError = GetActiveTimelineError;
type TenantManager = TenantManagerWrapper;
type Timeline = Arc<Timeline>;
type Timeline = TenantManagerCacheItem;
}
impl timeline::handle::ArcTimeline<TenantManagerTypes> for Arc<Timeline> {
fn gate(&self) -> &utils::sync::gate::Gate {
&self.gate
}
pub(crate) struct TenantManagerCacheItem {
pub(crate) timeline: Arc<Timeline>,
// allow() for cheap propagation through RequestContext inside a task
#[allow(clippy::redundant_allocation)]
pub(crate) metrics: Arc<Arc<TimelineMetrics>>,
#[allow(dead_code)] // we store it to keep the gate open
pub(crate) gate_guard: GateGuard,
}
impl std::ops::Deref for TenantManagerCacheItem {
type Target = Arc<Timeline>;
fn deref(&self) -> &Self::Target {
&self.timeline
}
}
impl timeline::handle::Timeline<TenantManagerTypes> for TenantManagerCacheItem {
fn shard_timeline_id(&self) -> timeline::handle::ShardTimelineId {
Timeline::shard_timeline_id(self)
Timeline::shard_timeline_id(&self.timeline)
}
fn per_timeline_state(&self) -> &timeline::handle::PerTimelineState<TenantManagerTypes> {
&self.handles
&self.timeline.handles
}
fn get_shard_identity(&self) -> &pageserver_api::shard::ShardIdentity {
Timeline::get_shard_identity(self)
Timeline::get_shard_identity(&self.timeline)
}
}
@@ -448,7 +457,7 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
&self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
) -> Result<TenantManagerCacheItem, GetActiveTimelineError> {
let tenant_id = self.tenant_id.get().expect("we set this in get()");
let timeout = ACTIVE_TENANT_TIMEOUT;
let wait_start = Instant::now();
@@ -491,7 +500,23 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
let timeline = tenant_shard
.get_timeline(timeline_id, true)
.map_err(GetActiveTimelineError::Timeline)?;
Ok(timeline)
let gate_guard = match timeline.gate.enter() {
Ok(guard) => guard,
Err(_) => {
return Err(GetActiveTimelineError::Timeline(
GetTimelineError::ShuttingDown,
));
}
};
let metrics = Arc::new(Arc::clone(&timeline.metrics));
Ok(TenantManagerCacheItem {
timeline,
metrics,
gate_guard,
})
}
}
@@ -1220,6 +1245,14 @@ impl PageServerHandler {
),
QueryError,
> {
macro_rules! upgrade_handle_and_set_context {
($shard:ident) => {{
let weak_handle = &$shard;
let handle = weak_handle.upgrade()?;
let ctx = ctx.with_scope_page_service_pagestream(&handle);
(handle, ctx)
}};
}
Ok(match batch {
BatchedFeMessage::Exists {
span,
@@ -1228,9 +1261,10 @@ impl PageServerHandler {
req,
} => {
fail::fail_point!("ps::handle-pagerequest-message::exists");
let (shard, ctx) = upgrade_handle_and_set_context!(shard);
(
vec![
self.handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx)
self.handle_get_rel_exists_request(&shard, &req, &ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, timer))
@@ -1246,9 +1280,10 @@ impl PageServerHandler {
req,
} => {
fail::fail_point!("ps::handle-pagerequest-message::nblocks");
let (shard, ctx) = upgrade_handle_and_set_context!(shard);
(
vec![
self.handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx)
self.handle_get_nblocks_request(&shard, &req, &ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, timer))
@@ -1264,17 +1299,18 @@ impl PageServerHandler {
pages,
} => {
fail::fail_point!("ps::handle-pagerequest-message::getpage");
let (shard, ctx) = upgrade_handle_and_set_context!(shard);
(
{
let npages = pages.len();
trace!(npages, "handling getpage request");
let res = self
.handle_get_page_at_lsn_request_batched(
&*shard.upgrade()?,
&shard,
effective_request_lsn,
pages,
io_concurrency,
ctx,
&ctx,
)
.instrument(span.clone())
.await;
@@ -1291,9 +1327,10 @@ impl PageServerHandler {
req,
} => {
fail::fail_point!("ps::handle-pagerequest-message::dbsize");
let (shard, ctx) = upgrade_handle_and_set_context!(shard);
(
vec![
self.handle_db_size_request(&*shard.upgrade()?, &req, ctx)
self.handle_db_size_request(&shard, &req, &ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, timer))
@@ -1309,9 +1346,10 @@ impl PageServerHandler {
req,
} => {
fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
let (shard, ctx) = upgrade_handle_and_set_context!(shard);
(
vec![
self.handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx)
self.handle_get_slru_segment_request(&shard, &req, &ctx)
.instrument(span.clone())
.await
.map(|msg| (msg, timer))
@@ -1327,12 +1365,13 @@ impl PageServerHandler {
requests,
} => {
fail::fail_point!("ps::handle-pagerequest-message::test");
let (shard, ctx) = upgrade_handle_and_set_context!(shard);
(
{
let npages = requests.len();
trace!(npages, "handling getpage request");
let res = self
.handle_test_request_batch(&*shard.upgrade()?, requests, ctx)
.handle_test_request_batch(&shard, requests, &ctx)
.instrument(span.clone())
.await;
assert_eq!(res.len(), npages);
@@ -2095,6 +2134,7 @@ impl PageServerHandler {
// TODO: passthrough the error site to the final error message?
BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)),
BasebackupError::Server(e) => QueryError::Other(e),
BasebackupError::Shutdown => QueryError::Shutdown,
}
}
@@ -2107,6 +2147,7 @@ impl PageServerHandler {
.get(tenant_id, timeline_id, ShardSelector::Zero)
.await?;
set_tracing_field_shard_id(&timeline);
let ctx = ctx.with_scope_timeline(&timeline);
if timeline.is_archived() == Some(true) {
tracing::info!(
@@ -2124,7 +2165,7 @@ impl PageServerHandler {
lsn,
crate::tenant::timeline::WaitLsnWaiter::PageService,
crate::tenant::timeline::WaitLsnTimeout::Default,
ctx,
&ctx,
)
.await?;
timeline
@@ -2150,7 +2191,7 @@ impl PageServerHandler {
prev_lsn,
full_backup,
replica,
ctx,
&ctx,
)
.await
.map_err(map_basebackup_error)?;
@@ -2173,7 +2214,7 @@ impl PageServerHandler {
prev_lsn,
full_backup,
replica,
ctx,
&ctx,
)
.await
.map_err(map_basebackup_error)?;
@@ -2190,7 +2231,7 @@ impl PageServerHandler {
prev_lsn,
full_backup,
replica,
ctx,
&ctx,
)
.await
.map_err(map_basebackup_error)?;

View File

@@ -21,6 +21,7 @@ use pageserver_api::key::{
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::models::RelSizeMigration;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
@@ -492,7 +493,9 @@ impl Timeline {
// Otherwise, read the old reldir keyspace.
// TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2.
if self.get_rel_size_v2_enabled() {
if let RelSizeMigration::Migrated | RelSizeMigration::Migrating =
self.get_rel_size_v2_status()
{
// fetch directory listing (new)
let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum);
let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?)
@@ -544,7 +547,7 @@ impl Timeline {
forknum: *forknum,
}));
if !self.get_rel_size_v2_enabled() {
if let RelSizeMigration::Legacy = self.get_rel_size_v2_status() {
return Ok(rels_v1);
}
@@ -599,28 +602,36 @@ impl Timeline {
let n_blocks = self
.get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
.await?;
let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
for blkno in 0..n_blocks {
let block = self
.get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx)
.await?;
segment.extend_from_slice(&block[..BLCKSZ as usize]);
}
Ok(segment.freeze())
}
/// Look up given SLRU page version.
pub(crate) async fn get_slru_page_at_lsn(
&self,
kind: SlruKind,
segno: u32,
blknum: BlockNumber,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
assert!(self.tenant_shard_id.is_shard_zero());
let key = slru_block_to_key(kind, segno, blknum);
self.get(key, lsn, ctx).await
let keyspace = KeySpace::single(
slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, n_blocks),
);
let batches = keyspace.partition(
self.get_shard_identity(),
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
);
let io_concurrency = IoConcurrency::spawn_from_conf(
self.conf,
self.gate
.enter()
.map_err(|_| PageReconstructError::Cancelled)?,
);
let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
for batch in batches.parts {
let blocks = self
.get_vectored(batch, lsn, io_concurrency.clone(), ctx)
.await?;
for (_key, block) in blocks {
let block = block?;
segment.extend_from_slice(&block[..BLCKSZ as usize]);
}
}
Ok(segment.freeze())
}
/// Get size of an SLRU segment
@@ -829,19 +840,41 @@ impl Timeline {
let nblocks = self
.get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
.await?;
for blknum in (0..nblocks).rev() {
let clog_page = self
.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx)
let keyspace = KeySpace::single(
slru_block_to_key(SlruKind::Clog, segno, 0)
..slru_block_to_key(SlruKind::Clog, segno, nblocks),
);
let batches = keyspace.partition(
self.get_shard_identity(),
Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
);
let io_concurrency = IoConcurrency::spawn_from_conf(
self.conf,
self.gate
.enter()
.map_err(|_| PageReconstructError::Cancelled)?,
);
for batch in batches.parts.into_iter().rev() {
let blocks = self
.get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx)
.await?;
if clog_page.len() == BLCKSZ as usize + 8 {
let mut timestamp_bytes = [0u8; 8];
timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
for (_key, clog_page) in blocks.into_iter().rev() {
let clog_page = clog_page?;
match f(timestamp) {
ControlFlow::Break(b) => return Ok(b),
ControlFlow::Continue(()) => (),
if clog_page.len() == BLCKSZ as usize + 8 {
let mut timestamp_bytes = [0u8; 8];
timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
match f(timestamp) {
ControlFlow::Break(b) => return Ok(b),
ControlFlow::Continue(()) => (),
}
}
}
}
@@ -1052,6 +1085,8 @@ impl Timeline {
) -> Result<u64, CalculateLogicalSizeError> {
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
fail::fail_point!("skip-logical-size-calculation", |_| { Ok(0) });
// Fetch list of database dirs and iterate them
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
let dbdir = DbDirectory::des(&buf)?;
@@ -1718,6 +1753,35 @@ impl DatadirModification<'_> {
Ok(())
}
/// Returns `true` if the rel_size_v2 write path is enabled. If it is the first time that
/// we enable it, we also need to persist it in `index_part.json`.
pub fn maybe_enable_rel_size_v2(&mut self) -> anyhow::Result<bool> {
let status = self.tline.get_rel_size_v2_status();
let config = self.tline.get_rel_size_v2_enabled();
match (config, status) {
(false, RelSizeMigration::Legacy) => {
// tenant config didn't enable it and we didn't write any reldir_v2 key yet
Ok(false)
}
(false, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
// index_part already persisted that the timeline has enabled rel_size_v2
Ok(true)
}
(true, RelSizeMigration::Legacy) => {
// The first time we enable it, we need to persist it in `index_part.json`
self.tline
.update_rel_size_v2_status(RelSizeMigration::Migrating)?;
tracing::info!("enabled rel_size_v2");
Ok(true)
}
(true, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
// index_part already persisted that the timeline has enabled rel_size_v2
// and we don't need to do anything
Ok(true)
}
}
}
/// Store a relmapper file (pg_filenode.map) in the repository
pub async fn put_relmap_file(
&mut self,
@@ -1726,6 +1790,8 @@ impl DatadirModification<'_> {
img: Bytes,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let v2_enabled = self.maybe_enable_rel_size_v2()?;
// Add it to the directory (if it doesn't exist already)
let buf = self.get(DBDIR_KEY, ctx).await?;
let mut dbdir = DbDirectory::des(&buf)?;
@@ -1746,7 +1812,7 @@ impl DatadirModification<'_> {
})?;
self.pending_directory_entries
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
if self.tline.get_rel_size_v2_enabled() {
if v2_enabled {
self.pending_directory_entries
.push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
}
@@ -1898,12 +1964,12 @@ impl DatadirModification<'_> {
.context("deserialize db")?
};
// Add the new relation to the rel directory entry, and write it back
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
return Err(RelationError::AlreadyExists);
}
let v2_enabled = self.maybe_enable_rel_size_v2()?;
if self.tline.get_rel_size_v2_enabled() {
if v2_enabled {
if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
return Err(RelationError::AlreadyExists);
}
let sparse_rel_dir_key =
rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
// check if the rel_dir_key exists in v2
@@ -1938,6 +2004,10 @@ impl DatadirModification<'_> {
self.pending_directory_entries
.push((DirectoryKind::RelV2, MetricsUpdate::Add(1)));
} else {
// Add the new relation to the rel directory entry, and write it back
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
return Err(RelationError::AlreadyExists);
}
if !dbdir_exists {
self.pending_directory_entries
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)))
@@ -1951,6 +2021,7 @@ impl DatadirModification<'_> {
)),
);
}
// Put size
let size_key = rel_size_to_key(rel);
let buf = nblocks.to_le_bytes();
@@ -2029,6 +2100,7 @@ impl DatadirModification<'_> {
drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let v2_enabled = self.maybe_enable_rel_size_v2()?;
for ((spc_node, db_node), rel_tags) in drop_relations {
let dir_key = rel_dir_to_key(spc_node, db_node);
let buf = self.get(dir_key, ctx).await?;
@@ -2041,7 +2113,7 @@ impl DatadirModification<'_> {
.push((DirectoryKind::Rel, MetricsUpdate::Sub(1)));
dirty = true;
true
} else if self.tline.get_rel_size_v2_enabled() {
} else if v2_enabled {
// The rel is not found in the old reldir key, so we need to check the new sparse keyspace.
// Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion
// logic).
@@ -2072,7 +2144,7 @@ impl DatadirModification<'_> {
// Remove entry from relation size cache
self.tline.remove_cached_rel_size(&rel_tag);
// Delete size entry, as well as all blocks
// Delete size entry, as well as all blocks; this is currently a no-op because we haven't implemented tombstones in storage.
self.delete(rel_key_range(rel_tag));
}
}
@@ -2686,7 +2758,7 @@ mod tests {
TimelineId::from_array(hex!("11223344556677881122334455667788"));
let (tenant, ctx) = harness.load().await;
let tline = tenant
let (tline, ctx) = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = tline.raw_timeline().unwrap();

View File

@@ -31,8 +31,8 @@ use futures::StreamExt;
use futures::stream::FuturesUnordered;
use itertools::Itertools as _;
use once_cell::sync::Lazy;
use pageserver_api::models;
pub use pageserver_api::models::TenantState;
use pageserver_api::models::{self, RelSizeMigration};
use pageserver_api::models::{
CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem,
WalRedoManagerStatus,
@@ -77,6 +77,8 @@ use self::timeline::{
EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError,
};
use crate::config::PageServerConf;
use crate::context;
use crate::context::RequestContextBuilder;
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
use crate::l0_flush::L0FlushGlobalState;
@@ -1114,7 +1116,7 @@ impl Tenant {
}
};
let timeline = self.create_timeline_struct(
let (timeline, timeline_ctx) = self.create_timeline_struct(
timeline_id,
&metadata,
previous_heatmap,
@@ -1123,6 +1125,8 @@ impl Tenant {
CreateTimelineCause::Load,
idempotency.clone(),
index_part.gc_compaction.clone(),
index_part.rel_size_migration.clone(),
ctx,
)?;
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
anyhow::ensure!(
@@ -1149,16 +1153,19 @@ impl Tenant {
// a previous heatmap which contains all visible layers in the layer map.
// This previous heatmap will be used whenever a fresh heatmap is generated
// for the timeline.
if matches!(cause, LoadTimelineCause::Unoffload) {
if self.conf.generate_unarchival_heatmap && matches!(cause, LoadTimelineCause::Unoffload) {
let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn()));
while let Some((tline, end_lsn)) = tline_ending_at {
let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await;
if !tline.is_previous_heatmap_active() {
// Another unearchived timeline might have generated a heatmap for this ancestor.
// If the current branch point greater than the previous one use the the heatmap
// we just generated - it should include more layers.
if !tline.should_keep_previous_heatmap(end_lsn) {
tline
.previous_heatmap
.store(Some(Arc::new(unarchival_heatmap)));
} else {
tracing::info!("Previous heatmap still active. Dropping unarchival heatmap.")
tracing::info!("Previous heatmap preferred. Dropping unarchival heatmap.")
}
match tline.ancestor_timeline() {
@@ -1253,7 +1260,7 @@ impl Tenant {
match activate {
ActivateTimelineArgs::Yes { broker_client } => {
info!("activating timeline after reload from pgdata import task");
timeline.activate(self.clone(), broker_client, None, ctx);
timeline.activate(self.clone(), broker_client, None, &timeline_ctx);
}
ActivateTimelineArgs::No => (),
}
@@ -1578,6 +1585,10 @@ impl Tenant {
}
async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> {
if !self.conf.load_previous_heatmap {
return None;
}
let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id);
match tokio::fs::read_to_string(on_disk_heatmap_path).await {
Ok(heatmap) => match serde_json::from_str::<HeatMapTenant>(&heatmap) {
@@ -1757,6 +1768,7 @@ impl Tenant {
import_pgdata,
ActivateTimelineArgs::No,
guard,
ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
));
}
}
@@ -1774,6 +1786,7 @@ impl Tenant {
timeline_id,
&index_part.metadata,
remote_timeline_client,
ctx,
)
.instrument(tracing::info_span!("timeline_delete", %timeline_id))
.await
@@ -1939,6 +1952,7 @@ impl Tenant {
hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active {
heatmap: h,
read_at: hs.1,
end_lsn: None,
})
});
part_downloads.spawn(
@@ -2210,7 +2224,7 @@ impl Tenant {
self.clone(),
broker_client.clone(),
background_jobs_can_start,
&ctx,
&ctx.with_scope_timeline(&timeline),
);
}
@@ -2407,8 +2421,8 @@ impl Tenant {
new_timeline_id: TimelineId,
initdb_lsn: Lsn,
pg_version: u32,
_ctx: &RequestContext,
) -> anyhow::Result<UninitializedTimeline> {
ctx: &RequestContext,
) -> anyhow::Result<(UninitializedTimeline, RequestContext)> {
anyhow::ensure!(
self.is_active(),
"Cannot create empty timelines on inactive tenant"
@@ -2442,6 +2456,8 @@ impl Tenant {
create_guard,
initdb_lsn,
None,
None,
ctx,
)
.await
}
@@ -2459,7 +2475,7 @@ impl Tenant {
pg_version: u32,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
let uninit_tl = self
let (uninit_tl, ctx) = self
.create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
.await?;
let tline = uninit_tl.raw_timeline().expect("we just created it");
@@ -2471,7 +2487,7 @@ impl Tenant {
.init_empty_test_timeline()
.context("init_empty_test_timeline")?;
modification
.commit(ctx)
.commit(&ctx)
.await
.context("commit init_empty_test_timeline modification")?;
@@ -2497,6 +2513,7 @@ impl Tenant {
initdb_lsn: Lsn,
pg_version: u32,
ctx: &RequestContext,
in_memory_layer_desc: Vec<timeline::InMemoryLayerTestDesc>,
delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
end_lsn: Lsn,
@@ -2518,6 +2535,11 @@ impl Tenant {
.force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
.await?;
}
for in_memory in in_memory_layer_desc {
tline
.force_create_in_memory_layer(in_memory, Some(initdb_lsn), ctx)
.await?;
}
let layer_names = tline
.layers
.read()
@@ -2683,7 +2705,12 @@ impl Tenant {
// doing stuff before the IndexPart is durable in S3, which is done by the previous section.
let activated_timeline = match result {
CreateTimelineResult::Created(timeline) => {
timeline.activate(self.clone(), broker_client, None, ctx);
timeline.activate(
self.clone(),
broker_client,
None,
&ctx.with_scope_timeline(&timeline),
);
timeline
}
CreateTimelineResult::Idempotent(timeline) => {
@@ -2745,10 +2772,9 @@ impl Tenant {
}
};
let mut uninit_timeline = {
let (mut uninit_timeline, timeline_ctx) = {
let this = &self;
let initdb_lsn = Lsn(0);
let _ctx = ctx;
async move {
let new_metadata = TimelineMetadata::new(
// Initialize disk_consistent LSN to 0, The caller must import some data to
@@ -2767,6 +2793,8 @@ impl Tenant {
timeline_create_guard,
initdb_lsn,
None,
None,
ctx,
)
.await
}
@@ -2796,6 +2824,7 @@ impl Tenant {
index_part,
activate,
timeline_create_guard,
timeline_ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
));
// NB: the timeline doesn't exist in self.timelines at this point
@@ -2809,6 +2838,7 @@ impl Tenant {
index_part: import_pgdata::index_part_format::Root,
activate: ActivateTimelineArgs,
timeline_create_guard: TimelineCreateGuard,
ctx: RequestContext,
) {
debug_assert_current_span_has_tenant_and_timeline_id();
info!("starting");
@@ -2820,6 +2850,7 @@ impl Tenant {
index_part,
activate,
timeline_create_guard,
ctx,
)
.await;
if let Err(err) = &res {
@@ -2835,9 +2866,8 @@ impl Tenant {
index_part: import_pgdata::index_part_format::Root,
activate: ActivateTimelineArgs,
timeline_create_guard: TimelineCreateGuard,
ctx: RequestContext,
) -> Result<(), anyhow::Error> {
let ctx = RequestContext::new(TaskKind::ImportPgdata, DownloadBehavior::Warn);
info!("importing pgdata");
import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone())
.await
@@ -3046,6 +3076,7 @@ impl Tenant {
let mut has_pending_l0 = false;
for timeline in compact_l0 {
let ctx = &ctx.with_scope_timeline(&timeline);
let outcome = timeline
.compact(cancel, CompactFlags::OnlyL0Compaction.into(), ctx)
.instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
@@ -3079,6 +3110,7 @@ impl Tenant {
if !timeline.is_active() {
continue;
}
let ctx = &ctx.with_scope_timeline(&timeline);
let mut outcome = timeline
.compact(cancel, EnumSet::default(), ctx)
@@ -3141,11 +3173,13 @@ impl Tenant {
/// Trips the compaction circuit breaker if appropriate.
pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) {
match err {
err if err.is_cancel() => {}
CompactionError::ShuttingDown => (),
// Offload failures don't trip the circuit breaker, since they're cheap to retry and
// shouldn't block compaction.
CompactionError::Offload(_) => {}
CompactionError::CollectKeySpaceError(err) => {
// CollectKeySpaceError::Cancelled and PageRead::Cancelled are handled in `err.is_cancel` branch.
self.compaction_circuit_breaker
.lock()
.unwrap()
@@ -3302,7 +3336,7 @@ impl Tenant {
self.clone(),
broker_client.clone(),
background_jobs_can_start,
ctx,
&ctx.with_scope_timeline(timeline),
);
activated_timelines += 1;
}
@@ -4116,7 +4150,9 @@ impl Tenant {
cause: CreateTimelineCause,
create_idempotency: CreateTimelineIdempotency,
gc_compaction_state: Option<GcCompactionState>,
) -> anyhow::Result<Arc<Timeline>> {
rel_size_v2_status: Option<RelSizeMigration>,
ctx: &RequestContext,
) -> anyhow::Result<(Arc<Timeline>, RequestContext)> {
let state = match cause {
CreateTimelineCause::Load => {
let ancestor_id = new_metadata.ancestor_timeline();
@@ -4148,10 +4184,15 @@ impl Tenant {
self.attach_wal_lag_cooldown.clone(),
create_idempotency,
gc_compaction_state,
rel_size_v2_status,
self.cancel.child_token(),
);
Ok(timeline)
let timeline_ctx = RequestContextBuilder::extend(ctx)
.scope(context::Scope::new_timeline(&timeline))
.build();
Ok((timeline, timeline_ctx))
}
/// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object
@@ -4567,6 +4608,7 @@ impl Tenant {
// Ensures all timelines use the same start time when computing the time cutoff.
let now_ts_for_pitr_calc = SystemTime::now();
for timeline in timelines.iter() {
let ctx = &ctx.with_scope_timeline(timeline);
let cutoff = timeline
.get_last_record_lsn()
.checked_sub(horizon)
@@ -4740,7 +4782,7 @@ impl Tenant {
src_timeline: &Arc<Timeline>,
dst_id: TimelineId,
start_lsn: Option<Lsn>,
_ctx: &RequestContext,
ctx: &RequestContext,
) -> Result<CreateTimelineResult, CreateTimelineError> {
let src_id = src_timeline.timeline_id;
@@ -4843,13 +4885,15 @@ impl Tenant {
src_timeline.pg_version,
);
let uninitialized_timeline = self
let (uninitialized_timeline, _timeline_ctx) = self
.prepare_new_timeline(
dst_id,
&metadata,
timeline_create_guard,
start_lsn + 1,
Some(Arc::clone(src_timeline)),
Some(src_timeline.get_rel_size_v2_status()),
ctx,
)
.await?;
@@ -5116,13 +5160,15 @@ impl Tenant {
pgdata_lsn,
pg_version,
);
let mut raw_timeline = self
let (mut raw_timeline, timeline_ctx) = self
.prepare_new_timeline(
timeline_id,
&new_metadata,
timeline_create_guard,
pgdata_lsn,
None,
None,
ctx,
)
.await?;
@@ -5133,7 +5179,7 @@ impl Tenant {
&unfinished_timeline,
&pgdata_path,
pgdata_lsn,
ctx,
&timeline_ctx,
)
.await
.with_context(|| {
@@ -5194,6 +5240,7 @@ impl Tenant {
/// An empty layer map is initialized, and new data and WAL can be imported starting
/// at 'disk_consistent_lsn'. After any initial data has been imported, call
/// `finish_creation` to insert the Timeline into the timelines map.
#[allow(clippy::too_many_arguments)]
async fn prepare_new_timeline<'a>(
&'a self,
new_timeline_id: TimelineId,
@@ -5201,15 +5248,17 @@ impl Tenant {
create_guard: TimelineCreateGuard,
start_lsn: Lsn,
ancestor: Option<Arc<Timeline>>,
) -> anyhow::Result<UninitializedTimeline<'a>> {
rel_size_v2_status: Option<RelSizeMigration>,
ctx: &RequestContext,
) -> anyhow::Result<(UninitializedTimeline<'a>, RequestContext)> {
let tenant_shard_id = self.tenant_shard_id;
let resources = self.build_timeline_resources(new_timeline_id);
resources
.remote_client
.init_upload_queue_for_empty_remote(new_metadata)?;
.init_upload_queue_for_empty_remote(new_metadata, rel_size_v2_status.clone())?;
let timeline_struct = self
let (timeline_struct, timeline_ctx) = self
.create_timeline_struct(
new_timeline_id,
new_metadata,
@@ -5219,6 +5268,8 @@ impl Tenant {
CreateTimelineCause::Load,
create_guard.idempotency.clone(),
None,
rel_size_v2_status,
ctx,
)
.context("Failed to create timeline data structure")?;
@@ -5239,10 +5290,13 @@ impl Tenant {
"Successfully created initial files for timeline {tenant_shard_id}/{new_timeline_id}"
);
Ok(UninitializedTimeline::new(
self,
new_timeline_id,
Some((timeline_struct, create_guard)),
Ok((
UninitializedTimeline::new(
self,
new_timeline_id,
Some((timeline_struct, create_guard)),
),
timeline_ctx,
))
}
@@ -5777,7 +5831,8 @@ pub(crate) mod harness {
}
pub(crate) async fn load(&self) -> (Arc<Tenant>, RequestContext) {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
.with_scope_unit_test();
(
self.do_try_load(&ctx)
.await
@@ -5907,6 +5962,8 @@ mod tests {
#[cfg(feature = "testing")]
use timeline::GcInfo;
#[cfg(feature = "testing")]
use timeline::InMemoryLayerTestDesc;
#[cfg(feature = "testing")]
use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
use timeline::{CompactOptions, DeltaLayerTestDesc};
use utils::id::TenantId;
@@ -6798,7 +6855,7 @@ mod tests {
let (tenant, ctx) = harness.load().await;
let io_concurrency = IoConcurrency::spawn_for_test();
let tline = tenant
let (tline, ctx) = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = tline.raw_timeline().unwrap();
@@ -7420,7 +7477,7 @@ mod tests {
.await;
let initdb_lsn = Lsn(0x20);
let utline = tenant
let (utline, ctx) = tenant
.create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = utline.raw_timeline().unwrap();
@@ -7487,7 +7544,7 @@ mod tests {
let harness = TenantHarness::create(name).await?;
{
let (tenant, ctx) = harness.load().await;
let tline = tenant
let (tline, _ctx) = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
.await?;
// Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
@@ -7919,6 +7976,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
Vec::new(), // delta layers
vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers
Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
@@ -8006,6 +8064,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
Vec::new(), // delta layers
vec![(
Lsn(0x20),
@@ -8221,6 +8280,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
// delta layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
@@ -8301,6 +8361,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
// delta layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
@@ -8374,6 +8435,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
// delta layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
@@ -8506,6 +8568,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
@@ -8699,6 +8762,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
vec![DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x10)..Lsn(0x40),
delta1,
@@ -8755,6 +8819,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
Vec::new(),
image_layers,
end_lsn,
@@ -8961,6 +9026,7 @@ mod tests {
Lsn(0x08),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x08)..Lsn(0x10),
@@ -8979,7 +9045,7 @@ mod tests {
delta3,
),
], // delta layers
vec![], // image layers
vec![], // image layers
Lsn(0x50),
)
.await?
@@ -8990,6 +9056,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x10)..Lsn(0x48),
@@ -9540,6 +9607,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
@@ -9787,6 +9855,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // in-memory layers
vec![
// delta1 and delta 2 only contain a single key but multiple updates
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
@@ -10022,6 +10091,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![], // in-memory layers
vec![], // delta layers
vec![(Lsn(0x18), img_layer)], // image layers
Lsn(0x18),
@@ -10268,6 +10338,7 @@ mod tests {
baseline_image_layer_lsn,
DEFAULT_PG_VERSION,
&ctx,
vec![], // in-memory layers
vec![DeltaLayerTestDesc::new_with_inferred_key_range(
delta_layer_start_lsn..delta_layer_end_lsn,
delta_layer_spec,
@@ -10299,6 +10370,158 @@ mod tests {
Ok(())
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_vectored_read_with_image_layer_inside_inmem() -> anyhow::Result<()> {
let harness =
TenantHarness::create("test_vectored_read_with_image_layer_inside_inmem").await?;
let (tenant, ctx) = harness.load().await;
let will_init_keys = [2, 6];
fn get_key(id: u32) -> Key {
let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
let mut expected_key_values = HashMap::new();
let baseline_image_layer_lsn = Lsn(0x10);
let mut baseline_img_layer = Vec::new();
for i in 0..5 {
let key = get_key(i);
let value = format!("value {i}@{baseline_image_layer_lsn}");
let removed = expected_key_values.insert(key, value.clone());
assert!(removed.is_none());
baseline_img_layer.push((key, Bytes::from(value)));
}
let nested_image_layer_lsn = Lsn(0x50);
let mut nested_img_layer = Vec::new();
for i in 5..10 {
let key = get_key(i);
let value = format!("value {i}@{nested_image_layer_lsn}");
let removed = expected_key_values.insert(key, value.clone());
assert!(removed.is_none());
nested_img_layer.push((key, Bytes::from(value)));
}
let frozen_layer = {
let lsn_range = Lsn(0x40)..Lsn(0x60);
let mut data = Vec::new();
for i in 0..10 {
let key = get_key(i);
let key_in_nested = nested_img_layer
.iter()
.any(|(key_with_img, _)| *key_with_img == key);
let lsn = {
if key_in_nested {
Lsn(nested_image_layer_lsn.0 + 5)
} else {
lsn_range.start
}
};
let will_init = will_init_keys.contains(&i);
if will_init {
data.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init(""))));
expected_key_values.insert(key, "".to_string());
} else {
let delta = format!("@{lsn}");
data.push((
key,
lsn,
Value::WalRecord(NeonWalRecord::wal_append(&delta)),
));
expected_key_values
.get_mut(&key)
.expect("An image exists for each key")
.push_str(delta.as_str());
}
}
InMemoryLayerTestDesc {
lsn_range,
is_open: false,
data,
}
};
let (open_layer, last_record_lsn) = {
let start_lsn = Lsn(0x70);
let mut data = Vec::new();
let mut end_lsn = Lsn(0);
for i in 0..10 {
let key = get_key(i);
let lsn = Lsn(start_lsn.0 + i as u64);
let delta = format!("@{lsn}");
data.push((
key,
lsn,
Value::WalRecord(NeonWalRecord::wal_append(&delta)),
));
expected_key_values
.get_mut(&key)
.expect("An image exists for each key")
.push_str(delta.as_str());
end_lsn = std::cmp::max(end_lsn, lsn);
}
(
InMemoryLayerTestDesc {
lsn_range: start_lsn..Lsn::MAX,
is_open: true,
data,
},
end_lsn,
)
};
assert!(
nested_image_layer_lsn > frozen_layer.lsn_range.start
&& nested_image_layer_lsn < frozen_layer.lsn_range.end
);
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
baseline_image_layer_lsn,
DEFAULT_PG_VERSION,
&ctx,
vec![open_layer, frozen_layer], // in-memory layers
Vec::new(), // delta layers
vec![
(baseline_image_layer_lsn, baseline_img_layer),
(nested_image_layer_lsn, nested_img_layer),
], // image layers
last_record_lsn,
)
.await?;
let keyspace = KeySpace::single(get_key(0)..get_key(10));
let results = tline
.get_vectored(keyspace, last_record_lsn, IoConcurrency::sequential(), &ctx)
.await
.expect("No vectored errors");
for (key, res) in results {
let value = res.expect("No key errors");
let expected_value = expected_key_values.remove(&key).expect("No unknown keys");
assert_eq!(value, Bytes::from(expected_value.clone()));
tracing::info!("key={key} value={expected_value}");
}
Ok(())
}
fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering {
(
k1.is_delta,
@@ -10414,6 +10637,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![], // in-memory layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
@@ -10798,6 +11022,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![], // in-memory layers
vec![
// delta1/2/4 only contain a single key but multiple updates
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
@@ -11049,6 +11274,7 @@ mod tests {
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![], // in-memory layers
vec![
// delta1/2/4 only contain a single key but multiple updates
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),

View File

@@ -382,7 +382,8 @@ pub(crate) mod tests {
}
async fn round_trip_test_compressed(blobs: &[Vec<u8>], compression: bool) -> Result<(), Error> {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let ctx =
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
let (_temp_dir, pathbuf, offsets) =
write_maybe_compressed(blobs, compression, &ctx).await?;

View File

@@ -32,8 +32,7 @@ use hex;
use thiserror::Error;
use tracing::error;
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::TaskKind;
use crate::context::RequestContext;
use crate::tenant::block_io::{BlockReader, BlockWriter};
use crate::virtual_file::{IoBuffer, IoBufferMut, owned_buffers_io::write::Buffer};
@@ -478,16 +477,15 @@ where
}
#[allow(dead_code)]
pub async fn dump(&self) -> Result<()> {
pub async fn dump(&self, ctx: &RequestContext) -> Result<()> {
let mut stack = Vec::new();
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
stack.push((self.root_blk, String::new(), 0, 0, 0));
let block_cursor = self.reader.block_cursor();
while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
let blk = block_cursor.read_blk(self.start_blk + blknum, &ctx).await?;
let blk = block_cursor.read_blk(self.start_blk + blknum, ctx).await?;
let buf: &[u8] = blk.as_ref();
let node = OnDiskNode::<L>::deparse(buf)?;
@@ -836,6 +834,8 @@ pub(crate) mod tests {
use rand::Rng;
use super::*;
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
#[derive(Clone, Default)]
@@ -870,7 +870,8 @@ pub(crate) mod tests {
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let ctx =
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
let all_keys: Vec<&[u8; 6]> = vec![
b"xaaaaa", b"xaaaba", b"xaaaca", b"xabaaa", b"xababa", b"xabaca", b"xabada", b"xabadb",
@@ -888,7 +889,7 @@ pub(crate) mod tests {
let reader = DiskBtreeReader::new(0, root_offset, disk);
reader.dump().await?;
reader.dump(&ctx).await?;
// Test the `get` function on all the keys.
for (key, val) in all_data.iter() {
@@ -980,7 +981,8 @@ pub(crate) mod tests {
async fn lots_of_keys() -> Result<()> {
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let ctx =
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
const NUM_KEYS: u64 = 1000;
@@ -998,7 +1000,7 @@ pub(crate) mod tests {
let reader = DiskBtreeReader::new(0, root_offset, disk);
reader.dump().await?;
reader.dump(&ctx).await?;
use std::sync::Mutex;
@@ -1168,7 +1170,8 @@ pub(crate) mod tests {
// Build a tree from it
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let ctx =
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
for (key, val) in disk_btree_test_data::TEST_DATA {
writer.append(&key, val)?;
@@ -1199,7 +1202,7 @@ pub(crate) mod tests {
.await?;
assert_eq!(count, disk_btree_test_data::TEST_DATA.len());
reader.dump().await?;
reader.dump(&ctx).await?;
Ok(())
}

View File

@@ -352,7 +352,8 @@ mod tests {
let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id))?;
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let ctx =
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
Ok((conf, tenant_shard_id, timeline_id, ctx))
}

File diff suppressed because it is too large Load Diff

View File

@@ -63,6 +63,8 @@ pub struct HistoricLayerCoverage<Value> {
/// The latest state
head: LayerCoverageTuple<Value>,
/// TODO: this could be an ordered vec using binary search.
/// We push into this map everytime we add a layer, so might see some benefit
/// All previous states
historic: BTreeMap<u64, LayerCoverageTuple<Value>>,
}
@@ -419,6 +421,10 @@ pub struct BufferedHistoricLayerCoverage<Value> {
buffer: BTreeMap<LayerKey, Option<Value>>,
/// All current layers. This is not used for search. Only to make rebuilds easier.
// TODO: This map is never cleared. Rebuilds could use the post-trim last entry of
// [`Self::historic_coverage`] instead of doubling memory usage.
// [`Self::len`]: can require rebuild and serve from latest historic
// [`Self::iter`]: already requires rebuild => can serve from latest historic
layers: BTreeMap<LayerKey, Value>,
}

View File

@@ -194,7 +194,7 @@ pub(crate) use download::{
};
use index::GcCompactionState;
pub(crate) use index::LayerFileMetadata;
use pageserver_api::models::TimelineArchivalState;
use pageserver_api::models::{RelSizeMigration, TimelineArchivalState};
use pageserver_api::shard::{ShardIndex, TenantShardId};
use regex::Regex;
use remote_storage::{
@@ -437,9 +437,13 @@ impl RemoteTimelineClient {
/// Initialize the upload queue for the case where the remote storage is empty,
/// i.e., it doesn't have an `IndexPart`.
///
/// `rel_size_v2_status` needs to be carried over during branching, and that's why
/// it's passed in here.
pub fn init_upload_queue_for_empty_remote(
&self,
local_metadata: &TimelineMetadata,
rel_size_v2_status: Option<RelSizeMigration>,
) -> anyhow::Result<()> {
// Set the maximum number of inprogress tasks to the remote storage concurrency. There's
// certainly no point in starting more upload tasks than this.
@@ -449,7 +453,9 @@ impl RemoteTimelineClient {
.as_ref()
.map_or(0, |r| r.concurrency_limit());
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
let initialized_queue =
upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
initialized_queue.dirty.rel_size_migration = rel_size_v2_status;
self.update_remote_physical_size_gauge(None);
info!("initialized upload queue as empty");
Ok(())
@@ -900,7 +906,7 @@ impl RemoteTimelineClient {
Ok(())
}
/// Launch an index-file upload operation in the background, setting `import_pgdata` field.
/// Launch an index-file upload operation in the background, setting `gc_compaction_state` field.
pub(crate) fn schedule_index_upload_for_gc_compaction_state_update(
self: &Arc<Self>,
gc_compaction_state: GcCompactionState,
@@ -912,6 +918,21 @@ impl RemoteTimelineClient {
Ok(())
}
/// Launch an index-file upload operation in the background, setting `rel_size_v2_status` field.
pub(crate) fn schedule_index_upload_for_rel_size_v2_status_update(
self: &Arc<Self>,
rel_size_v2_status: RelSizeMigration,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
upload_queue.dirty.rel_size_migration = Some(rel_size_v2_status);
// TODO: allow this operation to bypass the validation check because we might upload the index part
// with no layers but the flag updated. For now, we just modify the index part in memory and the next
// upload will include the flag.
// self.schedule_index_upload(upload_queue);
Ok(())
}
///
/// Launch an index-file upload operation in the background, if necessary.
///
@@ -933,6 +954,14 @@ impl RemoteTimelineClient {
Ok(())
}
/// Only used in the `patch_index_part` HTTP API to force trigger an index upload.
pub fn force_schedule_index_upload(self: &Arc<Self>) -> Result<(), NotInitialized> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
self.schedule_index_upload(upload_queue);
Ok(())
}
/// Launch an index-file upload operation in the background (internal function)
fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn();

View File

@@ -7,6 +7,7 @@ use std::collections::HashMap;
use chrono::NaiveDateTime;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::RelSizeMigration;
use pageserver_api::shard::ShardIndex;
use serde::{Deserialize, Serialize};
use utils::id::TimelineId;
@@ -117,21 +118,6 @@ pub struct GcCompactionState {
pub(crate) last_completed_lsn: Lsn,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum RelSizeMigration {
/// The tenant is using the old rel_size format.
/// Note that this enum is persisted as `Option<RelSizeMigration>` in the index part, so
/// `None` is the same as `Some(RelSizeMigration::Legacy)`.
Legacy,
/// The tenant is migrating to the new rel_size format. Both old and new rel_size format are
/// persisted in the index part. The read path will read both formats and merge them.
Migrating,
/// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted
/// in the index part, and the read path will not read the old format.
Migrated,
}
impl IndexPart {
/// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
/// used to understand later versions.

View File

@@ -491,7 +491,10 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
let remote_storage = self.remote_storage.clone();
let conf = self.tenant_manager.get_conf();
let tenant_shard_id = *secondary_state.get_tenant_shard_id();
let download_ctx = self.root_ctx.attached_child();
let download_ctx = self
.root_ctx
.attached_child()
.with_scope_secondary_tenant(&tenant_shard_id);
(RunningDownload { barrier }, Box::pin(async move {
let _completion = completion;
@@ -771,6 +774,7 @@ impl<'a> TenantDownloader<'a> {
// Download the layers in the heatmap
for timeline in heatmap.timelines {
let ctx = &ctx.with_scope_secondary_timeline(tenant_shard_id, &timeline.timeline_id);
let timeline_state = timeline_states
.remove(&timeline.timeline_id)
.expect("Just populated above");
@@ -869,8 +873,7 @@ impl<'a> TenantDownloader<'a> {
let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap();
let layers_in_heatmap = heatmap_timeline
.layers
.iter()
.hot_layers()
.map(|l| (&l.name, l.metadata.generation))
.collect::<HashSet<_>>();
let layers_on_disk = timeline_state
@@ -1015,7 +1018,8 @@ impl<'a> TenantDownloader<'a> {
// Accumulate updates to the state
let mut touched = Vec::new();
for layer in timeline.layers {
let timeline_id = timeline.timeline_id;
for layer in timeline.into_hot_layers() {
if self.secondary_state.cancel.is_cancelled() {
tracing::debug!("Cancelled -- dropping out of layer loop");
return (Err(UpdateError::Cancelled), touched);
@@ -1040,7 +1044,7 @@ impl<'a> TenantDownloader<'a> {
}
match self
.download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
.download_layer(tenant_shard_id, &timeline_id, layer, ctx)
.await
{
Ok(Some(layer)) => touched.push(layer),
@@ -1148,7 +1152,7 @@ impl<'a> TenantDownloader<'a> {
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
let timeline_id = timeline.timeline_id;
tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.hot_layers().count());
let (result, touched) = self
.download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx)
@@ -1316,11 +1320,11 @@ async fn init_timeline_state(
// As we iterate through layers found on disk, we will look up their metadata from this map.
// Layers not present in metadata will be discarded.
let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
heatmap.layers.iter().map(|l| (&l.name, l)).collect();
heatmap.hot_layers().map(|l| (&l.name, l)).collect();
let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
if let Some(last_heatmap) = last_heatmap {
last_heatmap.layers.iter().map(|l| (&l.name, l)).collect()
last_heatmap.hot_layers().map(|l| (&l.name, l)).collect()
} else {
HashMap::new()
};

View File

@@ -42,7 +42,7 @@ pub(crate) struct HeatMapTimeline {
#[serde_as(as = "DisplayFromStr")]
pub(crate) timeline_id: TimelineId,
pub(crate) layers: Vec<HeatMapLayer>,
layers: Vec<HeatMapLayer>,
}
#[serde_as]
@@ -53,8 +53,10 @@ pub(crate) struct HeatMapLayer {
#[serde_as(as = "TimestampSeconds<i64>")]
pub(crate) access_time: SystemTime,
// TODO: an actual 'heat' score that would let secondary locations prioritize downloading
// the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
#[serde(default)]
pub(crate) cold: bool, // TODO: an actual 'heat' score that would let secondary locations prioritize downloading
// the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
}
impl HeatMapLayer {
@@ -62,11 +64,13 @@ impl HeatMapLayer {
name: LayerName,
metadata: LayerFileMetadata,
access_time: SystemTime,
cold: bool,
) -> Self {
Self {
name,
metadata,
access_time,
cold,
}
}
}
@@ -78,6 +82,18 @@ impl HeatMapTimeline {
layers,
}
}
pub(crate) fn into_hot_layers(self) -> impl Iterator<Item = HeatMapLayer> {
self.layers.into_iter().filter(|l| !l.cold)
}
pub(crate) fn hot_layers(&self) -> impl Iterator<Item = &HeatMapLayer> {
self.layers.iter().filter(|l| !l.cold)
}
pub(crate) fn all_layers(&self) -> impl Iterator<Item = &HeatMapLayer> {
self.layers.iter()
}
}
pub(crate) struct HeatMapStats {
@@ -92,7 +108,7 @@ impl HeatMapTenant {
layers: 0,
};
for timeline in &self.timelines {
for layer in &timeline.layers {
for layer in timeline.hot_layers() {
stats.layers += 1;
stats.bytes += layer.metadata.file_size;
}

View File

@@ -474,7 +474,7 @@ async fn fill_logical_sizes(
if cached_size.is_none() {
let timeline = Arc::clone(timeline_hash.get(&timeline_id).unwrap());
let parallel_size_calcs = Arc::clone(limit);
let ctx = ctx.attached_child();
let ctx = ctx.attached_child().with_scope_timeline(&timeline);
joinset.spawn(
calculate_logical_size(parallel_size_calcs, timeline, lsn, cause, ctx)
.in_current_span(),

View File

@@ -40,6 +40,7 @@ use utils::sync::gate::GateGuard;
use self::inmemory_layer::InMemoryLayerFileId;
use super::PageReconstructError;
use super::layer_map::InMemoryLayerDesc;
use super::timeline::{GetVectoredError, ReadPath};
use crate::config::PageServerConf;
use crate::context::{AccessStatsBehavior, RequestContext};
@@ -721,6 +722,12 @@ struct LayerToVisitId {
lsn_floor: Lsn,
}
#[derive(Debug, PartialEq, Eq, Hash)]
pub enum ReadableLayerWeak {
PersistentLayer(Arc<PersistentLayerDesc>),
InMemoryLayer(InMemoryLayerDesc),
}
/// Layer wrapper for the read path. Note that it is valid
/// to use these layers even after external operations have
/// been performed on them (compaction, freeze, etc.).
@@ -873,7 +880,7 @@ impl ReadableLayer {
}
ReadableLayer::InMemoryLayer(layer) => {
layer
.get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
.await
}
}

View File

@@ -1385,7 +1385,7 @@ impl DeltaLayerInner {
block_reader,
);
tree_reader.dump().await?;
tree_reader.dump(ctx).await?;
let keys = self.index_entries(ctx).await?;
@@ -2024,6 +2024,7 @@ pub(crate) mod test {
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx)
.await
.unwrap();
let ctx = &ctx.with_scope_timeline(&timeline);
let initdb_layer = timeline
.layers
@@ -2136,7 +2137,7 @@ pub(crate) mod test {
.await
.unwrap();
let new_layer = new_layer.download_and_keep_resident().await.unwrap();
let new_layer = new_layer.download_and_keep_resident(ctx).await.unwrap();
new_layer
.copy_delta_prefix(&mut writer, truncate_at, ctx)

View File

@@ -208,7 +208,7 @@ impl ImageLayerInner {
block_reader,
);
tree_reader.dump().await?;
tree_reader.dump(ctx).await?;
tree_reader
.visit(

View File

@@ -416,7 +416,7 @@ impl InMemoryLayer {
pub(crate) async fn get_values_reconstruct_data(
self: &Arc<InMemoryLayer>,
keyspace: KeySpace,
end_lsn: Lsn,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
@@ -433,8 +433,6 @@ impl InMemoryLayer {
let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();
let lsn_range = self.start_lsn..end_lsn;
for range in keyspace.ranges.iter() {
for (key, vec_map) in inner
.index

View File

@@ -324,16 +324,16 @@ impl Layer {
reconstruct_data: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<(), GetVectoredError> {
let downloaded = self
.0
.get_or_maybe_download(true, Some(ctx))
.await
.map_err(|err| match err {
DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => {
GetVectoredError::Cancelled
}
other => GetVectoredError::Other(anyhow::anyhow!(other)),
})?;
let downloaded =
self.0
.get_or_maybe_download(true, ctx)
.await
.map_err(|err| match err {
DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => {
GetVectoredError::Cancelled
}
other => GetVectoredError::Other(anyhow::anyhow!(other)),
})?;
let this = ResidentLayer {
downloaded: downloaded.clone(),
owner: self.clone(),
@@ -356,8 +356,8 @@ impl Layer {
/// Download the layer if evicted.
///
/// Will not error when the layer is already downloaded.
pub(crate) async fn download(&self) -> Result<(), DownloadError> {
self.0.get_or_maybe_download(true, None).await?;
pub(crate) async fn download(&self, ctx: &RequestContext) -> Result<(), DownloadError> {
self.0.get_or_maybe_download(true, ctx).await?;
Ok(())
}
@@ -392,8 +392,11 @@ impl Layer {
}
/// Downloads if necessary and creates a guard, which will keep this layer from being evicted.
pub(crate) async fn download_and_keep_resident(&self) -> Result<ResidentLayer, DownloadError> {
let downloaded = self.0.get_or_maybe_download(true, None).await?;
pub(crate) async fn download_and_keep_resident(
&self,
ctx: &RequestContext,
) -> Result<ResidentLayer, DownloadError> {
let downloaded = self.0.get_or_maybe_download(true, ctx).await?;
Ok(ResidentLayer {
downloaded,
@@ -446,7 +449,7 @@ impl Layer {
if verbose {
// for now, unconditionally download everything, even if that might not be wanted.
let l = self.0.get_or_maybe_download(true, Some(ctx)).await?;
let l = self.0.get_or_maybe_download(true, ctx).await?;
l.dump(&self.0, ctx).await?
}
@@ -945,7 +948,7 @@ impl LayerInner {
async fn get_or_maybe_download(
self: &Arc<Self>,
allow_download: bool,
ctx: Option<&RequestContext>,
ctx: &RequestContext,
) -> Result<Arc<DownloadedLayer>, DownloadError> {
let (weak, permit) = {
// get_or_init_detached can:
@@ -1035,21 +1038,14 @@ impl LayerInner {
return Err(DownloadError::NotFile(ft));
}
if let Some(ctx) = ctx {
self.check_expected_download(ctx)?;
}
self.check_expected_download(ctx)?;
if !allow_download {
// this is only used from tests, but it is hard to test without the boolean
return Err(DownloadError::DownloadRequired);
}
let download_ctx = ctx
.map(|ctx| ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download))
.unwrap_or(RequestContext::new(
TaskKind::LayerDownload,
DownloadBehavior::Download,
));
let download_ctx = ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download);
async move {
tracing::info!(%reason, "downloading on-demand");
@@ -1567,10 +1563,10 @@ impl LayerInner {
self.access_stats.record_residence_event();
self.status.as_ref().unwrap().send_replace(Status::Evicted);
*self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());
self.status.as_ref().unwrap().send_replace(Status::Evicted);
Ok(())
}

View File

@@ -8,7 +8,6 @@ use utils::id::TimelineId;
use super::failpoints::{Failpoint, FailpointKind};
use super::*;
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use crate::tenant::harness::{TenantHarness, test_img};
use crate::tenant::storage_layer::{IoConcurrency, LayerVisibilityHint};
@@ -27,11 +26,9 @@ async fn smoke_test() {
let h = TenantHarness::create("smoke_test").await.unwrap();
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
let (tenant, _) = h.load().await;
let (tenant, ctx) = h.load().await;
let io_concurrency = IoConcurrency::spawn_for_test();
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
let image_layers = vec![(
Lsn(0x40),
vec![(
@@ -49,12 +46,14 @@ async fn smoke_test() {
Lsn(0x10),
14,
&ctx,
Default::default(), // in-memory layers
Default::default(),
image_layers,
Lsn(0x100),
)
.await
.unwrap();
let ctx = &ctx.with_scope_timeline(&timeline);
// Grab one of the timeline's layers to exercise in the test, and the other layer that is just
// there to avoid the timeline being illegally empty
@@ -93,7 +92,7 @@ async fn smoke_test() {
controlfile_keyspace.clone(),
Lsn(0x10)..Lsn(0x11),
&mut data,
&ctx,
ctx,
)
.await
.unwrap();
@@ -128,7 +127,7 @@ async fn smoke_test() {
controlfile_keyspace.clone(),
Lsn(0x10)..Lsn(0x11),
&mut data,
&ctx,
ctx,
)
.instrument(download_span.clone())
.await
@@ -178,7 +177,7 @@ async fn smoke_test() {
// plain downloading is rarely needed
layer
.download_and_keep_resident()
.download_and_keep_resident(ctx)
.instrument(download_span)
.await
.unwrap();
@@ -340,6 +339,7 @@ fn read_wins_pending_eviction() {
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
let ctx = ctx.with_scope_timeline(&timeline);
let layer = {
let mut layers = {
@@ -379,7 +379,7 @@ fn read_wins_pending_eviction() {
// because no actual eviction happened, we get to just reinitialize the DownloadedLayer
layer
.0
.get_or_maybe_download(false, None)
.get_or_maybe_download(false, &ctx)
.instrument(download_span)
.await
.expect("should had reinitialized without downloading");
@@ -472,6 +472,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
let ctx = ctx.with_scope_timeline(&timeline);
let layer = {
let mut layers = {
@@ -514,7 +515,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
// because no actual eviction happened, we get to just reinitialize the DownloadedLayer
layer
.0
.get_or_maybe_download(false, None)
.get_or_maybe_download(false, &ctx)
.instrument(download_span)
.await
.expect("should had reinitialized without downloading");
@@ -641,7 +642,12 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
let ctx = ctx.with_scope_timeline(&timeline);
// This test does downloads
let ctx = RequestContextBuilder::extend(&ctx)
.download_behavior(DownloadBehavior::Download)
.build();
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
@@ -674,7 +680,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
// simulate a cancelled read which is cancelled before it gets to re-initialize
let e = layer
.0
.get_or_maybe_download(false, None)
.get_or_maybe_download(false, &ctx)
.await
.unwrap_err();
assert!(
@@ -698,7 +704,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
// failpoint is still enabled, but it is not hit
let e = layer
.0
.get_or_maybe_download(false, None)
.get_or_maybe_download(false, &ctx)
.await
.unwrap_err();
assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}");
@@ -721,6 +727,12 @@ async fn evict_and_wait_does_not_wait_for_download() {
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
let ctx = ctx.with_scope_timeline(&timeline);
// This test does downloads
let ctx = RequestContextBuilder::extend(&ctx)
.download_behavior(DownloadBehavior::Download)
.build();
let layer = {
let mut layers = {
@@ -768,7 +780,7 @@ async fn evict_and_wait_does_not_wait_for_download() {
let mut download = std::pin::pin!(
layer
.0
.get_or_maybe_download(true, None)
.get_or_maybe_download(true, &ctx)
.instrument(download_span)
);

View File

@@ -289,15 +289,14 @@ fn log_compaction_error(
) {
use CompactionError::*;
use crate::pgdatadir_mapping::CollectKeySpaceError;
use crate::tenant::PageReconstructError;
use crate::tenant::upload_queue::NotInitialized;
let level = match err {
e if e.is_cancel() => return,
ShuttingDown => return,
Offload(_) => Level::ERROR,
AlreadyRunning(_) => Level::ERROR,
CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO,
CollectKeySpaceError(_) => Level::ERROR,
_ if task_cancelled => Level::INFO,
Other(err) => {
@@ -474,21 +473,15 @@ async fn wait_for_active_tenant(
}
let mut update_rx = tenant.subscribe_for_state_updates();
loop {
tokio::select! {
_ = cancel.cancelled() => return ControlFlow::Break(()),
result = update_rx.changed() => if result.is_err() {
tokio::select! {
result = update_rx.wait_for(|s| s == &TenantState::Active) => {
if result.is_err() {
return ControlFlow::Break(());
}
}
match &*update_rx.borrow() {
TenantState::Active => {
debug!("Tenant state changed to active, continuing the task loop");
return ControlFlow::Continue(());
}
state => debug!("Not running the task loop, tenant is not active: {state:?}"),
}
debug!("Tenant state changed to active, continuing the task loop");
ControlFlow::Continue(())
},
_ = cancel.cancelled() => ControlFlow::Break(()),
}
}

View File

@@ -46,7 +46,7 @@ use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPart
use pageserver_api::models::{
CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings,
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, TimelineState,
InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, RelSizeMigration, TimelineState,
};
use pageserver_api::reltag::{BlockNumber, RelTag};
use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId};
@@ -99,7 +99,8 @@ use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate,
use crate::keyspace::{KeyPartitioning, KeySpace};
use crate::l0_flush::{self, L0FlushGlobalState};
use crate::metrics::{
DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics,
DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_AMORTIZED_GLOBAL, LAYERS_PER_READ_BATCH_GLOBAL,
LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics,
};
use crate::page_service::TenantManagerTypes;
use crate::pgdatadir_mapping::{
@@ -286,7 +287,7 @@ pub struct Timeline {
// The LSN of gc-compaction that was last applied to this timeline.
gc_compaction_state: ArcSwap<Option<GcCompactionState>>,
pub(super) metrics: TimelineMetrics,
pub(crate) metrics: Arc<TimelineMetrics>,
// `Timeline` doesn't write these metrics itself, but it manages the lifetime. Code
// in `crate::page_service` writes these metrics.
@@ -436,12 +437,16 @@ pub struct Timeline {
/// May host a background Tokio task which downloads all the layers from the current
/// heatmap on demand.
heatmap_layers_downloader: Mutex<Option<heatmap_layers_downloader::HeatmapLayersDownloader>>,
pub(crate) rel_size_v2_status: ArcSwapOption<RelSizeMigration>,
}
pub(crate) enum PreviousHeatmap {
Active {
heatmap: HeatMapTimeline,
read_at: std::time::Instant,
// End LSN covered by the heatmap if known
end_lsn: Option<Lsn>,
},
Obsolete,
}
@@ -1326,10 +1331,6 @@ impl Timeline {
// (this is a requirement, not a bug). Skip updating the metric in these cases
// to avoid infinite results.
if !results.is_empty() {
// Record the total number of layers visited towards each key in the batch. While some
// layers may not intersect with a given read, and the cost of layer visits are
// amortized across the batch, each visited layer contributes directly to the observed
// latency for every read in the batch, which is what we care about.
if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
static LOG_PACER: Lazy<Mutex<RateLimit>> =
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
@@ -1344,9 +1345,23 @@ impl Timeline {
});
}
// Records the number of layers visited in a few different ways:
//
// * LAYERS_PER_READ: all layers count towards every read in the batch, because each
// layer directly affects its observed latency.
//
// * LAYERS_PER_READ_BATCH: all layers count towards each batch, to get the per-batch
// layer visits and access cost.
//
// * LAYERS_PER_READ_AMORTIZED: the average layer count per read, to get the amortized
// read amplification after batching.
let layers_visited = layers_visited as f64;
let avg_layers_visited = layers_visited / results.len() as f64;
LAYERS_PER_READ_BATCH_GLOBAL.observe(layers_visited);
for _ in &results {
self.metrics.layers_per_read.observe(layers_visited as f64);
LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64);
self.metrics.layers_per_read.observe(layers_visited);
LAYERS_PER_READ_GLOBAL.observe(layers_visited);
LAYERS_PER_READ_AMORTIZED_GLOBAL.observe(avg_layers_visited);
}
}
@@ -1864,16 +1879,25 @@ impl Timeline {
};
// Signal compaction failure to avoid L0 flush stalls when it's broken.
match result {
match &result {
Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed),
Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => {
Err(e) if e.is_cancel() => {}
Err(CompactionError::ShuttingDown) => {
// Covered by the `Err(e) if e.is_cancel()` branch.
}
Err(CompactionError::AlreadyRunning(_)) => {
// Covered by the `Err(e) if e.is_cancel()` branch.
}
Err(CompactionError::Other(_)) => {
self.compaction_failed.store(true, AtomicOrdering::Relaxed)
}
Err(CompactionError::CollectKeySpaceError(_)) => {
// Cancelled errors are covered by the `Err(e) if e.is_cancel()` branch.
self.compaction_failed.store(true, AtomicOrdering::Relaxed)
}
// Don't change the current value on offload failure or shutdown. We don't want to
// abruptly stall nor resume L0 flushes in these cases.
Err(CompactionError::Offload(_)) => {}
Err(CompactionError::ShuttingDown) => {}
Err(CompactionError::AlreadyRunning(_)) => {}
};
result
@@ -2188,6 +2212,7 @@ impl Timeline {
pub(crate) async fn download_layer(
&self,
layer_file_name: &LayerName,
ctx: &RequestContext,
) -> Result<Option<bool>, super::storage_layer::layer::DownloadError> {
let Some(layer) = self
.find_layer(layer_file_name)
@@ -2201,7 +2226,7 @@ impl Timeline {
return Ok(None);
};
layer.download().await?;
layer.download(ctx).await?;
Ok(Some(true))
}
@@ -2356,6 +2381,9 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
}
/// Returns `true` if the rel_size_v2 config is enabled. NOTE: the write path and read path
/// should look at `get_rel_size_v2_status()` to get the actual status of the timeline. It is
/// possible that the index part persists the state while the config doesn't get persisted.
pub(crate) fn get_rel_size_v2_enabled(&self) -> bool {
let tenant_conf = self.tenant_conf.load();
tenant_conf
@@ -2364,6 +2392,14 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
}
pub(crate) fn get_rel_size_v2_status(&self) -> RelSizeMigration {
self.rel_size_v2_status
.load()
.as_ref()
.map(|s| s.as_ref().clone())
.unwrap_or(RelSizeMigration::Legacy)
}
fn get_compaction_upper_limit(&self) -> usize {
let tenant_conf = self.tenant_conf.load();
tenant_conf
@@ -2624,6 +2660,7 @@ impl Timeline {
attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
create_idempotency: crate::tenant::CreateTimelineIdempotency,
gc_compaction_state: Option<GcCompactionState>,
rel_size_v2_status: Option<RelSizeMigration>,
cancel: CancellationToken,
) -> Arc<Self> {
let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2648,14 +2685,14 @@ impl Timeline {
}
Arc::new_cyclic(|myself| {
let metrics = TimelineMetrics::new(
let metrics = Arc::new(TimelineMetrics::new(
&tenant_shard_id,
&timeline_id,
crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
"mtime",
evictions_low_residence_duration_metric_threshold,
),
);
));
let aux_file_metrics = metrics.aux_file_size_gauge.clone();
let mut result = Timeline {
@@ -2782,6 +2819,8 @@ impl Timeline {
previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap),
heatmap_layers_downloader: Mutex::new(None),
rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status),
};
result.repartition_threshold =
@@ -2837,7 +2876,7 @@ impl Timeline {
"layer flush task",
async move {
let _guard = guard;
let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error).with_scope_timeline(&self_clone);
self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await;
let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
assert!(matches!(*flush_loop_state, FlushLoopState::Running{..}));
@@ -2858,6 +2897,16 @@ impl Timeline {
.schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state)
}
pub(crate) fn update_rel_size_v2_status(
&self,
rel_size_v2_status: RelSizeMigration,
) -> anyhow::Result<()> {
self.rel_size_v2_status
.store(Some(Arc::new(rel_size_v2_status.clone())));
self.remote_client
.schedule_index_upload_for_rel_size_v2_status_update(rel_size_v2_status)
}
pub(crate) fn get_gc_compaction_state(&self) -> Option<GcCompactionState> {
self.gc_compaction_state.load_full().as_ref().clone()
}
@@ -3560,12 +3609,16 @@ impl Timeline {
Ok(layer)
}
pub(super) fn is_previous_heatmap_active(&self) -> bool {
self.previous_heatmap
.load()
.as_ref()
.map(|prev| matches!(**prev, PreviousHeatmap::Active { .. }))
.unwrap_or(false)
pub(super) fn should_keep_previous_heatmap(&self, new_heatmap_end_lsn: Lsn) -> bool {
let crnt = self.previous_heatmap.load();
match crnt.as_deref() {
Some(PreviousHeatmap::Active { end_lsn, .. }) => match end_lsn {
Some(crnt_end_lsn) => *crnt_end_lsn > new_heatmap_end_lsn,
None => true,
},
Some(PreviousHeatmap::Obsolete) => false,
None => false,
}
}
/// The timeline heatmap is a hint to secondary locations from the primary location,
@@ -3593,26 +3646,26 @@ impl Timeline {
// heatamp.
let previous_heatmap = self.previous_heatmap.load();
let visible_non_resident = match previous_heatmap.as_deref() {
Some(PreviousHeatmap::Active { heatmap, read_at }) => {
Some(heatmap.layers.iter().filter_map(|hl| {
let desc: PersistentLayerDesc = hl.name.clone().into();
let layer = guard.try_get_from_key(&desc.key())?;
Some(PreviousHeatmap::Active {
heatmap, read_at, ..
}) => Some(heatmap.all_layers().filter_map(|hl| {
let desc: PersistentLayerDesc = hl.name.clone().into();
let layer = guard.try_get_from_key(&desc.key())?;
if layer.visibility() == LayerVisibilityHint::Covered {
return None;
}
if layer.visibility() == LayerVisibilityHint::Covered {
return None;
}
if layer.is_likely_resident() {
return None;
}
if layer.is_likely_resident() {
return None;
}
if layer.last_evicted_at().happened_after(*read_at) {
return None;
}
if layer.last_evicted_at().happened_after(*read_at) {
return None;
}
Some((desc, hl.metadata.clone(), hl.access_time))
}))
}
Some((desc, hl.metadata.clone(), hl.access_time, hl.cold))
})),
Some(PreviousHeatmap::Obsolete) => None,
None => None,
};
@@ -3627,6 +3680,7 @@ impl Timeline {
layer.layer_desc().clone(),
layer.metadata(),
last_activity_ts,
false, // these layers are not cold
))
}
LayerVisibilityHint::Covered => {
@@ -3653,12 +3707,14 @@ impl Timeline {
// Sort layers in order of which to download first. For a large set of layers to download, we
// want to prioritize those layers which are most likely to still be in the resident many minutes
// or hours later:
// - Cold layers go last for convenience when a human inspects the heatmap.
// - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might
// only exist for a few minutes before being compacted into L1s.
// - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
// the layer is likely to be covered by an image layer during compaction.
layers.sort_by_key(|(desc, _meta, _atime)| {
layers.sort_by_key(|(desc, _meta, _atime, cold)| {
std::cmp::Reverse((
*cold,
!LayerMap::is_l0(&desc.key_range, desc.is_delta),
desc.lsn_range.end,
))
@@ -3666,7 +3722,9 @@ impl Timeline {
let layers = layers
.into_iter()
.map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime))
.map(|(desc, meta, atime, cold)| {
HeatMapLayer::new(desc.layer_name(), meta, atime, cold)
})
.collect();
Some(HeatMapTimeline::new(self.timeline_id, layers))
@@ -3686,6 +3744,7 @@ impl Timeline {
name: vl.layer_desc().layer_name(),
metadata: vl.metadata(),
access_time: now,
cold: true,
};
heatmap_layers.push(hl);
}
@@ -3699,6 +3758,7 @@ impl Timeline {
PreviousHeatmap::Active {
heatmap,
read_at: Instant::now(),
end_lsn: Some(end_lsn),
}
}
@@ -3897,39 +3957,22 @@ impl Timeline {
let guard = timeline.layers.read().await;
let layers = guard.layer_map()?;
let in_memory_layer = layers.find_in_memory_layer(|l| {
let start_lsn = l.get_lsn_range().start;
cont_lsn > start_lsn
});
for range in unmapped_keyspace.ranges.iter() {
let results = layers.range_search(range.clone(), cont_lsn);
match in_memory_layer {
Some(l) => {
let lsn_range = l.get_lsn_range().start..cont_lsn;
fringe.update(
ReadableLayer::InMemoryLayer(l),
unmapped_keyspace.clone(),
lsn_range,
);
}
None => {
for range in unmapped_keyspace.ranges.iter() {
let results = layers.range_search(range.clone(), cont_lsn);
results
.found
.into_iter()
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
(
ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
keyspace_accum.to_keyspace(),
lsn_floor..cont_lsn,
)
})
.for_each(|(layer, keyspace, lsn_range)| {
fringe.update(layer, keyspace, lsn_range)
});
}
}
results
.found
.into_iter()
.map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
(
guard.upgrade(layer),
keyspace_accum.to_keyspace(),
lsn_floor..cont_lsn,
)
})
.for_each(|(layer, keyspace, lsn_range)| {
fringe.update(layer, keyspace, lsn_range)
});
}
// It's safe to drop the layer map lock after planning the next round of reads.
@@ -4202,10 +4245,6 @@ impl Timeline {
// Stall flushes to backpressure if compaction can't keep up. This is propagated up
// to WAL ingestion by having ephemeral layer rolls wait for flushes.
//
// NB: the compaction loop only checks `compaction_threshold` every 20 seconds, so
// we can end up stalling before compaction even starts. Consider making it more
// responsive (e.g. via `watch_level0_deltas`).
if let Some(stall_threshold) = self.get_l0_flush_stall_threshold() {
if l0_count >= stall_threshold {
warn!(
@@ -4693,10 +4732,7 @@ impl Timeline {
));
}
let (dense_ks, sparse_ks) = self
.collect_keyspace(lsn, ctx)
.await
.map_err(CompactionError::CollectKeySpaceError)?;
let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
let sparse_partitioning = SparseKeyPartitioning {
parts: vec![sparse_ks],
@@ -5423,13 +5459,42 @@ pub(crate) enum CompactionError {
Offload(OffloadError),
/// Compaction cannot be done right now; page reconstruction and so on.
#[error("Failed to collect keyspace: {0}")]
CollectKeySpaceError(CollectKeySpaceError),
CollectKeySpaceError(#[from] CollectKeySpaceError),
#[error(transparent)]
Other(anyhow::Error),
#[error("Compaction already running: {0}")]
AlreadyRunning(&'static str),
}
impl CompactionError {
/// Errors that can be ignored, i.e., cancel and shutdown.
pub fn is_cancel(&self) -> bool {
matches!(
self,
Self::ShuttingDown
| Self::AlreadyRunning(_)
| Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled)
| Self::CollectKeySpaceError(CollectKeySpaceError::PageRead(
PageReconstructError::Cancelled
))
| Self::Offload(OffloadError::Cancelled)
)
}
/// Critical errors that indicate data corruption.
pub fn is_critical(&self) -> bool {
matches!(
self,
Self::CollectKeySpaceError(
CollectKeySpaceError::Decode(_)
| CollectKeySpaceError::PageRead(
PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
)
)
)
}
}
impl From<OffloadError> for CompactionError {
fn from(e: OffloadError) -> Self {
match e {
@@ -5439,18 +5504,6 @@ impl From<OffloadError> for CompactionError {
}
}
impl From<CollectKeySpaceError> for CompactionError {
fn from(err: CollectKeySpaceError) -> Self {
match err {
CollectKeySpaceError::Cancelled
| CollectKeySpaceError::PageRead(PageReconstructError::Cancelled) => {
CompactionError::ShuttingDown
}
e => CompactionError::Other(e.into()),
}
}
}
impl From<super::upload_queue::NotInitialized> for CompactionError {
fn from(value: super::upload_queue::NotInitialized) -> Self {
match value {
@@ -5534,6 +5587,14 @@ pub struct DeltaLayerTestDesc {
pub data: Vec<(Key, Lsn, Value)>,
}
#[cfg(test)]
#[derive(Clone)]
pub struct InMemoryLayerTestDesc {
pub lsn_range: Range<Lsn>,
pub data: Vec<(Key, Lsn, Value)>,
pub is_open: bool,
}
#[cfg(test)]
impl DeltaLayerTestDesc {
pub fn new(lsn_range: Range<Lsn>, key_range: Range<Key>, data: Vec<(Key, Lsn, Value)>) -> Self {
@@ -6193,6 +6254,7 @@ impl Timeline {
pub(crate) async fn spawn_download_all_remote_layers(
self: Arc<Self>,
request: DownloadRemoteLayersTaskSpawnRequest,
ctx: &RequestContext,
) -> Result<DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskInfo> {
use pageserver_api::models::DownloadRemoteLayersTaskState;
@@ -6213,6 +6275,10 @@ impl Timeline {
}
let self_clone = Arc::clone(&self);
let task_ctx = ctx.detached_child(
TaskKind::DownloadAllRemoteLayers,
DownloadBehavior::Download,
);
let task_id = task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::DownloadAllRemoteLayers,
@@ -6220,7 +6286,7 @@ impl Timeline {
Some(self.timeline_id),
"download all remote layers task",
async move {
self_clone.download_all_remote_layers(request).await;
self_clone.download_all_remote_layers(request, &task_ctx).await;
let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap();
match &mut *status_guard {
None => {
@@ -6255,6 +6321,7 @@ impl Timeline {
async fn download_all_remote_layers(
self: &Arc<Self>,
request: DownloadRemoteLayersTaskSpawnRequest,
ctx: &RequestContext,
) {
use pageserver_api::models::DownloadRemoteLayersTaskState;
@@ -6311,9 +6378,10 @@ impl Timeline {
let span = tracing::info_span!("download", layer = %next);
let ctx = ctx.attached_child();
js.spawn(
async move {
let res = next.download().await;
let res = next.download(&ctx).await;
(next, res)
}
.instrument(span),
@@ -6541,6 +6609,92 @@ impl Timeline {
Ok(())
}
/// Force create an in-memory layer and place them into the layer map.
#[cfg(test)]
pub(super) async fn force_create_in_memory_layer(
self: &Arc<Timeline>,
mut in_memory: InMemoryLayerTestDesc,
check_start_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
use utils::bin_ser::BeSer;
// Validate LSNs
if let Some(check_start_lsn) = check_start_lsn {
assert!(in_memory.lsn_range.start >= check_start_lsn);
}
let last_record_lsn = self.get_last_record_lsn();
let layer_end_lsn = if in_memory.is_open {
in_memory
.data
.iter()
.map(|(_key, lsn, _value)| lsn)
.max()
.cloned()
} else {
Some(in_memory.lsn_range.end)
};
if let Some(end) = layer_end_lsn {
assert!(
end <= last_record_lsn,
"advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}",
end,
last_record_lsn,
);
}
in_memory.data.iter().for_each(|(_key, lsn, _value)| {
assert!(*lsn >= in_memory.lsn_range.start);
assert!(*lsn < in_memory.lsn_range.end);
});
// Build the batch
in_memory
.data
.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
let data = in_memory
.data
.into_iter()
.map(|(key, lsn, value)| {
let value_size = value.serialized_size().unwrap() as usize;
(key.to_compact(), lsn, value_size, value)
})
.collect::<Vec<_>>();
let batch = SerializedValueBatch::from_values(data);
// Create the in-memory layer and write the batch into it
let layer = InMemoryLayer::create(
self.conf,
self.timeline_id,
self.tenant_shard_id,
in_memory.lsn_range.start,
&self.gate,
ctx,
)
.await
.unwrap();
layer.put_batch(batch, ctx).await.unwrap();
if !in_memory.is_open {
layer.freeze(in_memory.lsn_range.end).await;
}
info!("force created in-memory layer {:?}", in_memory.lsn_range);
// Link the layer to the layer map
{
let mut guard = self.layers.write().await;
let layer_map = guard.open_mut().unwrap();
layer_map.force_insert_in_memory_layer(Arc::new(layer));
}
Ok(())
}
/// Return all keys at the LSN in the image layers
#[cfg(test)]
pub(crate) async fn inspect_image_layers(
@@ -6900,11 +7054,13 @@ mod tests {
use pageserver_api::key::Key;
use pageserver_api::value::Value;
use std::iter::Iterator;
use tracing::Instrument;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use super::HeatMapTimeline;
use crate::context::RequestContextBuilder;
use crate::tenant::harness::{TenantHarness, test_img};
use crate::tenant::layer_map::LayerMap;
use crate::tenant::storage_layer::{Layer, LayerName, LayerVisibilityHint};
@@ -6912,8 +7068,8 @@ mod tests {
use crate::tenant::{PreviousHeatmap, Timeline};
fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) {
assert_eq!(lhs.layers.len(), rhs.layers.len());
let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter());
assert_eq!(lhs.all_layers().count(), rhs.all_layers().count());
let lhs_rhs = lhs.all_layers().zip(rhs.all_layers());
for (l, r) in lhs_rhs {
assert_eq!(l.name, r.name);
assert_eq!(l.metadata, r.metadata);
@@ -6972,12 +7128,14 @@ mod tests {
Lsn(0x10),
14,
&ctx,
Vec::new(), // in-memory layers
delta_layers,
image_layers,
Lsn(0x100),
)
.await
.unwrap();
let ctx = &ctx.with_scope_timeline(&timeline);
// Layer visibility is an input to heatmap generation, so refresh it first
timeline.update_layer_visibility().await.unwrap();
@@ -6990,10 +7148,11 @@ mod tests {
assert_eq!(heatmap.timeline_id, timeline.timeline_id);
// L0 should come last
assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
let heatmap_layers = heatmap.all_layers().collect::<Vec<_>>();
assert_eq!(heatmap_layers.last().unwrap().name, l0_delta.layer_name());
let mut last_lsn = Lsn::MAX;
for layer in &heatmap.layers {
for layer in heatmap_layers {
// Covered layer should be omitted
assert!(layer.name != covered_delta.layer_name());
@@ -7026,6 +7185,7 @@ mod tests {
.store(Some(Arc::new(PreviousHeatmap::Active {
heatmap: heatmap.clone(),
read_at: std::time::Instant::now(),
end_lsn: None,
})));
// Generate a new heatmap and assert that it contains the same layers as the old one.
@@ -7041,8 +7201,12 @@ mod tests {
eprintln!("Downloading {layer} and re-generating heatmap");
let ctx = &RequestContextBuilder::extend(ctx)
.download_behavior(crate::context::DownloadBehavior::Download)
.build();
let _resident = layer
.download_and_keep_resident()
.download_and_keep_resident(ctx)
.instrument(tracing::info_span!(
parent: None,
"download_layer",
@@ -7100,6 +7264,7 @@ mod tests {
Lsn(0x10),
14,
&ctx,
Vec::new(), // in-memory layers
delta_layers,
image_layers,
Lsn(0x100),
@@ -7116,7 +7281,7 @@ mod tests {
.expect("Infallible while timeline is not shut down");
// Both layers should be in the heatmap
assert!(!heatmap.layers.is_empty());
assert!(heatmap.all_layers().count() > 0);
// Now simulate a migration.
timeline
@@ -7124,6 +7289,7 @@ mod tests {
.store(Some(Arc::new(PreviousHeatmap::Active {
heatmap: heatmap.clone(),
read_at: std::time::Instant::now(),
end_lsn: None,
})));
// Evict all the layers in the previous heatmap
@@ -7141,7 +7307,7 @@ mod tests {
.await
.expect("Infallible while timeline is not shut down");
assert!(post_eviction_heatmap.layers.is_empty());
assert_eq!(post_eviction_heatmap.all_layers().count(), 0);
assert!(matches!(
timeline.previous_heatmap.load().as_deref(),
Some(PreviousHeatmap::Obsolete)

View File

@@ -7,11 +7,20 @@
use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
use std::ops::{Deref, Range};
use std::sync::Arc;
use std::time::Instant;
use anyhow::{Context, anyhow, bail};
use super::layer_manager::LayerManager;
use super::{
CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder,
GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration,
Timeline,
};
use anyhow::{Context, anyhow};
use bytes::Bytes;
use enumset::EnumSet;
use fail::fail_point;
use futures::FutureExt;
use itertools::Itertools;
use once_cell::sync::Lazy;
use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE;
@@ -31,15 +40,8 @@ use utils::critical;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use super::layer_manager::LayerManager;
use super::{
CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder,
GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError,
RecordedDuration, Timeline,
};
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
use crate::page_cache;
use crate::pgdatadir_mapping::CollectKeySpaceError;
use crate::statvfs::Statvfs;
use crate::tenant::checks::check_valid_layermap;
use crate::tenant::gc_block::GcBlock;
@@ -213,30 +215,39 @@ impl GcCompactionQueue {
}
/// Trigger an auto compaction.
pub async fn trigger_auto_compaction(&self, timeline: &Arc<Timeline>) {
pub async fn trigger_auto_compaction(
&self,
timeline: &Arc<Timeline>,
) -> Result<(), CompactionError> {
let GcCompactionCombinedSettings {
gc_compaction_enabled,
gc_compaction_initial_threshold_kb,
gc_compaction_ratio_percent,
} = timeline.get_gc_compaction_settings();
if !gc_compaction_enabled {
return;
return Ok(());
}
if self.remaining_jobs_num() > 0 {
// Only schedule auto compaction when the queue is empty
return;
return Ok(());
}
if timeline.ancestor_timeline().is_some() {
// Do not trigger auto compaction for child timelines. We haven't tested
// it enough in staging yet.
return;
return Ok(());
}
if timeline.get_gc_compaction_watermark() == Lsn::INVALID {
// If the gc watermark is not set, we don't need to trigger auto compaction.
// This check is the same as in `gc_compaction_split_jobs` but we don't log
// here and we can also skip the computation of the trigger condition earlier.
return Ok(());
}
let Ok(permit) = CONCURRENT_GC_COMPACTION_TASKS.clone().try_acquire_owned() else {
// Only allow one compaction run at a time. TODO: As we do `try_acquire_owned`, we cannot ensure
// the fairness of the lock across timelines. We should listen for both `acquire` and `l0_compaction_trigger`
// to ensure the fairness while avoid starving other tasks.
return;
return Ok(());
};
let gc_compaction_state = timeline.get_gc_compaction_state();
@@ -246,7 +257,7 @@ impl GcCompactionQueue {
let layers = {
let guard = timeline.layers.read().await;
let layer_map = guard.layer_map().unwrap();
let layer_map = guard.layer_map()?;
layer_map.iter_historic_layers().collect_vec()
};
let mut l2_size: u64 = 0;
@@ -318,11 +329,12 @@ impl GcCompactionQueue {
l1_size, l2_size, l2_lsn, gc_cutoff
);
} else {
info!(
debug!(
"did not trigger auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}",
l1_size, l2_size, l2_lsn, gc_cutoff
);
}
Ok(())
}
/// Notify the caller the job has finished and unblock GC.
@@ -353,8 +365,7 @@ impl GcCompactionQueue {
GcCompactJob::from_compact_options(options.clone()),
options.sub_compaction_max_job_size_mb,
)
.await
.map_err(CompactionError::Other)?;
.await?;
if jobs.is_empty() {
info!("no jobs to run, skipping scheduled compaction task");
self.notify_and_unblock(id);
@@ -433,6 +444,7 @@ impl GcCompactionQueue {
));
};
let has_pending_tasks;
let mut yield_for_l0 = false;
let Some((id, item)) = ({
let mut guard = self.inner.lock().unwrap();
if let Some((id, item)) = guard.queued.pop_front() {
@@ -444,7 +456,7 @@ impl GcCompactionQueue {
None
}
}) else {
self.trigger_auto_compaction(timeline).await;
self.trigger_auto_compaction(timeline).await?;
// Always yield after triggering auto-compaction. Gc-compaction is a low-priority task and we
// have not implemented preemption mechanism yet. We always want to yield it to more important
// tasks if there is one.
@@ -482,13 +494,23 @@ impl GcCompactionQueue {
let mut guard = self.inner.lock().unwrap();
guard.guards.entry(id).or_default().gc_guard = Some(gc_guard);
}
let _ = timeline.compact_with_options(cancel, options, ctx).await?;
let compaction_result =
timeline.compact_with_options(cancel, options, ctx).await?;
self.notify_and_unblock(id);
if compaction_result == CompactionOutcome::YieldForL0 {
yield_for_l0 = true;
}
}
}
GcCompactionQueueItem::SubCompactionJob(options) => {
// TODO: error handling, clear the queue if any task fails?
let _ = timeline.compact_with_options(cancel, options, ctx).await?;
let compaction_result = timeline.compact_with_options(cancel, options, ctx).await?;
if compaction_result == CompactionOutcome::YieldForL0 {
// We will permenantly give up a task if we yield for L0 compaction: the preempted subcompaction job won't be running
// again. This ensures that we don't keep doing duplicated work within gc-compaction. Not directly returning here because
// we need to clean things up before returning from the function.
yield_for_l0 = true;
}
}
GcCompactionQueueItem::Notify(id, l2_lsn) => {
self.notify_and_unblock(id);
@@ -517,7 +539,10 @@ impl GcCompactionQueue {
let mut guard = self.inner.lock().unwrap();
guard.running = None;
}
Ok(if has_pending_tasks {
Ok(if yield_for_l0 {
tracing::info!("give up gc-compaction: yield for L0 compaction");
CompactionOutcome::YieldForL0
} else if has_pending_tasks {
CompactionOutcome::Pending
} else {
CompactionOutcome::Done
@@ -716,17 +741,41 @@ struct CompactionStatisticsNumSize {
#[derive(Debug, Serialize, Default)]
pub struct CompactionStatistics {
/// Delta layer visited (maybe compressed, physical size)
delta_layer_visited: CompactionStatisticsNumSize,
/// Image layer visited (maybe compressed, physical size)
image_layer_visited: CompactionStatisticsNumSize,
/// Delta layer produced (maybe compressed, physical size)
delta_layer_produced: CompactionStatisticsNumSize,
/// Image layer produced (maybe compressed, physical size)
image_layer_produced: CompactionStatisticsNumSize,
num_delta_layer_discarded: usize,
num_image_layer_discarded: usize,
/// Delta layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer)
delta_layer_discarded: CompactionStatisticsNumSize,
/// Image layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer)
image_layer_discarded: CompactionStatisticsNumSize,
num_unique_keys_visited: usize,
/// Delta visited (uncompressed, original size)
wal_keys_visited: CompactionStatisticsNumSize,
/// Image visited (uncompressed, original size)
image_keys_visited: CompactionStatisticsNumSize,
/// Delta produced (uncompressed, original size)
wal_produced: CompactionStatisticsNumSize,
/// Image produced (uncompressed, original size)
image_produced: CompactionStatisticsNumSize,
// Time spent in each phase
time_acquire_lock_secs: f64,
time_analyze_secs: f64,
time_download_layer_secs: f64,
time_main_loop_secs: f64,
time_final_phase_secs: f64,
time_total_secs: f64,
// Summary
/// Ratio of the key-value size before/after gc-compaction.
uncompressed_size_ratio: f64,
/// Ratio of the physical size before/after gc-compaction.
physical_size_ratio: f64,
}
impl CompactionStatistics {
@@ -776,11 +825,13 @@ impl CompactionStatistics {
self.image_produced.num += 1;
self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
}
fn discard_delta_layer(&mut self) {
self.num_delta_layer_discarded += 1;
fn discard_delta_layer(&mut self, original_size: u64) {
self.delta_layer_discarded.num += 1;
self.delta_layer_discarded.size += original_size;
}
fn discard_image_layer(&mut self) {
self.num_image_layer_discarded += 1;
fn discard_image_layer(&mut self, original_size: u64) {
self.image_layer_discarded.num += 1;
self.image_layer_discarded.size += original_size;
}
fn produce_delta_layer(&mut self, size: u64) {
self.delta_layer_produced.num += 1;
@@ -790,6 +841,19 @@ impl CompactionStatistics {
self.image_layer_produced.num += 1;
self.image_layer_produced.size += size;
}
fn finalize(&mut self) {
let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size;
let produced_key_value_size = self.image_produced.size + self.wal_produced.size;
self.uncompressed_size_ratio =
original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0); // avoid div by 0
let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size;
let produced_physical_size = self.image_layer_produced.size
+ self.delta_layer_produced.size
+ self.image_layer_discarded.size
+ self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate
self.physical_size_ratio =
original_physical_size as f64 / (produced_physical_size as f64 + 1.0); // avoid div by 0
}
}
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
@@ -822,9 +886,7 @@ impl Timeline {
.flags
.contains(CompactFlags::EnhancedGcBottomMostCompaction)
{
self.compact_with_gc(cancel, options, ctx)
.await
.map_err(CompactionError::Other)?;
self.compact_with_gc(cancel, options, ctx).await?;
return Ok(CompactionOutcome::Done);
}
@@ -976,18 +1038,12 @@ impl Timeline {
// Suppress errors when cancelled.
Err(_) if self.cancel.is_cancelled() => {}
Err(CompactionError::ShuttingDown) => {}
Err(CompactionError::CollectKeySpaceError(CollectKeySpaceError::Cancelled)) => {}
Err(err) if err.is_cancel() => {}
// Alert on critical errors that indicate data corruption.
Err(
err @ CompactionError::CollectKeySpaceError(
CollectKeySpaceError::Decode(_)
| CollectKeySpaceError::PageRead(
PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
),
),
) => critical!("could not compact, repartitioning keyspace failed: {err:?}"),
Err(err) if err.is_critical() => {
critical!("could not compact, repartitioning keyspace failed: {err:?}");
}
// Log other errors. No partitioning? This is normal, if the timeline was just created
// as an empty timeline. Also in unit tests, when we use the timeline as a simple
@@ -1161,7 +1217,7 @@ impl Timeline {
// - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are:
// - GC, which at worst witnesses us "undelete" a layer that they just deleted.
// - ingestion, which only inserts layers, therefore cannot collide with us.
let resident = layer.download_and_keep_resident().await?;
let resident = layer.download_and_keep_resident(ctx).await?;
let keys_written = resident
.filter(&self.shard_identity, &mut image_layer_writer, ctx)
@@ -1389,14 +1445,14 @@ impl Timeline {
let mut fully_compacted = true;
deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
deltas_to_compact.push(first_level0_delta.download_and_keep_resident(ctx).await?);
for l in level0_deltas_iter {
let lsn_range = &l.layer_desc().lsn_range;
if lsn_range.start != prev_lsn_end {
break;
}
deltas_to_compact.push(l.download_and_keep_resident().await?);
deltas_to_compact.push(l.download_and_keep_resident(ctx).await?);
deltas_to_compact_bytes += l.metadata().file_size;
prev_lsn_end = lsn_range.end;
@@ -2350,12 +2406,19 @@ impl Timeline {
async fn check_compaction_space(
self: &Arc<Self>,
layer_selection: &[Layer],
) -> anyhow::Result<()> {
let available_space = self.check_available_space().await?;
) -> Result<(), CompactionError> {
let available_space = self
.check_available_space()
.await
.map_err(CompactionError::Other)?;
let mut remote_layer_size = 0;
let mut all_layer_size = 0;
for layer in layer_selection {
let needs_download = layer.needs_download().await?;
let needs_download = layer
.needs_download()
.await
.context("failed to check if layer needs download")
.map_err(CompactionError::Other)?;
if needs_download.is_some() {
remote_layer_size += layer.layer_desc().file_size;
}
@@ -2364,14 +2427,14 @@ impl Timeline {
let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */
if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space
{
return Err(anyhow!(
return Err(CompactionError::Other(anyhow!(
"not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}",
available_space,
allocated_space,
all_layer_size,
remote_layer_size,
all_layer_size + remote_layer_size
));
)));
}
Ok(())
}
@@ -2402,7 +2465,7 @@ impl Timeline {
self: &Arc<Self>,
job: GcCompactJob,
sub_compaction_max_job_size_mb: Option<u64>,
) -> anyhow::Result<Vec<GcCompactJob>> {
) -> Result<Vec<GcCompactJob>, CompactionError> {
let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX {
job.compact_lsn_range.end
} else {
@@ -2553,7 +2616,7 @@ impl Timeline {
cancel: &CancellationToken,
options: CompactOptions,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<CompactionOutcome, CompactionError> {
let sub_compaction = options.sub_compaction;
let job = GcCompactJob::from_compact_options(options.clone());
if sub_compaction {
@@ -2575,7 +2638,7 @@ impl Timeline {
if jobs_len == 0 {
info!("no jobs to run, skipping gc bottom-most compaction");
}
return Ok(());
return Ok(CompactionOutcome::Done);
}
self.compact_with_gc_inner(cancel, job, ctx).await
}
@@ -2585,19 +2648,24 @@ impl Timeline {
cancel: &CancellationToken,
job: GcCompactJob,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<CompactionOutcome, CompactionError> {
// Block other compaction/GC tasks from running for now. GC-compaction could run along
// with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
// Note that we already acquired the compaction lock when the outer `compact` function gets called.
let timer = Instant::now();
let begin_timer = timer;
let gc_lock = async {
tokio::select! {
guard = self.gc_lock.lock() => Ok(guard),
// TODO: refactor to CompactionError to correctly pass cancelled error
_ = cancel.cancelled() => Err(anyhow!("cancelled")),
_ = cancel.cancelled() => Err(CompactionError::ShuttingDown),
}
};
let time_acquire_lock = timer.elapsed();
let timer = Instant::now();
let gc_lock = crate::timed(
gc_lock,
"acquires gc lock",
@@ -2649,7 +2717,7 @@ impl Timeline {
tracing::warn!(
"no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"
);
return Ok(());
return Ok(CompactionOutcome::Skipped);
}
real_gc_cutoff
} else {
@@ -2687,7 +2755,7 @@ impl Timeline {
"no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}",
gc_cutoff
);
return Ok(());
return Ok(CompactionOutcome::Done);
};
// Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below
// the min_layer_lsn computed as below will be filtered out and the data will be accessed using the normal read path, as if
@@ -2708,7 +2776,7 @@ impl Timeline {
"no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}",
compact_lsn_range.end
);
return Ok(());
return Ok(CompactionOutcome::Done);
};
// Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
// layers to compact.
@@ -2734,7 +2802,7 @@ impl Timeline {
"no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}",
gc_cutoff, compact_key_range.start, compact_key_range.end
);
return Ok(());
return Ok(CompactionOutcome::Done);
}
retain_lsns_below_horizon.sort();
GcCompactionJobDescription {
@@ -2787,6 +2855,9 @@ impl Timeline {
has_data_below,
);
let time_analyze = timer.elapsed();
let timer = Instant::now();
for layer in &job_desc.selected_layers {
debug!("read layer: {}", layer.layer_desc().key());
}
@@ -2815,10 +2886,10 @@ impl Timeline {
.map(|layer| layer.layer_desc().layer_name())
.collect_vec();
if let Some(err) = check_valid_layermap(&layer_names) {
bail!(
return Err(CompactionError::Other(anyhow!(
"gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss",
err
);
)));
}
// The maximum LSN we are processing in this compaction loop
let end_lsn = job_desc
@@ -2833,11 +2904,33 @@ impl Timeline {
let mut total_downloaded_size = 0;
let mut total_layer_size = 0;
for layer in &job_desc.selected_layers {
if layer.needs_download().await?.is_some() {
if layer
.needs_download()
.await
.context("failed to check if layer needs download")
.map_err(CompactionError::Other)?
.is_some()
{
total_downloaded_size += layer.layer_desc().file_size;
}
total_layer_size += layer.layer_desc().file_size;
let resident_layer = layer.download_and_keep_resident().await?;
if cancel.is_cancelled() {
return Err(CompactionError::ShuttingDown);
}
let should_yield = self
.l0_compaction_trigger
.notified()
.now_or_never()
.is_some();
if should_yield {
tracing::info!("preempt gc-compaction when downloading layers: too many L0 layers");
return Ok(CompactionOutcome::YieldForL0);
}
let resident_layer = layer
.download_and_keep_resident(ctx)
.await
.context("failed to download and keep resident layer")
.map_err(CompactionError::Other)?;
downloaded_layers.push(resident_layer);
}
info!(
@@ -2848,19 +2941,36 @@ impl Timeline {
);
for resident_layer in &downloaded_layers {
if resident_layer.layer_desc().is_delta() {
let layer = resident_layer.get_as_delta(ctx).await?;
let layer = resident_layer
.get_as_delta(ctx)
.await
.context("failed to get delta layer")
.map_err(CompactionError::Other)?;
delta_layers.push(layer);
} else {
let layer = resident_layer.get_as_image(ctx).await?;
let layer = resident_layer
.get_as_image(ctx)
.await
.context("failed to get image layer")
.map_err(CompactionError::Other)?;
image_layers.push(layer);
}
}
let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?;
let (dense_ks, sparse_ks) = self
.collect_gc_compaction_keyspace()
.await
.context("failed to collect gc compaction keyspace")
.map_err(CompactionError::Other)?;
let mut merge_iter = FilterIterator::create(
MergeIterator::create(&delta_layers, &image_layers, ctx),
dense_ks,
sparse_ks,
)?;
)
.context("failed to create filter iterator")
.map_err(CompactionError::Other)?;
let time_download_layer = timer.elapsed();
let timer = Instant::now();
// Step 2: Produce images+deltas.
let mut accumulated_values = Vec::new();
@@ -2880,7 +2990,9 @@ impl Timeline {
&self.gate,
ctx,
)
.await?,
.await
.context("failed to create image layer writer")
.map_err(CompactionError::Other)?,
)
} else {
None
@@ -2893,7 +3005,9 @@ impl Timeline {
lowest_retain_lsn..end_lsn,
self.get_compaction_target_size(),
)
.await?;
.await
.context("failed to create delta layer writer")
.map_err(CompactionError::Other)?;
#[derive(Default)]
struct RewritingLayers {
@@ -2933,9 +3047,28 @@ impl Timeline {
// the key and LSN range are determined. However, to keep things simple here, we still
// create this writer, and discard the writer in the end.
while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? {
let mut keys_processed = 0;
while let Some(((key, lsn, val), desc)) = merge_iter
.next_with_trace()
.await
.context("failed to get next key-value pair")
.map_err(CompactionError::Other)?
{
if cancel.is_cancelled() {
return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
return Err(CompactionError::ShuttingDown);
}
keys_processed += 1;
if keys_processed % 1000 == 0 {
let should_yield = self
.l0_compaction_trigger
.notified()
.now_or_never()
.is_some();
if should_yield {
tracing::info!("preempt gc-compaction in the main loop: too many L0 layers");
return Ok(CompactionOutcome::YieldForL0);
}
}
if self.shard_identity.is_key_disposable(&key) {
// If this shard does not need to store this key, simply skip it.
@@ -2967,7 +3100,9 @@ impl Timeline {
&self.gate,
ctx,
)
.await?,
.await
.context("failed to create delta layer writer")
.map_err(CompactionError::Other)?,
);
}
rewriter.before.as_mut().unwrap()
@@ -2983,14 +3118,20 @@ impl Timeline {
&self.gate,
ctx,
)
.await?,
.await
.context("failed to create delta layer writer")
.map_err(CompactionError::Other)?,
);
}
rewriter.after.as_mut().unwrap()
} else {
unreachable!()
};
rewriter.put_value(key, lsn, val, ctx).await?;
rewriter
.put_value(key, lsn, val, ctx)
.await
.context("failed to put value")
.map_err(CompactionError::Other)?;
continue;
}
match val {
@@ -3013,9 +3154,13 @@ impl Timeline {
&job_desc.retain_lsns_below_horizon,
COMPACTION_DELTA_THRESHOLD,
get_ancestor_image(self, *last_key, ctx, has_data_below, lowest_retain_lsn)
.await?,
.await
.context("failed to get ancestor image")
.map_err(CompactionError::Other)?,
)
.await?;
.await
.context("failed to generate key retention")
.map_err(CompactionError::Other)?;
retention
.pipe_to(
*last_key,
@@ -3025,7 +3170,9 @@ impl Timeline {
&self.gate,
ctx,
)
.await?;
.await
.context("failed to pipe to delta layer writer")
.map_err(CompactionError::Other)?;
accumulated_values.clear();
*last_key = key;
accumulated_values.push((key, lsn, val));
@@ -3043,9 +3190,14 @@ impl Timeline {
job_desc.gc_cutoff,
&job_desc.retain_lsns_below_horizon,
COMPACTION_DELTA_THRESHOLD,
get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn).await?,
get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn)
.await
.context("failed to get ancestor image")
.map_err(CompactionError::Other)?,
)
.await?;
.await
.context("failed to generate key retention")
.map_err(CompactionError::Other)?;
retention
.pipe_to(
last_key,
@@ -3055,21 +3207,36 @@ impl Timeline {
&self.gate,
ctx,
)
.await?;
.await
.context("failed to pipe to delta layer writer")
.map_err(CompactionError::Other)?;
// end: move the above part to the loop body
let time_main_loop = timer.elapsed();
let timer = Instant::now();
let mut rewrote_delta_layers = Vec::new();
for (key, writers) in delta_layer_rewriters {
if let Some(delta_writer_before) = writers.before {
let (desc, path) = delta_writer_before
.finish(job_desc.compaction_key_range.start, ctx)
.await?;
let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
.await
.context("failed to finish delta layer writer")
.map_err(CompactionError::Other)?;
let layer = Layer::finish_creating(self.conf, self, desc, &path)
.context("failed to finish creating delta layer")
.map_err(CompactionError::Other)?;
rewrote_delta_layers.push(layer);
}
if let Some(delta_writer_after) = writers.after {
let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?;
let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
let (desc, path) = delta_writer_after
.finish(key.key_range.end, ctx)
.await
.context("failed to finish delta layer writer")
.map_err(CompactionError::Other)?;
let layer = Layer::finish_creating(self.conf, self, desc, &path)
.context("failed to finish creating delta layer")
.map_err(CompactionError::Other)?;
rewrote_delta_layers.push(layer);
}
}
@@ -3084,7 +3251,9 @@ impl Timeline {
let end_key = job_desc.compaction_key_range.end;
writer
.finish_with_discard_fn(self, ctx, end_key, discard)
.await?
.await
.context("failed to finish image layer writer")
.map_err(CompactionError::Other)?
} else {
drop(writer);
Vec::new()
@@ -3096,7 +3265,9 @@ impl Timeline {
let produced_delta_layers = if !dry_run {
delta_layer_writer
.finish_with_discard_fn(self, ctx, discard)
.await?
.await
.context("failed to finish delta layer writer")
.map_err(CompactionError::Other)?
} else {
drop(delta_layer_writer);
Vec::new()
@@ -3108,6 +3279,13 @@ impl Timeline {
let mut keep_layers = HashSet::new();
let produced_delta_layers_len = produced_delta_layers.len();
let produced_image_layers_len = produced_image_layers.len();
let layer_selection_by_key = job_desc
.selected_layers
.iter()
.map(|l| (l.layer_desc().key(), l.layer_desc().clone()))
.collect::<HashMap<_, _>>();
for action in produced_delta_layers {
match action {
BatchWriterResult::Produced(layer) => {
@@ -3121,8 +3299,16 @@ impl Timeline {
if cfg!(debug_assertions) {
info!("discarded delta layer: {}", l);
}
if let Some(layer_desc) = layer_selection_by_key.get(&l) {
stat.discard_delta_layer(layer_desc.file_size());
} else {
tracing::warn!(
"discarded delta layer not in layer_selection: {}, produced a layer outside of the compaction key range?",
l
);
stat.discard_delta_layer(0);
}
keep_layers.insert(l);
stat.discard_delta_layer();
}
}
}
@@ -3131,6 +3317,9 @@ impl Timeline {
"produced rewritten delta layer: {}",
layer.layer_desc().key()
);
// For now, we include rewritten delta layer size in the "produce_delta_layer". We could
// make it a separate statistics in the future.
stat.produce_delta_layer(layer.layer_desc().file_size());
}
compact_to.extend(rewrote_delta_layers);
for action in produced_image_layers {
@@ -3142,8 +3331,16 @@ impl Timeline {
}
BatchWriterResult::Discarded(l) => {
debug!("discarded image layer: {}", l);
if let Some(layer_desc) = layer_selection_by_key.get(&l) {
stat.discard_image_layer(layer_desc.file_size());
} else {
tracing::warn!(
"discarded image layer not in layer_selection: {}, produced a layer outside of the compaction key range?",
l
);
stat.discard_image_layer(0);
}
keep_layers.insert(l);
stat.discard_image_layer();
}
}
}
@@ -3176,7 +3373,9 @@ impl Timeline {
&layer.layer_desc().key_range,
&job_desc.compaction_key_range,
) {
bail!("violated constraint: image layer outside of compaction key range");
return Err(CompactionError::Other(anyhow!(
"violated constraint: image layer outside of compaction key range"
)));
}
if !fully_contains(
&job_desc.compaction_key_range,
@@ -3189,13 +3388,25 @@ impl Timeline {
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
let time_final_phase = timer.elapsed();
stat.time_final_phase_secs = time_final_phase.as_secs_f64();
stat.time_main_loop_secs = time_main_loop.as_secs_f64();
stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64();
stat.time_download_layer_secs = time_download_layer.as_secs_f64();
stat.time_analyze_secs = time_analyze.as_secs_f64();
stat.time_total_secs = begin_timer.elapsed().as_secs_f64();
stat.finalize();
info!(
"gc-compaction statistics: {}",
serde_json::to_string(&stat)?
serde_json::to_string(&stat)
.context("failed to serialize gc-compaction statistics")
.map_err(CompactionError::Other)?
);
if dry_run {
return Ok(());
return Ok(CompactionOutcome::Done);
}
info!(
@@ -3230,10 +3441,10 @@ impl Timeline {
// the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are
// in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails.
if let Some(err) = check_valid_layermap(&final_layers) {
bail!(
return Err(CompactionError::Other(anyhow!(
"gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss",
err
);
)));
}
// Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only
@@ -3285,7 +3496,9 @@ impl Timeline {
// find_gc_cutoffs will try accessing things below the cutoff. TODO: ideally, this should
// be batched into `schedule_compaction_update`.
let disk_consistent_lsn = self.disk_consistent_lsn.load();
self.schedule_uploads(disk_consistent_lsn, None)?;
self.schedule_uploads(disk_consistent_lsn, None)
.context("failed to schedule uploads")
.map_err(CompactionError::Other)?;
// If a layer gets rewritten throughout gc-compaction, we need to keep that layer only in `compact_to` instead
// of `compact_from`.
let compact_from = {
@@ -3312,7 +3525,7 @@ impl Timeline {
drop(gc_lock);
Ok(())
Ok(CompactionOutcome::Done)
}
}
@@ -3418,6 +3631,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
async fn downcast_delta_layer(
&self,
layer: &OwnArc<PersistentLayerDesc>,
ctx: &RequestContext,
) -> anyhow::Result<Option<ResidentDeltaLayer>> {
// this is a lot more complex than a simple downcast...
if layer.is_delta() {
@@ -3425,7 +3639,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
let guard = self.timeline.layers.read().await;
guard.get_from_desc(layer)
};
let result = l.download_and_keep_resident().await?;
let result = l.download_and_keep_resident(ctx).await?;
Ok(Some(ResidentDeltaLayer(result)))
} else {

View File

@@ -11,6 +11,7 @@ use utils::id::TimelineId;
use utils::{crashsafe, fs_ext, pausable_failpoint};
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::remote_timeline_client::{
@@ -291,10 +292,11 @@ impl DeleteTimelineFlow {
timeline_id: TimelineId,
local_metadata: &TimelineMetadata,
remote_client: RemoteTimelineClient,
ctx: &RequestContext,
) -> anyhow::Result<()> {
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
// RemoteTimelineClient is the only functioning part.
let timeline = tenant
let (timeline, _timeline_ctx) = tenant
.create_timeline_struct(
timeline_id,
local_metadata,
@@ -306,6 +308,8 @@ impl DeleteTimelineFlow {
CreateTimelineCause::Delete,
crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here
None, // doesn't matter what we put here
None, // doesn't matter what we put here
ctx,
)
.context("create_timeline_struct")?;

View File

@@ -12,6 +12,7 @@ use utils::completion;
use utils::generation::Generation;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use utils::sync::gate::GateError;
use super::layer_manager::LayerManager;
use super::{FlushLayerError, Timeline};
@@ -363,14 +364,25 @@ pub(super) async fn prepare(
let mut tasks = tokio::task::JoinSet::new();
let limiter = Arc::new(Semaphore::new(options.copy_concurrency.get()));
let cancel_eval = CancellationToken::new();
for adopted in rest_of_historic {
let limiter = limiter.clone();
let timeline = detached.clone();
let cancel_eval = cancel_eval.clone();
tasks.spawn(
async move {
let _permit = limiter.acquire().await;
let _permit = tokio::select! {
permit = limiter.acquire() => {
permit
}
// Wait for the cancellation here instead of letting the entire task be cancelled.
// Cancellations are racy in that they might leave layers on disk.
_ = cancel_eval.cancelled() => {
Err(Error::ShuttingDown)?
}
};
let (owned, did_hardlink) = remote_copy(
&adopted,
&timeline,
@@ -386,7 +398,22 @@ pub(super) async fn prepare(
);
}
fn delete_layers(timeline: &Timeline, layers: Vec<Layer>) -> Result<(), Error> {
// We are deleting layers, so we must hold the gate
let _gate = timeline.gate.enter().map_err(|e| match e {
GateError::GateClosed => Error::ShuttingDown,
})?;
{
layers.into_iter().for_each(|l: Layer| {
l.delete_on_drop();
std::mem::drop(l);
});
}
Ok(())
}
let mut should_fsync = false;
let mut first_err = None;
while let Some(res) = tasks.join_next().await {
match res {
Ok(Ok((owned, did_hardlink))) => {
@@ -395,13 +422,24 @@ pub(super) async fn prepare(
}
new_layers.push(owned);
}
// Don't stop the evaluation on errors, so that we get the full set of hardlinked layers to delete.
Ok(Err(failed)) => {
return Err(failed);
cancel_eval.cancel();
first_err.get_or_insert(failed);
}
Err(je) => {
cancel_eval.cancel();
first_err.get_or_insert(Error::Prepare(je.into()));
}
Err(je) => return Err(Error::Prepare(je.into())),
}
}
if let Some(failed) = first_err {
delete_layers(detached, new_layers)?;
return Err(failed);
}
// fsync directory again if we hardlinked something
if should_fsync {
fsync_timeline_dir(detached, ctx).await;
@@ -592,7 +630,7 @@ async fn copy_lsn_prefix(
.with_context(|| format!("prepare to copy lsn prefix of ancestors {layer}"))
.map_err(Error::Prepare)?;
let resident = layer.download_and_keep_resident().await.map_err(|e| {
let resident = layer.download_and_keep_resident(ctx).await.map_err(|e| {
if e.is_cancelled() {
Error::ShuttingDown
} else {
@@ -650,6 +688,11 @@ async fn remote_copy(
let conf = adoptee.conf;
let file_name = adopted.layer_desc().layer_name();
// We don't want to shut the timeline down during this operation because we do `delete_on_drop` below
let _gate = adoptee.gate.enter().map_err(|e| match e {
GateError::GateClosed => Error::ShuttingDown,
})?;
// depending if Layer::keep_resident, do a hardlink
let did_hardlink;
let owned = if let Some(adopted_resident) = adopted.keep_resident().await {
@@ -661,8 +704,32 @@ async fn remote_copy(
&file_name,
&metadata.generation,
);
std::fs::hard_link(adopted_path, &adoptee_path)
.map_err(|e| Error::launder(e.into(), Error::Prepare))?;
match std::fs::hard_link(adopted_path, &adoptee_path) {
Ok(()) => {}
Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
// In theory we should not get into this situation as we are doing cleanups of the layer file after errors.
// However, we don't do cleanups for errors past `prepare`, so there is the slight chance to get to this branch.
// Double check that the file is orphan (probably from an earlier attempt), then delete it
let key = file_name.clone().into();
if adoptee.layers.read().await.contains_key(&key) {
// We are supposed to filter out such cases before coming to this function
return Err(Error::Prepare(anyhow::anyhow!(
"layer file {file_name} already present and inside layer map"
)));
}
tracing::info!("Deleting orphan layer file to make way for hard linking");
// Delete orphan layer file and try again, to ensure this layer has a well understood source
std::fs::remove_file(adopted_path)
.map_err(|e| Error::launder(e.into(), Error::Prepare))?;
std::fs::hard_link(adopted_path, &adoptee_path)
.map_err(|e| Error::launder(e.into(), Error::Prepare))?;
}
Err(e) => {
return Err(Error::launder(e.into(), Error::Prepare));
}
};
did_hardlink = true;
Layer::for_resident(conf, adoptee, adoptee_path, file_name, metadata).drop_eviction_guard()
} else {
@@ -670,12 +737,21 @@ async fn remote_copy(
Layer::for_evicted(conf, adoptee, file_name, metadata)
};
let layer = adoptee
let layer = match adoptee
.remote_client
.copy_timeline_layer(adopted, &owned, cancel)
.await
.map(move |()| owned)
.map_err(|e| Error::launder(e, Error::Prepare))?;
{
Ok(()) => owned,
Err(e) => {
{
// Clean up the layer so that on a retry we don't get errors that the file already exists
owned.delete_on_drop();
std::mem::drop(owned);
}
return Err(Error::launder(e, Error::Prepare));
}
};
Ok((layer, did_hardlink))
}

View File

@@ -93,7 +93,8 @@ impl Timeline {
}
}
let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn);
let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn)
.with_scope_timeline(&self);
loop {
let policy = self.get_eviction_policy();
let cf = self

View File

@@ -1,5 +1,4 @@
//! An efficient way to keep the timeline gate open without preventing
//! timeline shutdown for longer than a single call to a timeline method.
//! A cache for [`crate::tenant::mgr`]+`Tenant::get_timeline`+`Timeline::gate.enter()`.
//!
//! # Motivation
//!
@@ -19,27 +18,32 @@
//! we hold the Timeline gate open while we're invoking the method on the
//! Timeline object.
//!
//! However, we want to avoid the overhead of entering the gate for every
//! method invocation.
//!
//! Further, for shard routing, we want to avoid calling the tenant manager to
//! resolve the shard for every request. Instead, we want to cache the
//! routing result so we can bypass the tenant manager for all subsequent requests
//! that get routed to that shard.
//! We want to avoid the overhead of doing, for each incoming request,
//! - tenant manager lookup (global rwlock + btreemap lookup for shard routing)
//! - cloning the `Arc<Timeline>` out of the tenant manager so we can
//! release the mgr rwlock before doing any request processing work
//! - re-entering the Timeline gate for each Timeline method invocation.
//!
//! Regardless of how we accomplish the above, it should not
//! prevent the Timeline from shutting down promptly.
//!
//!
//! # Design
//!
//! ## Data Structures
//!
//! There are three user-facing data structures:
//! There are two concepts expressed as associated types in the `Types` trait:
//! - `TenantManager`: the thing that performs the expensive work. It produces
//! a `Timeline` object, which is the other associated type.
//! - `Timeline`: the item that we cache for fast (TenantTimelineId,ShardSelector) lookup.
//!
//! There are three user-facing data structures exposed by this module:
//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
//! - `Handle`: a smart pointer that derefs to the Types::Timeline.
//! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows
//! trying to ugprade back to a `Handle`, guaranteeing it's the same `Timeline` *object*.
//! trying to ugprade back to a `Handle`. If successful, a re-upgraded Handle will always
//! point to the same cached `Types::Timeline`. Upgrades never invoke the `TenantManager`.
//!
//! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`.
//! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`.
@@ -64,11 +68,14 @@
//!
//! To dispatch a request, the page service connection calls `Cache::get`.
//!
//! A cache miss means we consult the tenant manager for shard routing,
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and store it in the the
//! `Arc<Mutex<HandleInner>>>`. A weak ref is stored in the `Cache`
//! A cache miss means we call Types::TenantManager::resolve for shard routing,
//! cloning the `Arc<Timeline>` out of it, and entering the gate. The result of
//! resolve() is the object we want to cache, and return `Handle`s to for subseqent `Cache::get` calls.
//!
//! We wrap the object returned from resolve() in an `Arc` and store that inside the
//! `Arc<Mutex<HandleInner>>>`. A weak ref to the HandleInner is stored in the `Cache`
//! and a strong ref in the `PerTimelineState`.
//! A strong ref is returned wrapped in a `Handle`.
//! Another strong ref is returned wrapped in a `Handle`.
//!
//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
//! and find the weak ref in the cache.
@@ -78,51 +85,51 @@
//! While a request is batching, the `Handle` is downgraded to a `WeakHandle`.
//! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle`
//! and the request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
//! It then drops the `Handle`, and thus the `Arc<Mutex<HandleInner>>` inside it.
//!
//! # Performance
//!
//! Remember from the introductory section:
//!
//! > However, we want to avoid the overhead of entering the gate for every
//! > method invocation.
//! > We want to avoid the overhead of doing, for each incoming request,
//! > - tenant manager lookup (global rwlock + btreemap lookup for shard routing)
//! > - cloning the `Arc<Timeline>` out of the tenant manager so we can
//! > release the mgr rwlock before doing any request processing work
//! > - re-entering the Timeline gate for each Timeline method invocation.
//!
//! Why do we want to avoid that?
//! Because the gate is a shared location in memory and entering it involves
//! bumping refcounts, which leads to cache contention if done frequently
//! from multiple cores in parallel.
//! All of these boil down to some state that is either globally shared among all shards
//! or state shared among all tasks that serve a particular timeline.
//! It is either protected by RwLock or manipulated via atomics.
//! Even atomics are costly when shared across multiple cores.
//! So, we want to avoid any permanent need for coordination between page_service tasks.
//!
//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`.
//! That `Arc` is private to the `HandleInner` and hence to the connection.
//! The solution is to add indirection: we wrap the Types::Timeline object that is
//! returned by Types::TenantManager into an Arc that is rivate to the `HandleInner`
//! and hence to the single Cache / page_service connection.
//! (Review the "Data Structures" section if that is unclear to you.)
//!
//! A `WeakHandle` is a weak ref to the `HandleInner`.
//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and
//! further acquire an additional strong ref to the `Arc<GateGuard>` inside it.
//! Again, this manipulation of ref counts is is cheap because `Arc` is private to the connection.
//!
//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc<GateGuard>`.
//! Again, this is cheap because the `Arc` is private to the connection.
//! When upgrading a `WeakHandle`, we upgrade its weak to a strong ref (of the `Mutex<HandleInner>`),
//! lock the mutex, take out a clone of the `Arc<Types::Timeline>`, and drop the Mutex.
//! The Mutex is not contended because it is private to the connection.
//! And again, the `Arc<Types::Timeline>` clone is cheap because that wrapper
//! Arc's refcounts are private to the connection.
//!
//! Downgrading drops these two Arcs, which again, manipulates refcounts that are private to the connection.
//!
//! In addition to the GateGuard, we need to provide `Deref<Target=Timeline>` impl.
//! For this, both `Handle` need infallible access to an `Arc<Timeline>`.
//! We could clone the `Arc<Timeline>` when upgrading a `WeakHandle`, but that would cause contention
//! on the shared memory location that trakcs the refcount of the `Arc<Timeline>`.
//! Instead, we wrap the `Arc<Timeline>` into another `Arc`.
//! so that we can clone it cheaply when upgrading a `WeakHandle`.
//!
//! # Shutdown
//!
//! The attentive reader may have noticed the following reference cycle around the `Arc<Timeline>`:
//!
//! ```text
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> Timeline
//! ```
//!
//! Further, there is this cycle:
//!
//! ```text
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline
//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> GateGuard --keepalive--> Timeline
//! ```
//!
//! The former cycle is a memory leak if not broken.
@@ -135,9 +142,12 @@
//! - Timeline shutdown (=> `PerTimelineState::shutdown`)
//! - Connection shutdown (=> dropping the `Cache`).
//!
//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to
//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the
//! `Arc<GateGuard>`.
//! Both transition the `HandleInner` from [`HandleInner::Open`] to
//! [`HandleInner::ShutDown`], which drops the only long-lived
//! `Arc<Types::Timeline>`. Once the last short-lived Arc<Types::Timeline>
//! is dropped, the `Types::Timeline` gets dropped and thereby
//! the `GateGuard` and the `Arc<Timeline>` that it stores,
//! thereby breaking both cycles.
//!
//! `PerTimelineState::shutdown` drops all the `HandleInners` it contains,
//! thereby breaking the cycle.
@@ -216,7 +226,7 @@ use crate::tenant::mgr::ShardSelector;
pub(crate) trait Types: Sized + std::fmt::Debug {
type TenantManagerError: Sized + std::fmt::Debug;
type TenantManager: TenantManager<Self> + Sized;
type Timeline: ArcTimeline<Self> + Sized;
type Timeline: Timeline<Self> + Sized;
}
/// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
@@ -261,20 +271,15 @@ pub(crate) struct ShardTimelineId {
/// See module-level comment.
pub(crate) struct Handle<T: Types> {
timeline: Arc<T::Timeline>,
#[allow(dead_code)] // the field exists to keep the gate open
gate_guard: Arc<utils::sync::gate::GateGuard>,
inner: Arc<Mutex<HandleInner<T>>>,
open: Arc<T::Timeline>,
}
pub(crate) struct WeakHandle<T: Types> {
inner: Weak<Mutex<HandleInner<T>>>,
}
enum HandleInner<T: Types> {
KeepingTimelineGateOpen {
#[allow(dead_code)]
gate_guard: Arc<utils::sync::gate::GateGuard>,
timeline: Arc<T::Timeline>,
},
Open(Arc<T::Timeline>),
ShutDown,
}
@@ -307,8 +312,7 @@ pub(crate) trait TenantManager<T: Types> {
}
/// Abstract view of an [`Arc<Timeline>`], for testability.
pub(crate) trait ArcTimeline<T: Types>: Clone {
fn gate(&self) -> &utils::sync::gate::Gate;
pub(crate) trait Timeline<T: Types> {
fn shard_timeline_id(&self) -> ShardTimelineId;
fn get_shard_identity(&self) -> &ShardIdentity;
fn per_timeline_state(&self) -> &PerTimelineState<T>;
@@ -318,7 +322,6 @@ pub(crate) trait ArcTimeline<T: Types>: Clone {
#[derive(Debug)]
pub(crate) enum GetError<T: Types> {
TenantManager(T::TenantManagerError),
TimelineGateClosed,
PerTimelineStateShutDown,
}
@@ -434,21 +437,9 @@ impl<T: Types> Cache<T> {
}
trace!("creating new HandleInner");
let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen {
gate_guard: Arc::new(
// this enter() is expensive in production code because
// it hits the global Arc<Timeline>::gate refcounts
match timeline.gate().enter() {
Ok(guard) => guard,
Err(_) => {
return Err(GetError::TimelineGateClosed);
}
},
),
// this clone is expensive in production code because
// it hits the global Arc<Timeline>::clone refcounts
timeline: Arc::new(timeline.clone()),
}));
let timeline = Arc::new(timeline);
let handle_inner_arc =
Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline))));
let handle_weak = WeakHandle {
inner: Arc::downgrade(&handle_inner_arc),
};
@@ -503,18 +494,10 @@ impl<T: Types> WeakHandle<T> {
};
let lock_guard = inner.lock().expect("poisoned");
match &*lock_guard {
HandleInner::KeepingTimelineGateOpen {
timeline,
gate_guard,
} => {
let gate_guard = Arc::clone(gate_guard);
let timeline = Arc::clone(timeline);
HandleInner::Open(open) => {
let open = Arc::clone(open);
drop(lock_guard);
Ok(Handle {
timeline,
gate_guard,
inner,
})
Ok(Handle { open, inner })
}
HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown),
}
@@ -528,7 +511,7 @@ impl<T: Types> WeakHandle<T> {
impl<T: Types> std::ops::Deref for Handle<T> {
type Target = T::Timeline;
fn deref(&self) -> &Self::Target {
&self.timeline
&self.open
}
}
@@ -545,7 +528,7 @@ impl<T: Types> PerTimelineState<T> {
/// to the [`Types::Timeline`] that embeds this per-timeline state.
/// Even if [`TenantManager::resolve`] would still resolve to it.
///
/// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive.
/// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`Types::Timeline`] alive.
/// That's ok because they're short-lived. See module-level comment for details.
#[instrument(level = "trace", skip_all)]
pub(super) fn shutdown(&self) {
@@ -611,7 +594,7 @@ impl<T: Types> Drop for Cache<T> {
impl<T: Types> HandleInner<T> {
fn shutdown(&mut self) -> Option<Arc<T::Timeline>> {
match std::mem::replace(self, HandleInner::ShutDown) {
HandleInner::KeepingTimelineGateOpen { timeline, .. } => Some(timeline),
HandleInner::Open(timeline) => Some(timeline),
HandleInner::ShutDown => {
// Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown
// may do it concurrently, but locking rules disallow holding per-timeline-state lock and
@@ -631,6 +614,7 @@ mod tests {
use pageserver_api::reltag::RelTag;
use pageserver_api::shard::ShardStripeSize;
use utils::shard::ShardCount;
use utils::sync::gate::GateGuard;
use super::*;
@@ -641,7 +625,7 @@ mod tests {
impl Types for TestTypes {
type TenantManagerError = anyhow::Error;
type TenantManager = StubManager;
type Timeline = Arc<StubTimeline>;
type Timeline = Entered;
}
struct StubManager {
@@ -656,17 +640,19 @@ mod tests {
myself: Weak<StubTimeline>,
}
struct Entered {
timeline: Arc<StubTimeline>,
#[allow(dead_code)] // it's stored here to keep the gate open
gate_guard: Arc<GateGuard>,
}
impl StubTimeline {
fn getpage(&self) {
// do nothing
}
}
impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
fn gate(&self) -> &utils::sync::gate::Gate {
&self.gate
}
impl Timeline<TestTypes> for Entered {
fn shard_timeline_id(&self) -> ShardTimelineId {
ShardTimelineId {
shard_index: self.shard.shard_index(),
@@ -688,20 +674,34 @@ mod tests {
&self,
timeline_id: TimelineId,
shard_selector: ShardSelector,
) -> anyhow::Result<Arc<StubTimeline>> {
) -> anyhow::Result<Entered> {
for timeline in &self.shards {
if timeline.id == timeline_id {
let enter_gate = || {
let gate_guard = timeline.gate.enter()?;
let gate_guard = Arc::new(gate_guard);
anyhow::Ok(gate_guard)
};
match &shard_selector {
ShardSelector::Zero if timeline.shard.is_shard_zero() => {
return Ok(Arc::clone(timeline));
return Ok(Entered {
timeline: Arc::clone(timeline),
gate_guard: enter_gate()?,
});
}
ShardSelector::Zero => continue,
ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
return Ok(Arc::clone(timeline));
return Ok(Entered {
timeline: Arc::clone(timeline),
gate_guard: enter_gate()?,
});
}
ShardSelector::Page(_) => continue,
ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
return Ok(Arc::clone(timeline));
return Ok(Entered {
timeline: Arc::clone(timeline),
gate_guard: enter_gate()?,
});
}
ShardSelector::Known(_) => continue,
}
@@ -711,6 +711,13 @@ mod tests {
}
}
impl std::ops::Deref for Entered {
type Target = StubTimeline;
fn deref(&self) -> &Self::Target {
&self.timeline
}
}
#[tokio::test(start_paused = true)]
async fn test_timeline_shutdown() {
crate::tenant::harness::setup_logging();
@@ -1038,7 +1045,6 @@ mod tests {
let key = DBDIR_KEY;
// Simulate 10 connections that's opened, used, and closed
let mut used_handles = vec![];
for _ in 0..10 {
let mut cache = Cache::<TestTypes>::default();
let handle = {
@@ -1050,7 +1056,6 @@ mod tests {
handle
};
handle.getpage();
used_handles.push(Arc::downgrade(&handle.timeline));
}
// No handles exist, thus gates are closed and don't require shutdown.

View File

@@ -10,6 +10,8 @@ use http_utils::error::ApiError;
use tokio_util::sync::CancellationToken;
use utils::sync::gate::Gate;
use crate::context::RequestContext;
use super::Timeline;
// This status is not strictly necessary now, but gives us a nice place
@@ -30,6 +32,8 @@ impl HeatmapLayersDownloader {
fn new(
timeline: Arc<Timeline>,
concurrency: usize,
recurse: bool,
ctx: RequestContext,
) -> Result<HeatmapLayersDownloader, ApiError> {
let tl_guard = timeline.gate.enter().map_err(|_| ApiError::Cancelled)?;
@@ -57,12 +61,13 @@ impl HeatmapLayersDownloader {
tracing::info!(
resident_size=%timeline.resident_physical_size(),
heatmap_layers=%heatmap.layers.len(),
heatmap_layers=%heatmap.all_layers().count(),
"Starting heatmap layers download"
);
let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map(
let stream = futures::stream::iter(heatmap.all_layers().cloned().filter_map(
|layer| {
let ctx = ctx.attached_child();
let tl = timeline.clone();
let dl_guard = match downloads_guard.enter() {
Ok(g) => g,
@@ -75,7 +80,7 @@ impl HeatmapLayersDownloader {
Some(async move {
let _dl_guard = dl_guard;
let res = tl.download_layer(&layer.name).await;
let res = tl.download_layer(&layer.name, &ctx).await;
if let Err(err) = res {
if !err.is_cancelled() {
tracing::warn!(layer=%layer.name,"Failed to download heatmap layer: {err}")
@@ -94,6 +99,20 @@ impl HeatmapLayersDownloader {
},
_ = cancel.cancelled() => {
tracing::info!("Heatmap layers download cancelled");
return;
}
}
if recurse {
if let Some(ancestor) = timeline.ancestor_timeline() {
let ctx = ctx.attached_child();
let res =
ancestor.start_heatmap_layers_download(concurrency, recurse, &ctx);
if let Err(err) = res {
tracing::info!(
"Failed to start heatmap layers download for ancestor: {err}"
);
}
}
}
}
@@ -136,13 +155,20 @@ impl HeatmapLayersDownloader {
}
impl Timeline {
pub(crate) async fn start_heatmap_layers_download(
pub(crate) fn start_heatmap_layers_download(
self: &Arc<Self>,
concurrency: usize,
recurse: bool,
ctx: &RequestContext,
) -> Result<(), ApiError> {
let mut locked = self.heatmap_layers_downloader.lock().unwrap();
if locked.as_ref().map(|dl| dl.is_complete()).unwrap_or(true) {
let dl = HeatmapLayersDownloader::new(self.clone(), concurrency)?;
let dl = HeatmapLayersDownloader::new(
self.clone(),
concurrency,
recurse,
ctx.attached_child(),
)?;
*locked = Some(dl);
Ok(())
} else {

View File

@@ -8,14 +8,14 @@ use tracing::trace;
use utils::id::TimelineId;
use utils::lsn::{AtomicLsn, Lsn};
use super::TimelineWriterState;
use super::{ReadableLayer, TimelineWriterState};
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::metrics::TimelineMetrics;
use crate::tenant::layer_map::{BatchedUpdates, LayerMap};
use crate::tenant::storage_layer::{
AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc,
PersistentLayerKey, ResidentLayer,
PersistentLayerKey, ReadableLayerWeak, ResidentLayer,
};
/// Provides semantic APIs to manipulate the layer map.
@@ -37,6 +37,21 @@ impl Default for LayerManager {
}
impl LayerManager {
pub(crate) fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer {
match weak {
ReadableLayerWeak::PersistentLayer(desc) => {
ReadableLayer::PersistentLayer(self.get_from_desc(&desc))
}
ReadableLayerWeak::InMemoryLayer(desc) => {
let inmem = self
.layer_map()
.expect("no concurrent shutdown")
.in_memory_layer(&desc);
ReadableLayer::InMemoryLayer(inmem)
}
}
}
pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
// The assumption for the `expect()` is that all code maintains the following invariant:
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
@@ -470,6 +485,25 @@ impl OpenLayerManager {
mapping.remove(layer);
layer.delete_on_drop();
}
#[cfg(test)]
pub(crate) fn force_insert_in_memory_layer(&mut self, layer: Arc<InMemoryLayer>) {
use pageserver_api::models::InMemoryLayerInfo;
match layer.info() {
InMemoryLayerInfo::Open { .. } => {
assert!(self.layer_map.open_layer.is_none());
self.layer_map.open_layer = Some(layer);
}
InMemoryLayerInfo::Frozen { lsn_start, .. } => {
if let Some(last) = self.layer_map.frozen_layers.back() {
assert!(last.get_lsn_range().end <= lsn_start);
}
self.layer_map.frozen_layers.push_back(layer);
}
}
}
}
pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);

View File

@@ -961,7 +961,8 @@ mod tests {
}
async fn round_trip_test_compressed(blobs: &[Vec<u8>], compression: bool) -> Result<(), Error> {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let ctx =
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
let (_temp_dir, pathbuf, offsets) =
write_maybe_compressed(blobs, compression, &ctx).await?;

View File

@@ -26,15 +26,14 @@ use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut};
use owned_buffers_io::io_buf_ext::FullSlice;
use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
pub use pageserver_api::models::virtual_file as api;
use pageserver_api::shard::TenantShardId;
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
use tokio::time::Instant;
use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};
use crate::assert_u64_eq_usize::UsizeIsU64;
use crate::context::RequestContext;
use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC, StorageIoOperation};
use crate::metrics::{STORAGE_IO_TIME_METRIC, StorageIoOperation};
use crate::page_cache::{PAGE_SZ, PageWriteGuard};
use crate::tenant::TENANTS_SEGMENT_NAME;
pub(crate) mod io_engine;
pub use io_engine::{
FeatureTestResult as IoEngineFeatureTestResult, feature_test as io_engine_feature_test,
@@ -121,7 +120,7 @@ impl VirtualFile {
pub async fn open_with_options<P: AsRef<Utf8Path>>(
path: P,
open_options: &OpenOptions,
ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
ctx: &RequestContext,
) -> Result<Self, std::io::Error> {
let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?;
Ok(VirtualFile {
@@ -133,7 +132,7 @@ impl VirtualFile {
pub async fn open_with_options_v2<P: AsRef<Utf8Path>>(
path: P,
open_options: &OpenOptions,
ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
ctx: &RequestContext,
) -> Result<Self, std::io::Error> {
let file = match get_io_mode() {
IoMode::Buffered => {
@@ -300,13 +299,6 @@ pub struct VirtualFileInner {
/// storing it here.
pub path: Utf8PathBuf,
open_options: OpenOptions,
// These are strings becase we only use them for metrics, and those expect strings.
// It makes no sense for us to constantly turn the `TimelineId` and `TenantId` into
// strings.
tenant_id: String,
shard_id: String,
timeline_id: String,
}
#[derive(Debug, PartialEq, Clone, Copy)]
@@ -588,36 +580,16 @@ impl VirtualFileInner {
pub async fn open_with_options<P: AsRef<Utf8Path>>(
path: P,
open_options: &OpenOptions,
_ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
_ctx: &RequestContext,
) -> Result<VirtualFileInner, std::io::Error> {
let path_ref = path.as_ref();
let path_str = path_ref.to_string();
let parts = path_str.split('/').collect::<Vec<&str>>();
let (tenant_id, shard_id, timeline_id) =
if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME {
let tenant_shard_part = parts[parts.len() - 4];
let (tenant_id, shard_id) = match tenant_shard_part.parse::<TenantShardId>() {
Ok(tenant_shard_id) => (
tenant_shard_id.tenant_id.to_string(),
format!("{}", tenant_shard_id.shard_slug()),
),
Err(_) => {
// Malformed path: this ID is just for observability, so tolerate it
// and pass through
(tenant_shard_part.to_string(), "*".to_string())
}
};
(tenant_id, shard_id, parts[parts.len() - 2].to_string())
} else {
("*".to_string(), "*".to_string(), "*".to_string())
};
let path = path.as_ref();
let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;
// NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
// where our caller doesn't get to use the returned VirtualFile before its
// slot gets re-used by someone else.
let file = observe_duration!(StorageIoOperation::Open, {
open_options.open(path_ref.as_std_path()).await?
open_options.open(path.as_std_path()).await?
});
// Strip all options other than read and write.
@@ -633,11 +605,8 @@ impl VirtualFileInner {
let vfile = VirtualFileInner {
handle: RwLock::new(handle),
pos: 0,
path: path_ref.to_path_buf(),
path: path.to_owned(),
open_options: reopen_options,
tenant_id,
shard_id,
timeline_id,
};
// TODO: Under pressure, it's likely the slot will get re-used and
@@ -934,7 +903,7 @@ impl VirtualFileInner {
&self,
buf: tokio_epoll_uring::Slice<Buf>,
offset: u64,
_ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
ctx: &RequestContext,
) -> (tokio_epoll_uring::Slice<Buf>, Result<usize, Error>)
where
Buf: tokio_epoll_uring::IoBufMut + Send,
@@ -952,14 +921,7 @@ impl VirtualFileInner {
let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at");
if let Ok(size) = res {
STORAGE_IO_SIZE
.with_label_values(&[
"read",
&self.tenant_id,
&self.shard_id,
&self.timeline_id,
])
.add(size as i64);
ctx.io_size_metrics().read.add(size.into_u64());
}
(buf, res)
})
@@ -970,9 +932,9 @@ impl VirtualFileInner {
&self,
buf: FullSlice<B>,
offset: u64,
_ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
ctx: &RequestContext,
) -> (FullSlice<B>, Result<usize, Error>) {
let (slice, result) = self.write_at_inner(buf, offset, _ctx).await;
let (slice, result) = self.write_at_inner(buf, offset, ctx).await;
let result = result.maybe_fatal_err("write_at");
(slice, result)
}
@@ -981,7 +943,7 @@ impl VirtualFileInner {
&self,
buf: FullSlice<B>,
offset: u64,
_ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
ctx: &RequestContext,
) -> (FullSlice<B>, Result<usize, Error>) {
let file_guard = match self.lock_file().await {
Ok(file_guard) => file_guard,
@@ -991,14 +953,7 @@ impl VirtualFileInner {
let ((_file_guard, buf), result) =
io_engine::get().write_at(file_guard, offset, buf).await;
if let Ok(size) = result {
STORAGE_IO_SIZE
.with_label_values(&[
"write",
&self.tenant_id,
&self.shard_id,
&self.timeline_id,
])
.add(size as i64);
ctx.io_size_metrics().write.add(size.into_u64());
}
(buf, result)
})
@@ -1584,7 +1539,8 @@ mod tests {
where
A: Adapter,
{
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let ctx =
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
let testdir = crate::config::PageServerConf::test_repo_dir(testname);
std::fs::create_dir_all(&testdir)?;
@@ -1711,7 +1667,8 @@ mod tests {
const THREADS: usize = 100;
const SAMPLE: [u8; SIZE] = [0xADu8; SIZE];
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let ctx =
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency");
std::fs::create_dir_all(&testdir)?;
@@ -1770,7 +1727,8 @@ mod tests {
#[tokio::test]
async fn test_atomic_overwrite_basic() {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let ctx =
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic");
std::fs::create_dir_all(&testdir).unwrap();
@@ -1798,7 +1756,8 @@ mod tests {
#[tokio::test]
async fn test_atomic_overwrite_preexisting_tmp() {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let ctx =
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
let testdir =
crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp");
std::fs::create_dir_all(&testdir).unwrap();

View File

@@ -181,7 +181,8 @@ where
Err(self
.shutdown()
.await
.expect_err("flush task only disconnects duplex if it exits with an error"))
.err()
.expect("flush task only disconnects duplex if it exits with an error"))
}
/// Cleans up the channel, join the flush task.

View File

@@ -136,7 +136,9 @@ impl WalRedoProcess {
Ok(0) => break Ok(()), // eof
Ok(num_bytes) => {
let output = String::from_utf8_lossy(&buf[..num_bytes]);
error!(%output, "received output");
if !output.contains("LOG:") {
error!(%output, "received output");
}
}
Err(e) => {
break Err(e);