Resolve merge conflicts

Fix rust formatting
Make it possible to control lazy_sru_download through tenant config
2026-05-29 11:00:38 +00:00 · 2025-04-16 08:20:46 +03:00 · 2025-04-16 07:49:47 +03:00 · 2025-04-16 07:49:47 +03:00 · 2025-04-16 07:49:46 +03:00 · 2025-04-16 07:49:44 +03:00
77 changed files with 6216 additions and 848 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1432,6 +1432,7 @@ dependencies = [
 "pageserver_api",
 "pageserver_client",
 "pem",
+ "pkcs8 0.10.2",
 "postgres_backend",
 "postgres_connection",
 "regex",
@@ -1441,7 +1442,6 @@ dependencies = [
 "serde",
 "serde_json",
 "sha2",
- "spki 0.7.3",
 "storage_broker",
 "thiserror 1.0.69",
 "tokio",
@@ -8469,6 +8469,7 @@ dependencies = [
 "once_cell",
 "p256 0.13.2",
 "parquet",
+ "pkcs8 0.10.2",
 "prettyplease",
 "proc-macro2",
 "prost 0.13.3",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -143,6 +143,7 @@ parquet_derive = "53"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pem = "3.0.3"
 pin-project-lite = "0.2"
+pkcs8 = "0.10.2"
 pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
 procfs = "0.16"
 prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
@@ -175,7 +176,6 @@ signal-hook = "0.3"
 smallvec = "1.11"
 smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
-spki = "0.7.3"
 strum = "0.26"
 strum_macros = "0.26"
 "subtle"  = "2.5.0"
--- a/compute/patches/pgvector.patch
+++ b/compute/patches/pgvector.patch
@@ -15,7 +15,7 @@ index 7a4b88c..56678af 100644
 HEADERS = src/halfvec.h src/sparsevec.h src/vector.h
 
 diff --git a/src/hnswbuild.c b/src/hnswbuild.c
-index b667478..1298aa1 100644
+index b667478..dc95d89 100644
 --- a/src/hnswbuild.c
 +++ b/src/hnswbuild.c
@@ -843,9 +843,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
@@ -36,7 +36,7 @@ index b667478..1298aa1 100644
 	/* Close relations within worker */
 	index_close(indexRel, indexLockmode);
 	table_close(heapRel, heapLockmode);
-@@ -1100,13 +1108,25 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
+@@ -1100,12 +1108,39 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
 	SeedRandom(42);
 #endif
 
@@ -48,17 +48,32 @@ index b667478..1298aa1 100644
 
 	BuildGraph(buildstate, forkNum);
 
+-	if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM)
 +#ifdef NEON_SMGR
 +	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
 +#endif
 +
- 	if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM)
+	if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) {
 		log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocksInFork(index, forkNum), true);
- 
+#ifdef NEON_SMGR
+		{
+#if PG_VERSION_NUM >= 160000
+			RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
+#else
+			RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
+#endif
+			if (set_lwlsn_block_range_hook)
+				set_lwlsn_block_range_hook(XactLastRecEnd, rlocator,
+										   MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+			if (set_lwlsn_relation_hook)
+				set_lwlsn_relation_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM);
+		}
+#endif
+	}
+
 +#ifdef NEON_SMGR
 +	smgr_end_unlogged_build(RelationGetSmgr(index));
 +#endif
-+
+ 
 	FreeBuildState(buildstate);
 }
- 
--- a/compute/patches/rum.patch
+++ b/compute/patches/rum.patch
@@ -1,5 +1,5 @@
 diff --git a/src/ruminsert.c b/src/ruminsert.c
-index 255e616..1c6edb7 100644
+index 255e616..7a2240f 100644
 --- a/src/ruminsert.c
 +++ b/src/ruminsert.c
@@ -628,6 +628,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
@@ -24,12 +24,24 @@ index 255e616..1c6edb7 100644
 	/*
 	 * Write index to xlog
 	 */
-@@ -713,6 +721,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
+@@ -713,6 +721,22 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
 		UnlockReleaseBuffer(buffer);
 	}
 
 +#ifdef NEON_SMGR
-+	smgr_end_unlogged_build(index->rd_smgr);
+	{
+#if PG_VERSION_NUM >= 160000
+		RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
+#else
+		RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
+#endif
+		if (set_lwlsn_block_range_hook)
+			set_lwlsn_block_range_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+		if (set_lwlsn_relation_hook)
+			set_lwlsn_relation_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM);
+
+		smgr_end_unlogged_build(index->rd_smgr);
+	}
 +#endif
 +
 	/*
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -897,32 +897,28 @@ impl ComputeNode {
        let mut client = config.connect(NoTls)?;
        let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;

-        let basebackup_cmd = match lsn {
-            Lsn(0) => {
-                if spec.spec.mode != ComputeMode::Primary {
-                    format!(
-                        "basebackup {} {} --gzip --replica",
-                        spec.tenant_id, spec.timeline_id
-                    )
-                } else {
-                    format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id)
-                }
-            }
-            _ => {
-                if spec.spec.mode != ComputeMode::Primary {
-                    format!(
-                        "basebackup {} {} {} --gzip --replica",
-                        spec.tenant_id, spec.timeline_id, lsn
-                    )
-                } else {
-                    format!(
-                        "basebackup {} {} {} --gzip",
-                        spec.tenant_id, spec.timeline_id, lsn
-                    )
-                }
-            }
-        };
-
+        let tenant_id = spec.tenant_id.to_string();
+        let timeline_id = spec.timeline_id.to_string();
+        let lsn_str = lsn.to_string();
+        let mut cmd = Vec::new();
+        cmd.push("basebackup");
+        cmd.push(&tenant_id);
+        cmd.push(&timeline_id);
+        if lsn != Lsn::INVALID {
+            cmd.push(&lsn_str);
+        }
+        cmd.push("--gzip");
+        if spec.spec.mode != ComputeMode::Primary {
+            cmd.push("--replica");
+        }
+        if spec
+            .spec
+            .features
+            .contains(&ComputeFeature::LazySlruDownload)
+        {
+            cmd.push("--lazy-slru-download")
+        }
+        let basebackup_cmd = cmd.join(" ");
        let copyreader = client.copy_out(basebackup_cmd.as_str())?;
        let mut measured_reader = MeasuredReader::new(copyreader);
        let mut bufreader = std::io::BufReader::new(&mut measured_reader);
--- a/compute_tools/src/http/middleware/authorize.rs
+++ b/compute_tools/src/http/middleware/authorize.rs
@@ -1,7 +1,7 @@
-use std::collections::HashSet;
+use std::{collections::HashSet, net::SocketAddr};

 use anyhow::{Result, anyhow};
-use axum::{RequestExt, body::Body};
+use axum::{RequestExt, body::Body, extract::ConnectInfo};
 use axum_extra::{
    TypedHeader,
    headers::{Authorization, authorization::Bearer},
@@ -64,6 +64,19 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
                return Ok(request);
            }

+            let connect_info = request
+                .extract_parts::<ConnectInfo<SocketAddr>>()
+                .await
+                .unwrap();
+
+            // In the event the request is coming from the loopback interface,
+            // allow all requests
+            if connect_info.ip().is_loopback() {
+                warn!(%request_id, "Bypassed authorization because request is coming from the loopback interface");
+
+                return Ok(request);
+            }
+
            let TypedHeader(Authorization(bearer)) = request
                .extract_parts::<TypedHeader<Authorization<Bearer>>>()
                .await
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -16,6 +16,7 @@ jsonwebtoken.workspace = true
 nix.workspace = true
 once_cell.workspace = true
 pem.workspace = true
+pkcs8.workspace = true
 humantime-serde.workspace = true
 hyper0.workspace = true
 regex.workspace = true
@@ -24,7 +25,6 @@ scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 sha2.workspace = true
-spki.workspace = true
 thiserror.workspace = true
 toml.workspace = true
 toml_edit.workspace = true
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -60,12 +60,11 @@ use jsonwebtoken::jwk::{
 use nix::sys::signal::{Signal, kill};
 use pageserver_api::shard::ShardStripeSize;
 use pem::Pem;
+use pkcs8::der::Decode;
 use reqwest::header::CONTENT_TYPE;
 use safekeeper_api::membership::SafekeeperGeneration;
 use serde::{Deserialize, Serialize};
 use sha2::{Digest, Sha256};
-use spki::der::Decode;
-use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef};
 use tracing::debug;
 use url::Host;
 use utils::id::{NodeId, TenantId, TimelineId};
@@ -148,12 +147,11 @@ impl ComputeControlPlane {

    /// Create a JSON Web Key Set. This ideally matches the way we create a JWKS
    /// from the production control plane.
-    fn create_jwks_from_pem(pem: &Pem) -> Result<JwkSet> {
-        let spki: SubjectPublicKeyInfoRef = SubjectPublicKeyInfo::from_der(pem.contents())?;
-        let public_key = spki.subject_public_key.raw_bytes();
+    fn create_jwks_from_pem(pem: Pem) -> Result<JwkSet> {
+        let document = pkcs8::Document::from_der(&pem.into_contents())?;

        let mut hasher = Sha256::new();
-        hasher.update(public_key);
+        hasher.update(&document);
        let key_hash = hasher.finalize();

        Ok(JwkSet {
@@ -171,7 +169,7 @@ impl ComputeControlPlane {
                algorithm: AlgorithmParameters::OctetKeyPair(OctetKeyPairParameters {
                    key_type: OctetKeyPairType::OctetKeyPair,
                    curve: EllipticCurve::Ed25519,
-                    x: base64::encode_config(public_key, base64::URL_SAFE_NO_PAD),
+                    x: base64::encode_config(&document, base64::URL_SAFE_NO_PAD),
                }),
            }],
        })
@@ -195,7 +193,7 @@ impl ComputeControlPlane {
        let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1);
        let internal_http_port = internal_http_port.unwrap_or_else(|| external_http_port + 1);
        let compute_ctl_config = ComputeCtlConfig {
-            jwks: Self::create_jwks_from_pem(&self.env.read_public_key()?)?,
+            jwks: Self::create_jwks_from_pem(self.env.read_public_key()?)?,
            tls: None::<TlsConfig>,
        };
        let ep = Arc::new(Endpoint {
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -413,11 +413,6 @@ impl PageServerNode {
                .map(serde_json::from_str)
                .transpose()
                .context("Failed to parse 'compaction_algorithm' json")?,
-            compaction_shard_ancestor: settings
-                .remove("compaction_shard_ancestor")
-                .map(|x| x.parse::<bool>())
-                .transpose()
-                .context("Failed to parse 'compaction_shard_ancestor' as a bool")?,
            compaction_l0_first: settings
                .remove("compaction_l0_first")
                .map(|x| x.parse::<bool>())
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -183,6 +183,9 @@ pub enum ComputeFeature {
    /// track short-lived connections as user activity.
    ActivityMonitorExperimental,

+    /// Download all SLRU files on demand
+    LazySlruDownload,
+
    /// This is a special feature flag that is used to represent unknown feature flags.
    /// Basically all unknown to enum flags are represented as this one. See unit test
    /// `parse_unknown_features()` for more details.
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -76,7 +76,14 @@ pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
    mfs
 }

-
+static DISK_IO_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "libmetrics_disk_io_bytes_total",
+        "Bytes written and read from disk, grouped by the operation (read|write)",
+        &["io_operation"]
+    )
+    .expect("Failed to register disk i/o bytes int gauge vec")
+});

 static MAXRSS_KB: Lazy<IntGauge> = Lazy::new(|| {
    register_int_gauge!(
@@ -254,7 +261,12 @@ const BYTES_IN_BLOCK: i64 = 512;
 fn update_rusage_metrics() {
    let rusage_stats = get_rusage_stats();

-    
+    DISK_IO_BYTES
+        .with_label_values(&["read"])
+        .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
+    DISK_IO_BYTES
+        .with_label_values(&["write"])
+        .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);

    // On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669
    #[cfg(target_os = "macos")]
@@ -345,7 +357,10 @@ impl<P: Atomic> GenericCounterPairVec<P> {
        self.get_metric_with_label_values(vals).unwrap()
    }

-    
+    pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) {
+        res[0] = self.inc.remove_label_values(vals);
+        res[1] = self.dec.remove_label_values(vals);
+    }
 }

 impl<P: Atomic> GenericCounterPair<P> {
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -379,8 +379,6 @@ pub struct TenantConfigToml {
    /// size exceeds `compaction_upper_limit * checkpoint_distance`.
    pub compaction_upper_limit: usize,
    pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
-    /// If true, enable shard ancestor compaction (enabled by default).
-    pub compaction_shard_ancestor: bool,
    /// If true, compact down L0 across all tenant timelines before doing regular compaction. L0
    /// compaction must be responsive to avoid read amp during heavy ingestion. Defaults to true.
    pub compaction_l0_first: bool,
@@ -679,7 +677,6 @@ pub mod tenant_conf_defaults {

    pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
    pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
-    pub const DEFAULT_COMPACTION_SHARD_ANCESTOR: bool = true;

    // This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's
    // 3/4*16=9 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could
@@ -737,7 +734,6 @@ impl Default for TenantConfigToml {
            compaction_algorithm: crate::models::CompactionAlgorithmSettings {
                kind: DEFAULT_COMPACTION_ALGORITHM,
            },
-            compaction_shard_ancestor: DEFAULT_COMPACTION_SHARD_ANCESTOR,
            compaction_l0_first: DEFAULT_COMPACTION_L0_FIRST,
            compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE,
            l0_flush_delay_threshold: None,
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -526,8 +526,6 @@ pub struct TenantConfigPatch {
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub compaction_algorithm: FieldPatch<CompactionAlgorithmSettings>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
-    pub compaction_shard_ancestor: FieldPatch<bool>,
-    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub compaction_l0_first: FieldPatch<bool>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub compaction_l0_semaphore: FieldPatch<bool>,
@@ -617,9 +615,6 @@ pub struct TenantConfig {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub compaction_algorithm: Option<CompactionAlgorithmSettings>,

-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub compaction_shard_ancestor: Option<bool>,
-
    #[serde(skip_serializing_if = "Option::is_none")]
    pub compaction_l0_first: Option<bool>,

@@ -729,7 +724,6 @@ impl TenantConfig {
            mut compaction_threshold,
            mut compaction_upper_limit,
            mut compaction_algorithm,
-            mut compaction_shard_ancestor,
            mut compaction_l0_first,
            mut compaction_l0_semaphore,
            mut l0_flush_delay_threshold,
@@ -778,9 +772,6 @@ impl TenantConfig {
            .compaction_upper_limit
            .apply(&mut compaction_upper_limit);
        patch.compaction_algorithm.apply(&mut compaction_algorithm);
-        patch
-            .compaction_shard_ancestor
-            .apply(&mut compaction_shard_ancestor);
        patch.compaction_l0_first.apply(&mut compaction_l0_first);
        patch
            .compaction_l0_semaphore
@@ -869,7 +860,6 @@ impl TenantConfig {
            compaction_threshold,
            compaction_upper_limit,
            compaction_algorithm,
-            compaction_shard_ancestor,
            compaction_l0_first,
            compaction_l0_semaphore,
            l0_flush_delay_threshold,
@@ -930,9 +920,6 @@ impl TenantConfig {
                .as_ref()
                .unwrap_or(&global_conf.compaction_algorithm)
                .clone(),
-            compaction_shard_ancestor: self
-                .compaction_shard_ancestor
-                .unwrap_or(global_conf.compaction_shard_ancestor),
            compaction_l0_first: self
                .compaction_l0_first
                .unwrap_or(global_conf.compaction_l0_first),
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -23,6 +23,7 @@ use futures::future::Either;
 use futures::stream::Stream;
 use futures_util::{StreamExt, TryStreamExt};
 use http_types::{StatusCode, Url};
+use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
 use tracing::debug;
 use utils::backoff;
@@ -31,7 +32,7 @@ use utils::backoff::exponential_backoff_duration_seconds;
 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use crate::config::AzureConfig;
 use crate::error::Cancelled;
-use crate::metrics::RequestKind;
+use crate::metrics::{AttemptOutcome, RequestKind, start_measuring_requests};
 use crate::{
    ConcurrencyLimiter, Download, DownloadError, DownloadKind, DownloadOpts, Listing, ListingMode,
    ListingObject, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
@@ -164,7 +165,7 @@ impl AzureBlobStorage {
        let mut last_modified = None;
        let mut metadata = HashMap::new();

-      
+        let started_at = start_measuring_requests(kind);

        let download = async {
            let response = builder
@@ -236,8 +237,19 @@ impl AzureBlobStorage {
                TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled),
            },
        };
-        
-      
+        let started_at = ScopeGuard::into_inner(started_at);
+        let outcome = match &download {
+            Ok(_) => AttemptOutcome::Ok,
+            // At this level in the stack 404 and 304 responses do not indicate an error.
+            // There's expected cases when a blob may not exist or hasn't been modified since
+            // the last get (e.g. probing for timeline indices and heatmap downloads).
+            // Callers should handle errors if they are unexpected.
+            Err(DownloadError::NotFound | DownloadError::Unmodified) => AttemptOutcome::Ok,
+            Err(_) => AttemptOutcome::Err,
+        };
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, outcome, started_at);
        download
    }

@@ -419,7 +431,7 @@ impl RemoteStorage for AzureBlobStorage {
        let kind = RequestKind::Head;
        let _permit = self.permit(kind, cancel).await?;

-      
+        let started_at = start_measuring_requests(kind);

        let blob_client = self.client.blob_client(self.relative_path_to_name(key));
        let properties_future = blob_client.get_properties().into_future();
@@ -431,9 +443,12 @@ impl RemoteStorage for AzureBlobStorage {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-        if let Ok(_inner) = &res {
-            
-         
+        if let Ok(inner) = &res {
+            // do not incl. timeouts as errors in metrics but cancellations
+            let started_at = ScopeGuard::into_inner(started_at);
+            crate::metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, inner, started_at);
        }

        let data = match res {
@@ -461,7 +476,7 @@ impl RemoteStorage for AzureBlobStorage {
        let kind = RequestKind::Put;
        let _permit = self.permit(kind, cancel).await?;

-      
+        let started_at = start_measuring_requests(kind);

        let op = async {
            let blob_client = self.client.blob_client(self.relative_path_to_name(to));
@@ -494,7 +509,14 @@ impl RemoteStorage for AzureBlobStorage {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-      
+        let outcome = match res {
+            Ok(_) => AttemptOutcome::Ok,
+            Err(_) => AttemptOutcome::Err,
+        };
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, outcome, started_at);

        res
    }
@@ -540,7 +562,7 @@ impl RemoteStorage for AzureBlobStorage {
    ) -> anyhow::Result<()> {
        let kind = RequestKind::Delete;
        let _permit = self.permit(kind, cancel).await?;
-    
+        let started_at = start_measuring_requests(kind);

        let op = async {
            // TODO batch requests are not supported by the SDK
@@ -606,8 +628,10 @@ impl RemoteStorage for AzureBlobStorage {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-       
-        
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);
        res
    }

@@ -623,7 +647,7 @@ impl RemoteStorage for AzureBlobStorage {
    ) -> anyhow::Result<()> {
        let kind = RequestKind::Copy;
        let _permit = self.permit(kind, cancel).await?;
-  
+        let started_at = start_measuring_requests(kind);

        let timeout = tokio::time::sleep(self.timeout);

@@ -677,8 +701,10 @@ impl RemoteStorage for AzureBlobStorage {
            },
        };

-        
-      
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);
        res
    }

--- a/libs/remote_storage/src/metrics.rs
+++ b/libs/remote_storage/src/metrics.rs
@@ -1,7 +1,9 @@
+use metrics::{
+    Histogram, IntCounter, register_histogram_vec, register_int_counter, register_int_counter_vec,
+};
+use once_cell::sync::Lazy;

-
-
-
+pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);

 #[derive(Clone, Copy, Debug)]
 pub(crate) enum RequestKind {
@@ -14,9 +16,62 @@ pub(crate) enum RequestKind {
    Head = 6,
 }

-
+use RequestKind::*;
 use scopeguard::ScopeGuard;

+impl RequestKind {
+    const fn as_str(&self) -> &'static str {
+        match self {
+            Get => "get_object",
+            Put => "put_object",
+            Delete => "delete_object",
+            List => "list_objects",
+            Copy => "copy_object",
+            TimeTravel => "time_travel_recover",
+            Head => "head_object",
+        }
+    }
+    const fn as_index(&self) -> usize {
+        *self as usize
+    }
+}
+
+const REQUEST_KIND_COUNT: usize = 7;
+pub(crate) struct RequestTyped<C>([C; REQUEST_KIND_COUNT]);
+
+impl<C> RequestTyped<C> {
+    pub(crate) fn get(&self, kind: RequestKind) -> &C {
+        &self.0[kind.as_index()]
+    }
+
+    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
+        use RequestKind::*;
+        let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter();
+        let arr = std::array::from_fn::<C, REQUEST_KIND_COUNT, _>(|index| {
+            let next = it.next().unwrap();
+            assert_eq!(index, next.as_index());
+            f(next)
+        });
+
+        if let Some(next) = it.next() {
+            panic!("unexpected {next:?}");
+        }
+
+        RequestTyped(arr)
+    }
+}
+
+impl RequestTyped<Histogram> {
+    pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
+        self.get(kind).observe(started_at.elapsed().as_secs_f64())
+    }
+}
+
+pub(crate) struct PassFailCancelledRequestTyped<C> {
+    success: RequestTyped<C>,
+    fail: RequestTyped<C>,
+    cancelled: RequestTyped<C>,
+}

 #[derive(Debug, Clone, Copy)]
 pub(crate) enum AttemptOutcome {
@@ -34,22 +89,138 @@ impl<T, E> From<&Result<T, E>> for AttemptOutcome {
    }
 }

+impl AttemptOutcome {
+    pub(crate) fn as_str(&self) -> &'static str {
+        match self {
+            AttemptOutcome::Ok => "ok",
+            AttemptOutcome::Err => "err",
+            AttemptOutcome::Cancelled => "cancelled",
+        }
+    }
+}

+impl<C> PassFailCancelledRequestTyped<C> {
+    pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
+        let target = match outcome {
+            AttemptOutcome::Ok => &self.success,
+            AttemptOutcome::Err => &self.fail,
+            AttemptOutcome::Cancelled => &self.cancelled,
+        };
+        target.get(kind)
+    }

+    fn build_with(mut f: impl FnMut(RequestKind, AttemptOutcome) -> C) -> Self {
+        let success = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Ok));
+        let fail = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Err));
+        let cancelled = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Cancelled));

+        PassFailCancelledRequestTyped {
+            success,
+            fail,
+            cancelled,
+        }
+    }
+}

+impl PassFailCancelledRequestTyped<Histogram> {
+    pub(crate) fn observe_elapsed(
+        &self,
+        kind: RequestKind,
+        outcome: impl Into<AttemptOutcome>,
+        started_at: std::time::Instant,
+    ) {
+        self.get(kind, outcome.into())
+            .observe(started_at.elapsed().as_secs_f64())
+    }
+}

-
-
-
-/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`].
-pub(crate) fn start_measuring_requests(
-    _kind: RequestKind,
+/// On drop (cancellation) count towards [`BucketMetrics::cancelled_waits`].
+pub(crate) fn start_counting_cancelled_wait(
+    kind: RequestKind,
 ) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
-    scopeguard::guard_on_success(std::time::Instant::now(), move |_started_at| {
-        
+    scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
+        crate::metrics::BUCKET_METRICS
+            .cancelled_waits
+            .get(kind)
+            .inc()
    })
 }

+/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`].
+pub(crate) fn start_measuring_requests(
+    kind: RequestKind,
+) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
+    scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
+        crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+            kind,
+            AttemptOutcome::Cancelled,
+            started_at,
+        )
+    })
+}

+pub(crate) struct BucketMetrics {
+    /// Full request duration until successful completion, error or cancellation.
+    pub(crate) req_seconds: PassFailCancelledRequestTyped<Histogram>,
+    /// Total amount of seconds waited on queue.
+    pub(crate) wait_seconds: RequestTyped<Histogram>,

+    /// Track how many semaphore awaits were cancelled per request type.
+    ///
+    /// This is in case cancellations are happening more than expected.
+    pub(crate) cancelled_waits: RequestTyped<IntCounter>,
+
+    /// Total amount of deleted objects in batches or single requests.
+    pub(crate) deleted_objects_total: IntCounter,
+}
+
+impl Default for BucketMetrics {
+    fn default() -> Self {
+        // first bucket 100 microseconds to count requests that do not need to wait at all
+        // and get a permit immediately
+        let buckets = [0.0001, 0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
+
+        let req_seconds = register_histogram_vec!(
+            "remote_storage_s3_request_seconds",
+            "Seconds to complete a request",
+            &["request_type", "result"],
+            buckets.to_vec(),
+        )
+        .unwrap();
+        let req_seconds = PassFailCancelledRequestTyped::build_with(|kind, outcome| {
+            req_seconds.with_label_values(&[kind.as_str(), outcome.as_str()])
+        });
+
+        let wait_seconds = register_histogram_vec!(
+            "remote_storage_s3_wait_seconds",
+            "Seconds rate limited",
+            &["request_type"],
+            buckets.to_vec(),
+        )
+        .unwrap();
+        let wait_seconds =
+            RequestTyped::build_with(|kind| wait_seconds.with_label_values(&[kind.as_str()]));
+
+        let cancelled_waits = register_int_counter_vec!(
+            "remote_storage_s3_cancelled_waits_total",
+            "Times a semaphore wait has been cancelled per request type",
+            &["request_type"],
+        )
+        .unwrap();
+        let cancelled_waits =
+            RequestTyped::build_with(|kind| cancelled_waits.with_label_values(&[kind.as_str()]));
+
+        let deleted_objects_total = register_int_counter!(
+            "remote_storage_s3_deleted_objects_total",
+            "Amount of deleted objects in total",
+        )
+        .unwrap();
+
+        Self {
+            req_seconds,
+            wait_seconds,
+            cancelled_waits,
+            deleted_objects_total,
+        }
+    }
+}
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -41,7 +41,7 @@ use super::StorageMetadata;
 use crate::config::S3Config;
 use crate::error::Cancelled;
 pub(super) use crate::metrics::RequestKind;
-use crate::metrics::{AttemptOutcome, start_measuring_requests};
+use crate::metrics::{AttemptOutcome, start_counting_cancelled_wait, start_measuring_requests};
 use crate::support::PermitCarrying;
 use crate::{
    ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject,
@@ -199,7 +199,7 @@ impl S3Bucket {
        kind: RequestKind,
        cancel: &CancellationToken,
    ) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
-       
+        let started_at = start_counting_cancelled_wait(kind);
        let acquire = self.concurrency_limiter.acquire(kind);

        let permit = tokio::select! {
@@ -207,8 +207,10 @@ impl S3Bucket {
            _ = cancel.cancelled() => return Err(Cancelled),
        };

-       
-        
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .wait_seconds
+            .observe_elapsed(kind, started_at);

        Ok(permit)
    }
@@ -218,7 +220,7 @@ impl S3Bucket {
        kind: RequestKind,
        cancel: &CancellationToken,
    ) -> Result<tokio::sync::OwnedSemaphorePermit, Cancelled> {
-       
+        let started_at = start_counting_cancelled_wait(kind);
        let acquire = self.concurrency_limiter.acquire_owned(kind);

        let permit = tokio::select! {
@@ -226,8 +228,10 @@ impl S3Bucket {
            _ = cancel.cancelled() => return Err(Cancelled),
        };

-       
-        
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .wait_seconds
+            .observe_elapsed(kind, started_at);
        Ok(permit)
    }

@@ -269,7 +273,11 @@ impl S3Bucket {
                // Count this in the AttemptOutcome::Ok bucket, because 404 is not
                // an error: we expect to sometimes fetch an object and find it missing,
                // e.g. when probing for timeline indices.
-                
+                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                    kind,
+                    AttemptOutcome::Ok,
+                    started_at,
+                );
                return Err(DownloadError::NotFound);
            }
            Err(SdkError::ServiceError(e))
@@ -279,11 +287,19 @@ impl S3Bucket {
                if e.raw().status().as_u16() == StatusCode::NotModified =>
            {
                // Count an unmodified file as a success.
-               
+                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                    kind,
+                    AttemptOutcome::Ok,
+                    started_at,
+                );
                return Err(DownloadError::Unmodified);
            }
            Err(e) => {
-                
+                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                    kind,
+                    AttemptOutcome::Err,
+                    started_at,
+                );

                return Err(DownloadError::Other(
                    anyhow::Error::new(e).context("download s3 object"),
@@ -330,11 +346,11 @@ impl S3Bucket {
        delete_objects: &[ObjectIdentifier],
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
-   
+        let kind = RequestKind::Delete;
        let mut cancel = std::pin::pin!(cancel.cancelled());

        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE_S3) {
-           
+            let started_at = start_measuring_requests(kind);

            let req = self
                .client
@@ -354,10 +370,15 @@ impl S3Bucket {
                _ = &mut cancel => return Err(TimeoutOrCancel::Cancel.into()),
            };

-          
-            
+            let started_at = ScopeGuard::into_inner(started_at);
+            crate::metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &resp, started_at);
+
            let resp = resp.context("request deletion")?;
-            
+            crate::metrics::BUCKET_METRICS
+                .deleted_objects_total
+                .inc_by(chunk.len() as u64);

            if let Some(errors) = resp.errors {
                // Log a bounded number of the errors within the response:
@@ -424,8 +445,8 @@ pin_project_lite::pin_project! {
    }

    impl<S> PinnedDrop for TimedDownload<S> {
-        fn drop(mut _this: Pin<&mut Self>) {
-           
+        fn drop(mut this: Pin<&mut Self>) {
+            crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
        }
    }
 }
@@ -490,7 +511,7 @@ impl RemoteStorage for S3Bucket {

            let mut continuation_token = None;
            'outer: loop {
-           
+                let started_at = start_measuring_requests(kind);

                // min of two Options, returning Some if one is value and another is
                // None (None is smaller than anything, so plain min doesn't work).
@@ -523,9 +544,11 @@ impl RemoteStorage for S3Bucket {
                    .context("Failed to list S3 prefixes")
                    .map_err(DownloadError::Other);

-               
+                let started_at = ScopeGuard::into_inner(started_at);

-                
+                crate::metrics::BUCKET_METRICS
+                    .req_seconds
+                    .observe_elapsed(kind, &response, started_at);

                let response = match response {
                    Ok(response) => response,
@@ -606,7 +629,7 @@ impl RemoteStorage for S3Bucket {
        let kind = RequestKind::Head;
        let _permit = self.permit(kind, cancel).await?;

-     
+        let started_at = start_measuring_requests(kind);

        let head_future = self
            .client
@@ -625,18 +648,30 @@ impl RemoteStorage for S3Bucket {
        let res = res.map_err(|_e| DownloadError::Timeout)?;

        // do not incl. timeouts as errors in metrics but cancellations
-  
-        
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);
+
        let data = match res {
            Ok(object_output) => object_output,
            Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => {
                // Count this in the AttemptOutcome::Ok bucket, because 404 is not
                // an error: we expect to sometimes fetch an object and find it missing,
                // e.g. when probing for timeline indices.
+                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                    kind,
+                    AttemptOutcome::Ok,
+                    started_at,
+                );
                return Err(DownloadError::NotFound);
            }
            Err(e) => {
-                
+                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                    kind,
+                    AttemptOutcome::Err,
+                    started_at,
+                );

                return Err(DownloadError::Other(
                    anyhow::Error::new(e).context("s3 head object"),
@@ -669,7 +704,7 @@ impl RemoteStorage for S3Bucket {
        let kind = RequestKind::Put;
        let _permit = self.permit(kind, cancel).await?;

-      
+        let started_at = start_measuring_requests(kind);

        let body = StreamBody::new(from.map(|x| x.map(Frame::data)));
        let bytes_stream = ByteStream::new(SdkBody::from_body_1_x(body));
@@ -692,10 +727,12 @@ impl RemoteStorage for S3Bucket {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-        if let Ok(_inner) = &res {
+        if let Ok(inner) = &res {
            // do not incl. timeouts as errors in metrics but cancellations
-       
-            
+            let started_at = ScopeGuard::into_inner(started_at);
+            crate::metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, inner, started_at);
        }

        match res {
@@ -716,7 +753,7 @@ impl RemoteStorage for S3Bucket {

        let timeout = tokio::time::sleep(self.timeout);

-       
+        let started_at = start_measuring_requests(kind);

        // we need to specify bucket_name as a prefix
        let copy_source = format!(
@@ -740,8 +777,10 @@ impl RemoteStorage for S3Bucket {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-       
-        
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);

        res?;

--- a/libs/utils/src/circuit_breaker.rs
+++ b/libs/utils/src/circuit_breaker.rs
@@ -1,6 +1,7 @@
 use std::fmt::Display;
 use std::time::{Duration, Instant};

+use metrics::IntCounter;

 /// Circuit breakers are for operations that are expensive and fallible.
 ///
@@ -53,7 +54,7 @@ impl CircuitBreaker {
        }
    }

-    pub fn fail<E>(&mut self,  error: E)
+    pub fn fail<E>(&mut self, metric: &IntCounter, error: E)
    where
        E: Display,
    {
@@ -63,18 +64,18 @@ impl CircuitBreaker {

        self.fail_count += 1;
        if self.broken_at.is_none() && self.fail_count >= self.fail_threshold {
-            self.break_circuit( error);
+            self.break_circuit(metric, error);
        }
    }

    /// Call this after successfully executing an operation
-    pub fn success(&mut self) {
+    pub fn success(&mut self, metric: &IntCounter) {
        self.fail_count = 0;
        if let Some(broken_at) = &self.broken_at {
            tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})",
                humantime::format_duration(broken_at.elapsed()));
            self.broken_at = None;
-          
+            metric.inc();
        }
    }

@@ -97,13 +98,13 @@ impl CircuitBreaker {
        }
    }

-    fn break_circuit<E>(&mut self,  error: E)
+    fn break_circuit<E>(&mut self, metric: &IntCounter, error: E)
    where
        E: Display,
    {
        self.broken_at = Some(Instant::now());
        tracing::error!(breaker=%self.name, "Circuit breaker broken!  Last error: {error}");
-        
+        metric.inc();
    }

    fn reset_circuit(&mut self) {
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -73,6 +73,7 @@ impl From<GetVectoredError> for BasebackupError {
 ///  * When working without safekeepers. In this situation it is important to match the lsn
 ///    we are taking basebackup on with the lsn that is used in pageserver's walreceiver
 ///    to start the replication.
+#[allow(clippy::too_many_arguments)]
 pub async fn send_basebackup_tarball<'a, W>(
    write: &'a mut W,
    timeline: &'a Timeline,
@@ -80,6 +81,7 @@ pub async fn send_basebackup_tarball<'a, W>(
    prev_lsn: Option<Lsn>,
    full_backup: bool,
    replica: bool,
+    lazy_slru_download_enabled: bool,
    ctx: &'a RequestContext,
 ) -> Result<(), BasebackupError>
 where
@@ -131,8 +133,8 @@ where
    };

    info!(
-        "taking basebackup lsn={}, prev_lsn={} (full_backup={}, replica={})",
-        backup_lsn, prev_lsn, full_backup, replica
+        "taking basebackup lsn={}, prev_lsn={} (full_backup={}, replica={}, lazy_slru_download_enabled={})",
+        backup_lsn, prev_lsn, full_backup, replica, lazy_slru_download_enabled
    );

    let basebackup = Basebackup {
@@ -142,6 +144,7 @@ where
        prev_record_lsn: prev_lsn,
        full_backup,
        replica,
+        lazy_slru_download_enabled,
        ctx,
        io_concurrency: IoConcurrency::spawn_from_conf(
            timeline.conf,
@@ -170,6 +173,7 @@ where
    prev_record_lsn: Lsn,
    full_backup: bool,
    replica: bool,
+    lazy_slru_download_enabled: bool,
    ctx: &'a RequestContext,
    io_concurrency: IoConcurrency,
 }
@@ -308,7 +312,10 @@ where
                self.timeline.pg_version,
            )?;

-        let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
+        let lazy_slru_download = self
+            .timeline
+            .get_lazy_slru_download(self.lazy_slru_download_enabled)
+            && !self.full_backup;

        let pgversion = self.timeline.pg_version;
        let subdirs = dispatch_pgversion!(pgversion, &pgv::bindings::PGDATA_SUBDIRS[..]);
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -20,6 +20,7 @@ use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields};
 use pageserver::controller_upcall_client::StorageControllerUpcallClient;
 use pageserver::deletion_queue::DeletionQueue;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
+use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::{
    BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
 };
@@ -320,9 +321,10 @@ where
    }
 }

-fn startup_checkpoint(started_at: Instant, _phase: &str, human_phase: &str) {
+fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
    let elapsed = started_at.elapsed();
    let secs = elapsed.as_secs_f64();
+    STARTUP_DURATION.with_label_values(&[phase]).set(secs);

    info!(
        elapsed_ms = elapsed.as_millis(),
@@ -353,7 +355,10 @@ fn start_pageserver(
    set_launch_timestamp_metric(launch_ts);
    #[cfg(target_os = "linux")]
    metrics::register_internal(Box::new(metrics::more_process_metrics::Collector::new())).unwrap();
-    
+    metrics::register_internal(Box::new(
+        pageserver::metrics::tokio_epoll_uring::Collector::new(),
+    ))
+    .unwrap();
    pageserver::preinitialize_metrics(conf, ignored);

    // If any failpoints were set from FAILPOINTS environment variable,
@@ -497,6 +502,7 @@ fn start_pageserver(
    // Up to this point no significant I/O has been done: this should have been fast.  Record
    // duration prior to starting I/O intensive phase of startup.
    startup_checkpoint(started_startup_at, "initial", "Starting loading tenants");
+    STARTUP_IS_LOADING.set(1);

    // Startup staging or optimizing:
    //
@@ -572,6 +578,7 @@ fn start_pageserver(
                    "initial_tenant_load",
                    "Initial load completed",
                );
+                STARTUP_IS_LOADING.set(0);
            });

            let WaitForPhaseResult {
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -261,7 +261,7 @@ where
    let mut tenants = std::pin::pin!(tenants);

    while let Some((tenant_id, tenant)) = tenants.next().await {
-        let tenant_resident_size = 0;
+        let mut tenant_resident_size = 0;

        for timeline in tenant.list_timelines() {
            let timeline_id = timeline.timeline_id;
@@ -286,6 +286,7 @@ where
                }
            }

+            tenant_resident_size += timeline.resident_physical_size();
        }

        let snap = TenantSnapshot::collect(&tenant, tenant_resident_size);
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -91,12 +91,12 @@

 use std::{sync::Arc, time::Duration};

-
+use once_cell::sync::Lazy;
 use tracing::warn;
 use utils::{id::TimelineId, shard::TenantShardId};

 use crate::{
-    metrics::TimelineMetrics,
+    metrics::{StorageIoSizeMetrics, TimelineMetrics},
    task_mgr::TaskKind,
    tenant::Timeline,
 };
@@ -122,35 +122,38 @@ pub struct RequestContext {
 #[derive(Clone)]
 pub(crate) enum Scope {
    Global {
-        
+        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
    },
    SecondaryTenant {
-       
+        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
    },
    SecondaryTimeline {
-       
+        io_size_metrics: crate::metrics::StorageIoSizeMetrics,
    },
    Timeline {
-       // We wrap the `Arc<TimelineMetrics>`s inside another Arc to avoid child
+        // We wrap the `Arc<TimelineMetrics>`s inside another Arc to avoid child
        // context creation contending for the ref counters of the Arc<TimelineMetrics>,
        // which are shared among all tasks that operate on the timeline, especially
        // concurrent page_service connections.
        #[allow(clippy::redundant_allocation)]
-        #[allow(dead_code)]
-        arc_arc: Arc<Arc<TimelineMetrics>>,    },
+        arc_arc: Arc<Arc<TimelineMetrics>>,
+    },
    #[cfg(test)]
    UnitTest {
-       
+        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
    },
    DebugTools {
-        
+        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
    },
 }

+static GLOBAL_IO_SIZE_METRICS: Lazy<crate::metrics::StorageIoSizeMetrics> =
+    Lazy::new(|| crate::metrics::StorageIoSizeMetrics::new("*", "*", "*"));

 impl Scope {
    pub(crate) fn new_global() -> Self {
        Scope::Global {
+            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
        }
    }
    /// NB: this allocates, so, use only at relatively long-lived roots, e.g., at start
@@ -170,13 +173,18 @@ impl Scope {
        }
    }
    pub(crate) fn new_secondary_timeline(
-        _tenant_shard_id: &TenantShardId,
-        _timeline_id: &TimelineId,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
    ) -> Self {
        // TODO(https://github.com/neondatabase/neon/issues/11156): secondary timelines have no infrastructure for metrics lifecycle.

+        let tenant_id = tenant_shard_id.tenant_id.to_string();
+        let shard_id = tenant_shard_id.shard_slug().to_string();
+        let timeline_id = timeline_id.to_string();

-        Scope::SecondaryTimeline {  }
+        let io_size_metrics =
+            crate::metrics::StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id);
+        Scope::SecondaryTimeline { io_size_metrics }
    }
    pub(crate) fn new_secondary_tenant(_tenant_shard_id: &TenantShardId) -> Self {
        // Before propagating metrics via RequestContext, the labels were inferred from file path.
@@ -189,19 +197,19 @@ impl Scope {
        // like we do for attached timelines. (We don't have attached-tenant-scoped usage of VirtualFile
        // at this point, so, we were able to completely side-step tenant-scoped stuff there).
        Scope::SecondaryTenant {
-           
+            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
        }
    }
    #[cfg(test)]
    pub(crate) fn new_unit_test() -> Self {
        Scope::UnitTest {
-          
+            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
        }
    }

    pub(crate) fn new_debug_tools() -> Self {
        Scope::DebugTools {
-          
+            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
        }
    }
 }
@@ -515,18 +523,58 @@ impl RequestContext {
        self.access_stats_behavior
    }

+    pub(crate) fn page_content_kind(&self) -> PageContentKind {
+        self.page_content_kind
+    }

    pub(crate) fn read_path_debug(&self) -> bool {
        self.read_path_debug
    }

+    pub(crate) fn io_size_metrics(&self) -> &StorageIoSizeMetrics {
+        match &self.scope {
+            Scope::Global { io_size_metrics } => {
+                let is_unit_test = cfg!(test);
+                let is_regress_test_build = cfg!(feature = "testing");
+                if is_unit_test || is_regress_test_build {
+                    panic!("all VirtualFile instances are timeline-scoped");
+                } else {
+                    use once_cell::sync::Lazy;
+                    use std::sync::Mutex;
+                    use std::time::Duration;
+                    use utils::rate_limit::RateLimit;
+                    static LIMIT: Lazy<Mutex<RateLimit>> =
+                        Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1))));
+                    let mut guard = LIMIT.lock().unwrap();
+                    guard.call2(|rate_limit_stats| {
+                        warn!(
+                            %rate_limit_stats,
+                            backtrace=%std::backtrace::Backtrace::force_capture(),
+                            "all VirtualFile instances are timeline-scoped",
+                        );
+                    });
+
+                    io_size_metrics
+                }
+            }
+            Scope::Timeline { arc_arc } => &arc_arc.storage_io_size,
+            Scope::SecondaryTimeline { io_size_metrics } => io_size_metrics,
+            Scope::SecondaryTenant { io_size_metrics } => io_size_metrics,
+            #[cfg(test)]
+            Scope::UnitTest { io_size_metrics } => io_size_metrics,
+            Scope::DebugTools { io_size_metrics } => io_size_metrics,
+        }
+    }
+
    pub(crate) fn ondemand_download_wait_observe(&self, duration: Duration) {
        if duration == Duration::ZERO {
            return;
        }

        match &self.scope {
-            Scope::Timeline { arc_arc: _ } => {},
+            Scope::Timeline { arc_arc } => arc_arc
+                .wait_ondemand_download_time
+                .observe(self.task_kind, duration),
            _ => {
                use once_cell::sync::Lazy;
                use std::sync::Mutex;
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -27,6 +27,7 @@ use self::list_writer::{DeletionOp, ListWriter, RecoverOp};
 use self::validator::Validator;
 use crate::config::PageServerConf;
 use crate::controller_upcall_client::StorageControllerUpcallApi;
+use crate::metrics;
 use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_timeline_path};
 use crate::tenant::storage_layer::LayerName;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
@@ -162,6 +163,11 @@ struct TenantDeletionList {
    generation: Generation,
 }

+impl TenantDeletionList {
+    pub(crate) fn len(&self) -> usize {
+        self.timelines.values().map(|v| v.len()).sum()
+    }
+}

 /// Files ending with this suffix will be ignored and erased
 /// during recovery as startup.
@@ -461,6 +467,9 @@ impl DeletionQueueClient {
        // they may be historical.
        assert!(!current_generation.is_none());

+        metrics::DELETION_QUEUE
+            .keys_submitted
+            .inc_by(layers.len() as u64);
        self.do_push(
            &self.tx,
            ListWriterQueueMessage::Delete(DeletionOp {
@@ -544,6 +553,9 @@ impl DeletionQueueClient {
        &self,
        objects: Vec<RemotePath>,
    ) -> Result<(), DeletionQueueError> {
+        metrics::DELETION_QUEUE
+            .keys_submitted
+            .inc_by(objects.len() as u64);
        self.executor_tx
            .send(DeleterMessage::Delete(objects))
            .await
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -14,6 +14,7 @@ use tracing::{info, warn};
 use utils::{backoff, pausable_failpoint};

 use super::{DeletionQueueError, FlushOp};
+use crate::metrics;

 const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);

@@ -59,6 +60,10 @@ impl Deleter {
                fail::fail_point!("deletion-queue-before-execute", |_| {
                    info!("Skipping execution, failpoint set");

+                    metrics::DELETION_QUEUE
+                        .remote_errors
+                        .with_label_values(&["failpoint"])
+                        .inc();
                    Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
                });

@@ -85,6 +90,9 @@ impl Deleter {
                Ok(()) => {
                    // Note: we assume that the remote storage layer returns Ok(()) if some
                    // or all of the deleted objects were already gone.
+                    metrics::DELETION_QUEUE
+                        .keys_executed
+                        .inc_by(self.accumulator.len() as u64);
                    info!(
                        "Executed deletion batch {}..{}",
                        self.accumulator
@@ -101,6 +109,10 @@ impl Deleter {
                        return Err(DeletionQueueError::ShuttingDown);
                    }
                    warn!("DeleteObjects request failed: {e:#}, will continue trying");
+                    metrics::DELETION_QUEUE
+                        .remote_errors
+                        .with_label_values(&["execute"])
+                        .inc();
                }
            };
        }
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -25,6 +25,7 @@ use utils::id::TimelineId;
 use super::{DeletionHeader, DeletionList, FlushOp, ValidatorQueueMessage};
 use crate::config::PageServerConf;
 use crate::deletion_queue::TEMP_SUFFIX;
+use crate::metrics;
 use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_layer_path};
 use crate::tenant::storage_layer::LayerName;
 use crate::virtual_file::{MaybeFatalIo, on_fatal_io_error};
@@ -151,7 +152,7 @@ impl ListWriter {
                }
            }
            Err(e) => {
-                
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
                warn!(
                    sequence = self.pending.sequence,
                    "Failed to write deletion list, will retry later ({e:#})"
@@ -179,6 +180,7 @@ impl ListWriter {
                        // This should never happen unless we make a mistake with our serialization.
                        // Ignoring a deletion header is not consequential for correctnes because all deletions
                        // are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up.
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
                        Ok(None)
                    }
                }
@@ -247,6 +249,7 @@ impl ListWriter {
                    .as_str()
            } else {
                warn!("Unexpected key in deletion queue: {basename}");
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
                continue;
            };

@@ -254,6 +257,7 @@ impl ListWriter {
                Ok(s) => s,
                Err(e) => {
                    warn!("Malformed key '{basename}': {e}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
                    continue;
                }
            };
@@ -282,6 +286,7 @@ impl ListWriter {
                    // Drop the list on the floor: any objects it referenced will be left behind
                    // for scrubbing to clean up.  This should never happen unless we have a serialization bug.
                    warn!(sequence = s, "Failed to deserialize deletion list: {e}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
                    continue;
                }
            };
@@ -324,6 +329,9 @@ impl ListWriter {

            // We will drop out of recovery if this fails: it indicates that we are shutting down
            // or the backend has panicked
+            metrics::DELETION_QUEUE
+                .keys_submitted
+                .inc_by(deletion_list.len() as u64);
            self.tx
                .send(ValidatorQueueMessage::Delete(deletion_list))
                .await?;
@@ -345,6 +353,7 @@ impl ListWriter {
                "Failed to create deletion list directory {}, deletions will not be executed ({e})",
                self.conf.deletion_prefix(),
            );
+            metrics::DELETION_QUEUE.unexpected_errors.inc();
            return;
        }

@@ -413,6 +422,7 @@ impl ListWriter {
                            tracing::error!(
                                "Failed to enqueue deletions, leaking objects.  This is a bug."
                            );
+                            metrics::DELETION_QUEUE.unexpected_errors.inc();
                        }
                    }
                }
@@ -440,6 +450,7 @@ impl ListWriter {
                        tracing::error!(
                            "Deletion queue recovery called more than once.  This is a bug."
                        );
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
                        // Non-fatal: although this is a bug, since we did recovery at least once we may proceed.
                        continue;
                    }
@@ -451,6 +462,7 @@ impl ListWriter {
                        info!(
                            "Deletion queue recover aborted, deletion queue will not proceed ({e})"
                        );
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
                        return;
                    } else {
                        self.recovered = true;
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -26,6 +26,7 @@ use super::deleter::DeleterMessage;
 use super::{DeletionHeader, DeletionList, DeletionQueueError, FlushOp, VisibleLsnUpdates};
 use crate::config::PageServerConf;
 use crate::controller_upcall_client::{RetryForeverError, StorageControllerUpcallApi};
+use crate::metrics;
 use crate::virtual_file::MaybeFatalIo;

 // After this length of time, do any validation work that is pending,
@@ -185,6 +186,7 @@ where
                    "Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}",
                    tenant_lsn_state.generation
                );
+                metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
            }
        }

@@ -219,8 +221,11 @@ where

                if !this_list_valid {
                    info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
+                    metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
                    mutated = true;
-                } 
+                } else {
+                    metrics::DELETION_QUEUE.keys_validated.inc_by(tenant.len() as u64);
+                }
                this_list_valid
            });
            list.validated = true;
@@ -232,7 +237,7 @@ where
                    // Highly unexpected.  Could happen if e.g. disk full.
                    // If we didn't save the trimmed list, it is _not_ valid to execute.
                    warn!("Failed to save modified deletion list {list}: {e:#}");
-                    
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();

                    // Rather than have a complex retry process, just drop it and leak the objects,
                    // scrubber will clean up eventually.
@@ -271,7 +276,7 @@ where
                // The save() function logs a warning on error.
                if let Err(e) = header.save(self.conf).await {
                    warn!("Failed to write deletion queue header: {e:#}");
-                    
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
                }
            }
        }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -56,6 +56,7 @@ use utils::completion;
 use utils::id::TimelineId;

 use crate::config::PageServerConf;
+use crate::metrics::disk_usage_based_eviction::METRICS;
 use crate::task_mgr::{self, BACKGROUND_RUNTIME};
 use crate::tenant::mgr::TenantManager;
 use crate::tenant::remote_timeline_client::LayerFileMetadata;
@@ -387,7 +388,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
        }
    };

-
+    METRICS.layers_collected.inc_by(candidates.len() as u64);

    tracing::info!(
        elapsed_ms = collection_time.as_millis(),
@@ -427,7 +428,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    let (evicted_amount, usage_planned) =
        select_victims(&candidates, usage_pre).into_amount_and_planned();

- 
+    METRICS.layers_selected.inc_by(evicted_amount as u64);

    // phase2: evict layers

@@ -456,6 +457,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
            if let Some(next) = next {
                match next {
                    Ok(Ok(file_size)) => {
+                        METRICS.layers_evicted.inc();
                        usage_assumed.add_available_bytes(file_size);
                    }
                    Ok(Err((
@@ -786,6 +788,7 @@ async fn collect_eviction_candidates(
    eviction_order: EvictionOrder,
    cancel: &CancellationToken,
 ) -> anyhow::Result<EvictionCandidates> {
+    const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);

    // get a snapshot of the list of tenants
    let tenants = tenant_manager
@@ -819,7 +822,7 @@ async fn collect_eviction_candidates(
            continue;
        }

-        
+        let started_at = std::time::Instant::now();

        // collect layers from all timelines in this tenant
        //
@@ -914,11 +917,25 @@ async fn collect_eviction_candidates(
                    (partition, candidate)
                });

+        METRICS
+            .tenant_layer_count
+            .observe(tenant_candidates.len() as f64);

        candidates.extend(tenant_candidates);

-       
+        let elapsed = started_at.elapsed();
+        METRICS
+            .tenant_collection_time
+            .observe(elapsed.as_secs_f64());

+        if elapsed > LOG_DURATION_THRESHOLD {
+            tracing::info!(
+                tenant_id=%tenant.tenant_shard_id().tenant_id,
+                shard_id=%tenant.tenant_shard_id().shard_slug(),
+                elapsed_ms = elapsed.as_millis(),
+                "collection took longer than threshold"
+            );
+        }
    }

    // Note: the same tenant ID might be hit twice, if it transitions from attached to
@@ -945,7 +962,7 @@ async fn collect_eviction_candidates(
            layer_info.resident_layers.len()
        );

-       
+        let started_at = std::time::Instant::now();

        layer_info
            .resident_layers
@@ -967,13 +984,28 @@ async fn collect_eviction_candidates(
                        candidate,
                    )
                });
+
+        METRICS
+            .tenant_layer_count
+            .observe(tenant_candidates.len() as f64);
        candidates.extend(tenant_candidates);

        tokio::task::yield_now().await;

-    
+        let elapsed = started_at.elapsed();

-        
+        METRICS
+            .tenant_collection_time
+            .observe(elapsed.as_secs_f64());
+
+        if elapsed > LOG_DURATION_THRESHOLD {
+            tracing::info!(
+                tenant_id=%tenant.tenant_shard_id().tenant_id,
+                shard_id=%tenant.tenant_shard_id().shard_slug(),
+                elapsed_ms = elapsed.as_millis(),
+                "collection took longer than threshold"
+            );
+        }
    }

    debug_assert!(
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -24,6 +24,7 @@ use wal_decoder::models::InterpretedWalRecord;
 use walkdir::WalkDir;

 use crate::context::RequestContext;
+use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::Timeline;
 use crate::walingest::{WalIngest, WalIngestErrorKind};
@@ -323,6 +324,7 @@ async fn import_wal(
                walingest
                    .ingest_record(interpreted, &mut modification, ctx)
                    .await?;
+                WAL_INGEST.records_committed.inc();

                modification.commit(ctx).await?;
                last_lsn = lsn;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -77,6 +77,7 @@ use anyhow::Context;
 use once_cell::sync::OnceCell;

 use crate::context::RequestContext;
+use crate::metrics::{PageCacheSizeMetrics, page_cache_eviction_metrics};
 use crate::virtual_file::{IoBufferMut, IoPageSlice};

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -194,7 +195,7 @@ impl SlotInner {
 }

 pub struct PageCache {
-    immutable_page_maps: [std::sync::RwLock<HashMap<(FileId, u32), usize>>; 16],
+    immutable_page_map: std::sync::RwLock<HashMap<(FileId, u32), usize>>,

    /// The actual buffers with their metadata.
    slots: Box<[Slot]>,
@@ -204,103 +205,8 @@ pub struct PageCache {
    /// Index of the next candidate to evict, for the Clock replacement algorithm.
    /// This is interpreted modulo the page cache size.
    next_evict_slot: AtomicUsize,
-}

-impl PageCache {
-    /// Helper function to determine the shard index based on the low 4 bits of the u32 in the key tuple.
-    fn shard_index(_file_id: &FileId, blkno: u32) -> usize {
-        (blkno & 0xF) as usize
-    }
-
-    /// Search for a page in the cache using the given search key.
-    ///
-    /// Returns the slot index, if any.
-    ///
-    /// NOTE: We don't hold any lock on the mapping on return, so the slot might
-    /// get recycled for an unrelated page immediately after this function
-    /// returns. The caller is responsible for re-checking that the slot still
-    /// contains the page with the same key before using it.
-    ///
-    fn search_mapping(&self, cache_key: &CacheKey) -> Option<usize> {
-        match cache_key {
-            CacheKey::ImmutableFilePage { file_id, blkno } => {
-                let shard_idx = Self::shard_index(file_id, *blkno);
-                let map = self.immutable_page_maps[shard_idx].read().unwrap();
-                Some(*map.get(&(*file_id, *blkno))?)
-            }
-        }
-    }
-
-    ///
-    /// Remove mapping for given key.
-    ///
-    fn remove_mapping(&self, old_key: &CacheKey) {
-        match old_key {
-            CacheKey::ImmutableFilePage { file_id, blkno } => {
-                let shard_idx = Self::shard_index(file_id, *blkno);
-                let mut map = self.immutable_page_maps[shard_idx].write().unwrap();
-                map.remove(&(*file_id, *blkno))
-                    .expect("could not find old key in mapping");
-            }
-        }
-    }
-
-    ///
-    /// Insert mapping for given key.
-    ///
-    /// If a mapping already existed for the given key, returns the slot index
-    /// of the existing mapping and leaves it untouched.
-    fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
-        match new_key {
-            CacheKey::ImmutableFilePage { file_id, blkno } => {
-                let shard_idx = Self::shard_index(file_id, *blkno);
-                let mut map = self.immutable_page_maps[shard_idx].write().unwrap();
-                match map.entry((*file_id, *blkno)) {
-                    Entry::Occupied(entry) => Some(*entry.get()),
-                    Entry::Vacant(entry) => {
-                        entry.insert(slot_idx);
-                        None
-                    }
-                }
-            }
-        }
-    }
-
-    /// Initialize a new page cache
-    ///
-    /// This should be called only once at page server startup.
-    fn new(num_pages: usize) -> Self {
-        assert!(num_pages > 0, "page cache size must be > 0");
-
-        // We could use Vec::leak here, but that potentially also leaks
-        // uninitialized reserved capacity. With into_boxed_slice and Box::leak
-        // this is avoided.
-        let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak();
-
-        let slots = page_buffer
-            .chunks_exact_mut(PAGE_SZ)
-            .map(|chunk| {
-                // SAFETY: Each chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned.
-                let buf = unsafe { IoPageSlice::new_unchecked(chunk.try_into().unwrap()) };
-
-                Slot {
-                    inner: tokio::sync::RwLock::new(SlotInner {
-                        key: None,
-                        buf,
-                        permit: std::sync::Mutex::new(Weak::new()),
-                    }),
-                    usage_count: AtomicU8::new(0),
-                }
-            })
-            .collect();
-
-        Self {
-            immutable_page_maps: Default::default(),
-            slots,
-            next_evict_slot: AtomicUsize::new(0),
-            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
-        }
-    }
+    size_metrics: &'static PageCacheSizeMetrics,
 }

 struct PinnedSlotsPermit {
@@ -508,17 +414,32 @@ impl PageCache {
    async fn lock_for_read(
        &self,
        cache_key: &CacheKey,
-        _ctx: &RequestContext,
+        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
        let mut permit = Some(self.try_get_pinned_slot_permit().await?);

+        let (read_access, hit) = match cache_key {
+            CacheKey::ImmutableFilePage { .. } => (
+                &crate::metrics::PAGE_CACHE
+                    .for_ctx(ctx)
+                    .read_accesses_immutable,
+                &crate::metrics::PAGE_CACHE.for_ctx(ctx).read_hits_immutable,
+            ),
+        };
+        read_access.inc();
+
+        let mut is_first_iteration = true;
        loop {
            // First check if the key already exists in the cache.
            if let Some(read_guard) = self.try_lock_for_read(cache_key, &mut permit).await {
                debug_assert!(permit.is_none());
+                if is_first_iteration {
+                    hit.inc();
+                }
                return Ok(ReadBufResult::Found(read_guard));
            }
            debug_assert!(permit.is_some());
+            is_first_iteration = false;

            // Not found. Find a victim buffer
            let (slot_idx, mut inner) = self
@@ -563,6 +484,63 @@ impl PageCache {
        }
    }

+    //
+    // Section 3: Mapping functions
+    //
+
+    /// Search for a page in the cache using the given search key.
+    ///
+    /// Returns the slot index, if any.
+    ///
+    /// NOTE: We don't hold any lock on the mapping on return, so the slot might
+    /// get recycled for an unrelated page immediately after this function
+    /// returns.  The caller is responsible for re-checking that the slot still
+    /// contains the page with the same key before using it.
+    ///
+    fn search_mapping(&self, cache_key: &CacheKey) -> Option<usize> {
+        match cache_key {
+            CacheKey::ImmutableFilePage { file_id, blkno } => {
+                let map = self.immutable_page_map.read().unwrap();
+                Some(*map.get(&(*file_id, *blkno))?)
+            }
+        }
+    }
+
+    ///
+    /// Remove mapping for given key.
+    ///
+    fn remove_mapping(&self, old_key: &CacheKey) {
+        match old_key {
+            CacheKey::ImmutableFilePage { file_id, blkno } => {
+                let mut map = self.immutable_page_map.write().unwrap();
+                map.remove(&(*file_id, *blkno))
+                    .expect("could not find old key in mapping");
+                self.size_metrics.current_bytes_immutable.sub_page_sz(1);
+            }
+        }
+    }
+
+    ///
+    /// Insert mapping for given key.
+    ///
+    /// If a mapping already existed for the given key, returns the slot index
+    /// of the existing mapping and leaves it untouched.
+    fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
+        match new_key {
+            CacheKey::ImmutableFilePage { file_id, blkno } => {
+                let mut map = self.immutable_page_map.write().unwrap();
+                match map.entry((*file_id, *blkno)) {
+                    Entry::Occupied(entry) => Some(*entry.get()),
+                    Entry::Vacant(entry) => {
+                        entry.insert(slot_idx);
+                        self.size_metrics.current_bytes_immutable.add_page_sz(1);
+                        None
+                    }
+                }
+            }
+        }
+    }
+
    //
    // Section 4: Misc internal helpers
    //
@@ -617,7 +595,11 @@ impl PageCache {
                            // Note that just yielding to tokio during iteration without such
                            // priority boosting is likely counter-productive. We'd just give more opportunities
                            // for B to bump usage count, further starving A.
-                            
+                            page_cache_eviction_metrics::observe(
+                                page_cache_eviction_metrics::Outcome::ItersExceeded {
+                                    iters: iters.try_into().unwrap(),
+                                },
+                            );
                            anyhow::bail!("exceeded evict iter limit");
                        }
                        continue;
@@ -627,12 +609,84 @@ impl PageCache {
                    // remove mapping for old buffer
                    self.remove_mapping(old_key);
                    inner.key = None;
-                    
-                } 
+                    page_cache_eviction_metrics::observe(
+                        page_cache_eviction_metrics::Outcome::FoundSlotEvicted {
+                            iters: iters.try_into().unwrap(),
+                        },
+                    );
+                } else {
+                    page_cache_eviction_metrics::observe(
+                        page_cache_eviction_metrics::Outcome::FoundSlotUnused {
+                            iters: iters.try_into().unwrap(),
+                        },
+                    );
+                }
                return Ok((slot_idx, inner));
            }
        }
    }

+    /// Initialize a new page cache
+    ///
+    /// This should be called only once at page server startup.
+    fn new(num_pages: usize) -> Self {
+        assert!(num_pages > 0, "page cache size must be > 0");
+
+        // We could use Vec::leak here, but that potentially also leaks
+        // uninitialized reserved capacity. With into_boxed_slice and Box::leak
+        // this is avoided.
+        let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak();
+
+        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
+        size_metrics.max_bytes.set_page_sz(num_pages);
+        size_metrics.current_bytes_immutable.set_page_sz(0);
+
+        let slots = page_buffer
+            .chunks_exact_mut(PAGE_SZ)
+            .map(|chunk| {
+                // SAFETY: Each chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned.
+                let buf = unsafe { IoPageSlice::new_unchecked(chunk.try_into().unwrap()) };
+
+                Slot {
+                    inner: tokio::sync::RwLock::new(SlotInner {
+                        key: None,
+                        buf,
+                        permit: std::sync::Mutex::new(Weak::new()),
+                    }),
+                    usage_count: AtomicU8::new(0),
+                }
+            })
+            .collect();
+
+        Self {
+            immutable_page_map: Default::default(),
+            slots,
+            next_evict_slot: AtomicUsize::new(0),
+            size_metrics,
+            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
+        }
+    }
 }

+trait PageSzBytesMetric {
+    fn set_page_sz(&self, count: usize);
+    fn add_page_sz(&self, count: usize);
+    fn sub_page_sz(&self, count: usize);
+}
+
+#[inline(always)]
+fn count_times_page_sz(count: usize) -> u64 {
+    u64::try_from(count).unwrap() * u64::try_from(PAGE_SZ).unwrap()
+}
+
+impl PageSzBytesMetric for metrics::UIntGauge {
+    fn set_page_sz(&self, count: usize) {
+        self.set(count_times_page_sz(count));
+    }
+    fn add_page_sz(&self, count: usize) {
+        self.add(count_times_page_sz(count));
+    }
+    fn sub_page_sz(&self, count: usize) {
+        self.sub(count_times_page_sz(count));
+    }
+}
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -59,7 +59,8 @@ use crate::context::{
    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
 };
 use crate::metrics::{
-    self, GetPageBatchBreakReason, SmgrOpTimer, TimelineMetrics,
+    self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
+    SmgrOpTimer, TimelineMetrics,
 };
 use crate::pgdatadir_mapping::Version;
 use crate::span::{
@@ -274,6 +275,9 @@ async fn page_service_conn_main(
    cancel: CancellationToken,
    gate_guard: GateGuard,
 ) -> ConnectionHandlerResult {
+    let _guard = LIVE_CONNECTIONS
+        .with_label_values(&["page_service"])
+        .guard();

    socket
        .set_nodelay(true)
@@ -637,6 +641,7 @@ impl std::fmt::Display for BatchedPageStreamError {

 struct BatchedGetPageRequest {
    req: PagestreamGetPageRequest,
+    timer: SmgrOpTimer,
    effective_request_lsn: Lsn,
    ctx: RequestContext,
 }
@@ -644,6 +649,7 @@ struct BatchedGetPageRequest {
 #[cfg(feature = "testing")]
 struct BatchedTestRequest {
    req: models::PagestreamTestRequest,
+    timer: SmgrOpTimer,
 }

 /// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum,
@@ -653,13 +659,13 @@ struct BatchedTestRequest {
 enum BatchedFeMessage {
    Exists {
        span: Span,
-       
+        timer: SmgrOpTimer,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamExistsRequest,
    },
    Nblocks {
        span: Span,
-     
+        timer: SmgrOpTimer,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamNblocksRequest,
    },
@@ -671,13 +677,13 @@ enum BatchedFeMessage {
    },
    DbSize {
        span: Span,
-  
+        timer: SmgrOpTimer,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamDbSizeRequest,
    },
    GetSlruSegment {
        span: Span,
-   
+        timer: SmgrOpTimer,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamGetSlruSegmentRequest,
    },
@@ -698,7 +704,27 @@ impl BatchedFeMessage {
        self.into()
    }

-    fn observe_execution_start(&mut self, _at: Instant) {
+    fn observe_execution_start(&mut self, at: Instant) {
+        match self {
+            BatchedFeMessage::Exists { timer, .. }
+            | BatchedFeMessage::Nblocks { timer, .. }
+            | BatchedFeMessage::DbSize { timer, .. }
+            | BatchedFeMessage::GetSlruSegment { timer, .. } => {
+                timer.observe_execution_start(at);
+            }
+            BatchedFeMessage::GetPage { pages, .. } => {
+                for page in pages {
+                    page.timer.observe_execution_start(at);
+                }
+            }
+            #[cfg(feature = "testing")]
+            BatchedFeMessage::Test { requests, .. } => {
+                for req in requests {
+                    req.timer.observe_execution_start(at);
+                }
+            }
+            BatchedFeMessage::RespondError { .. } => {}
+        }
    }

    fn should_break_batch(
@@ -938,7 +964,7 @@ impl PageServerHandler {
                    .await?;
                debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
                let span = tracing::info_span!(parent: &parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
-                record_op_start_and_throttle(
+                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetRelExists,
                    received_at,
@@ -946,7 +972,7 @@ impl PageServerHandler {
                .await?;
                BatchedFeMessage::Exists {
                    span,
-                
+                    timer,
                    shard: shard.downgrade(),
                    req,
                }
@@ -956,7 +982,7 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .await?;
                let span = tracing::info_span!(parent: &parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
-                record_op_start_and_throttle(
+                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetRelSize,
                    received_at,
@@ -964,7 +990,7 @@ impl PageServerHandler {
                .await?;
                BatchedFeMessage::Nblocks {
                    span,
-                   
+                    timer,
                    shard: shard.downgrade(),
                    req,
                }
@@ -974,7 +1000,7 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .await?;
                let span = tracing::info_span!(parent: &parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
-                record_op_start_and_throttle(
+                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetDbSize,
                    received_at,
@@ -982,7 +1008,7 @@ impl PageServerHandler {
                .await?;
                BatchedFeMessage::DbSize {
                    span,
-              
+                    timer,
                    shard: shard.downgrade(),
                    req,
                }
@@ -992,7 +1018,7 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .await?;
                let span = tracing::info_span!(parent: &parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
-                record_op_start_and_throttle(
+                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetSlruSegment,
                    received_at,
@@ -1000,7 +1026,7 @@ impl PageServerHandler {
                .await?;
                BatchedFeMessage::GetSlruSegment {
                    span,
-               
+                    timer,
                    shard: shard.downgrade(),
                    req,
                }
@@ -1099,7 +1125,7 @@ impl PageServerHandler {
                // request handler log messages contain the request-specific fields.
                let span = mkspan!(shard.tenant_shard_id.shard_slug());

-                record_op_start_and_throttle(
+                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetPageAtLsn,
                    received_at,
@@ -1132,6 +1158,7 @@ impl PageServerHandler {
                    shard: shard.downgrade(),
                    pages: smallvec::smallvec![BatchedGetPageRequest {
                        req,
+                        timer,
                        effective_request_lsn,
                        ctx,
                    }],
@@ -1147,12 +1174,13 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .await?;
                let span = tracing::info_span!(parent: &parent_span, "handle_test_request", shard_id = %shard.tenant_shard_id.shard_slug());
-                record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
+                let timer =
+                    record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
                        .await?;
                BatchedFeMessage::Test {
                    span,
                    shard: shard.downgrade(),
-                    requests: vec![BatchedTestRequest { req,  }],
+                    requests: vec![BatchedTestRequest { req, timer }],
                }
            }
        };
@@ -1253,7 +1281,7 @@ impl PageServerHandler {

        // Dispatch the batch to the appropriate request handler.
        let log_slow_name = batch.as_static_str();
-        let (handler_results, span) = {
+        let (mut handler_results, span) = {
            // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and
            // won't fit on the stack.
            let mut boxpinned =
@@ -1283,31 +1311,31 @@ impl PageServerHandler {
        // call, which (all unmeasured) adds syscall overhead but reduces time to first byte
        // and avoids building up a "giant" contiguous userspace buffer to hold the entire response.
        // TODO: vectored socket IO would be great, but pgb_writer doesn't support that.
-        // let flush_timers = {
-        //     let flushing_start_time = Instant::now();
-        //     let mut flush_timers = Vec::with_capacity(handler_results.len());
-        //     for handler_result in &mut handler_results {
-        //         let flush_timer = match handler_result {
-        //             Ok((_, timer)) => Some(
-        //                 timer
-        //                     .observe_execution_end(flushing_start_time)
-        //                     .expect("we are the first caller"),
-        //             ),
-        //             Err(_) => {
-        //                 // TODO: measure errors
-        //                 None
-        //             }
-        //         };
-        //         flush_timers.push(flush_timer);
-        //     }
-        //     assert_eq!(flush_timers.len(), handler_results.len());
-        //     flush_timers
-        // };
+        let flush_timers = {
+            let flushing_start_time = Instant::now();
+            let mut flush_timers = Vec::with_capacity(handler_results.len());
+            for handler_result in &mut handler_results {
+                let flush_timer = match handler_result {
+                    Ok((_, timer)) => Some(
+                        timer
+                            .observe_execution_end(flushing_start_time)
+                            .expect("we are the first caller"),
+                    ),
+                    Err(_) => {
+                        // TODO: measure errors
+                        None
+                    }
+                };
+                flush_timers.push(flush_timer);
+            }
+            assert_eq!(flush_timers.len(), handler_results.len());
+            flush_timers
+        };

        // Map handler result to protocol behavior.
        // Some handler errors cause exit from pagestream protocol.
        // Other handler errors are sent back as an error message and we stay in pagestream protocol.
-        for handler_result in handler_results.into_iter() {
+        for (handler_result, flushing_timer) in handler_results.into_iter().zip(flush_timers) {
            let response_msg = match handler_result {
                Err(e) => match &e.err {
                    PageStreamError::Shutdown => {
@@ -1339,7 +1367,7 @@ impl PageServerHandler {
                        })
                    }
                },
-                Ok((response_msg, )) => response_msg,
+                Ok((response_msg, _op_timer_already_observed)) => response_msg,
            };

            //
@@ -1353,17 +1381,17 @@ impl PageServerHandler {
            failpoint_support::sleep_millis_async!("before-pagestream-msg-flush", cancel);

            // what we want to do
-           
+            let socket_fd = pgb_writer.socket_fd;
            let flush_fut = pgb_writer.flush();
            // metric for how long flushing takes
-            // let flush_fut = match flushing_timer {
-            //     Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure(
-            //         Instant::now(),
-            //         flush_fut,
-            //         socket_fd,
-            //     )),
-            //     None => futures::future::Either::Right(flush_fut),
-            // };
+            let flush_fut = match flushing_timer {
+                Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure(
+                    Instant::now(),
+                    flush_fut,
+                    socket_fd,
+                )),
+                None => futures::future::Either::Right(flush_fut),
+            };
            // do it while respecting cancellation
            let _: () = async move {
                tokio::select! {
@@ -1393,7 +1421,7 @@ impl PageServerHandler {
        ctx: &RequestContext,
    ) -> Result<
        (
-            Vec<Result<(PagestreamBeMessage, ), BatchedPageStreamError>>,
+            Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>>,
            Span,
        ),
        QueryError,
@@ -1409,7 +1437,7 @@ impl PageServerHandler {
        Ok(match batch {
            BatchedFeMessage::Exists {
                span,
-               
+                timer,
                shard,
                req,
            } => {
@@ -1420,7 +1448,7 @@ impl PageServerHandler {
                        self.handle_get_rel_exists_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
-                            .map(|msg| (msg, ))
+                            .map(|msg| (msg, timer))
                            .map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
                    ],
                    span,
@@ -1428,7 +1456,7 @@ impl PageServerHandler {
            }
            BatchedFeMessage::Nblocks {
                span,
-           
+                timer,
                shard,
                req,
            } => {
@@ -1439,7 +1467,7 @@ impl PageServerHandler {
                        self.handle_get_nblocks_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
-                            .map(|msg| (msg, ))
+                            .map(|msg| (msg, timer))
                            .map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
                    ],
                    span,
@@ -1475,6 +1503,7 @@ impl PageServerHandler {
            }
            BatchedFeMessage::DbSize {
                span,
+                timer,
                shard,
                req,
            } => {
@@ -1485,7 +1514,7 @@ impl PageServerHandler {
                        self.handle_db_size_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
-                            .map(|msg| (msg, ))
+                            .map(|msg| (msg, timer))
                            .map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
                    ],
                    span,
@@ -1493,6 +1522,7 @@ impl PageServerHandler {
            }
            BatchedFeMessage::GetSlruSegment {
                span,
+                timer,
                shard,
                req,
            } => {
@@ -1503,7 +1533,7 @@ impl PageServerHandler {
                        self.handle_get_slru_segment_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
-                            .map(|msg| (msg, ))
+                            .map(|msg| (msg, timer))
                            .map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
                    ],
                    span,
@@ -2149,11 +2179,15 @@ impl PageServerHandler {
        timeline: &Timeline,
        requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
        io_concurrency: IoConcurrency,
-        _batch_break_reason: GetPageBatchBreakReason,
+        batch_break_reason: GetPageBatchBreakReason,
        ctx: &RequestContext,
-    ) -> Vec<Result<(PagestreamBeMessage, ), BatchedPageStreamError>> {
+    ) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
        debug_assert_current_span_has_tenant_and_timeline_id();

+        timeline
+            .query_metrics
+            .observe_getpage_batch_start(requests.len(), batch_break_reason);
+
        // If a page trace is running, submit an event for this request.
        if let Some(page_trace) = timeline.page_trace.load().as_ref() {
            let time = SystemTime::now();
@@ -2253,7 +2287,7 @@ impl PageServerHandler {
                                req: req.req,
                                page,
                            }),
-                            
+                            req.timer,
                        )
                    })
                    .map_err(|e| BatchedPageStreamError {
@@ -2298,7 +2332,7 @@ impl PageServerHandler {
        timeline: &Timeline,
        requests: Vec<BatchedTestRequest>,
        _ctx: &RequestContext,
-    ) -> Vec<Result<(PagestreamBeMessage,), BatchedPageStreamError>> {
+    ) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
        // real requests would do something with the timeline
        let mut results = Vec::with_capacity(requests.len());
        for _req in requests.iter() {
@@ -2324,6 +2358,7 @@ impl PageServerHandler {
                            PagestreamBeMessage::Test(models::PagestreamTestResponse {
                                req: req.req.clone(),
                            }),
+                            req.timer,
                        )
                    })
                    .map_err(|e| BatchedPageStreamError {
@@ -2359,6 +2394,7 @@ impl PageServerHandler {
        full_backup: bool,
        gzip: bool,
        replica: bool,
+        lazy_slru_download: bool,
        ctx: &RequestContext,
    ) -> Result<(), QueryError>
    where
@@ -2426,6 +2462,7 @@ impl PageServerHandler {
                prev_lsn,
                full_backup,
                replica,
+                lazy_slru_download,
                &ctx,
            )
            .await
@@ -2449,6 +2486,7 @@ impl PageServerHandler {
                    prev_lsn,
                    full_backup,
                    replica,
+                    lazy_slru_download,
                    &ctx,
                )
                .await
@@ -2466,6 +2504,7 @@ impl PageServerHandler {
                    prev_lsn,
                    full_backup,
                    replica,
+                    lazy_slru_download,
                    &ctx,
                )
                .await
@@ -2515,7 +2554,7 @@ impl PageServerHandler {
    }
 }

-/// `basebackup tenant timeline [lsn] [--gzip] [--replica]`
+/// `basebackup tenant timeline [lsn] [--gzip] [--replica] [--lazy-slru-download]`
 #[derive(Debug, Clone, Eq, PartialEq)]
 struct BaseBackupCmd {
    tenant_id: TenantId,
@@ -2523,6 +2562,7 @@ struct BaseBackupCmd {
    lsn: Option<Lsn>,
    gzip: bool,
    replica: bool,
+    lazy_slru_download: bool,
 }

 /// `fullbackup tenant timeline [lsn] [prev_lsn]`
@@ -2655,6 +2695,7 @@ impl BaseBackupCmd {

        let mut gzip = false;
        let mut replica = false;
+        let mut lazy_slru_download = false;

        for &param in &parameters[flags_parse_from..] {
            match param {
@@ -2670,6 +2711,12 @@ impl BaseBackupCmd {
                    }
                    replica = true
                }
+                "--lazy-slru-download" => {
+                    if lazy_slru_download {
+                        bail!("duplicate parameter for basebackup command: {param}")
+                    }
+                    lazy_slru_download = true
+                }
                _ => bail!("invalid parameter for basebackup command: {param}"),
            }
        }
@@ -2679,6 +2726,7 @@ impl BaseBackupCmd {
            lsn,
            gzip,
            replica,
+            lazy_slru_download,
        })
    }
 }
@@ -2878,7 +2926,12 @@ where
                    .record("timeline_id", field::display(timeline_id));

                self.check_permission(Some(tenant_id))?;
-                
+                let command_kind = match protocol_version {
+                    PagestreamProtocolVersion::V2 => ComputeCommandKind::PageStreamV2,
+                    PagestreamProtocolVersion::V3 => ComputeCommandKind::PageStreamV3,
+                };
+                COMPUTE_COMMANDS_COUNTERS.for_command(command_kind).inc();
+
                self.handle_pagerequests(pgb, tenant_id, timeline_id, protocol_version, ctx)
                    .await?;
            }
@@ -2888,6 +2941,7 @@ where
                lsn,
                gzip,
                replica,
+                lazy_slru_download,
            }) => {
                tracing::Span::current()
                    .record("tenant_id", field::display(tenant_id))
@@ -2895,7 +2949,10 @@ where

                self.check_permission(Some(tenant_id))?;

-                
+                COMPUTE_COMMANDS_COUNTERS
+                    .for_command(ComputeCommandKind::Basebackup)
+                    .inc();
+                let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording();
                let res = async {
                    self.handle_basebackup_request(
                        pgb,
@@ -2906,6 +2963,7 @@ where
                        false,
                        gzip,
                        replica,
+                        lazy_slru_download,
                        &ctx,
                    )
                    .await?;
@@ -2913,7 +2971,7 @@ where
                    Result::<(), QueryError>::Ok(())
                }
                .await;
-
+                metric_recording.observe(&res);
                res?;
            }
            // same as basebackup, but result includes relational data as well
@@ -2929,7 +2987,9 @@ where

                self.check_permission(Some(tenant_id))?;

-                
+                COMPUTE_COMMANDS_COUNTERS
+                    .for_command(ComputeCommandKind::Fullbackup)
+                    .inc();

                // Check that the timeline exists
                self.handle_basebackup_request(
@@ -2941,6 +3001,7 @@ where
                    true,
                    false,
                    false,
+                    false,
                    &ctx,
                )
                .await?;
@@ -2963,7 +3024,9 @@ where

                self.check_permission(Some(tenant_shard_id.tenant_id))?;

-                
+                COMPUTE_COMMANDS_COUNTERS
+                    .for_command(ComputeCommandKind::LeaseLsn)
+                    .inc();

                match self
                    .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
@@ -3073,7 +3136,8 @@ mod tests {
                timeline_id,
                lsn: None,
                gzip: false,
-                replica: false
+                replica: false,
+                lazy_slru_download: false
            })
        );
        let cmd =
@@ -3085,7 +3149,8 @@ mod tests {
                timeline_id,
                lsn: None,
                gzip: true,
-                replica: false
+                replica: false,
+                lazy_slru_download: false
            })
        );
        let cmd =
@@ -3097,7 +3162,8 @@ mod tests {
                timeline_id,
                lsn: None,
                gzip: false,
-                replica: false
+                replica: false,
+                lazy_slru_download: false
            })
        );
        let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} 0/16ABCDE"))
@@ -3109,7 +3175,8 @@ mod tests {
                timeline_id,
                lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
                gzip: false,
-                replica: false
+                replica: false,
+                lazy_slru_download: false
            })
        );
        let cmd = PageServiceCmd::parse(&format!(
@@ -3123,7 +3190,23 @@ mod tests {
                timeline_id,
                lsn: None,
                gzip: true,
-                replica: true
+                replica: true,
+                lazy_slru_download: false
+            })
+        );
+        let cmd = PageServiceCmd::parse(&format!(
+            "basebackup {tenant_id} {timeline_id} --replica --gzip --lazy-slru-download"
+        ))
+        .unwrap();
+        assert_eq!(
+            cmd,
+            PageServiceCmd::BaseBackup(BaseBackupCmd {
+                tenant_id,
+                timeline_id,
+                lsn: None,
+                gzip: true,
+                replica: true,
+                lazy_slru_download: true
            })
        );
        let cmd = PageServiceCmd::parse(&format!(
@@ -3137,7 +3220,8 @@ mod tests {
                timeline_id,
                lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
                gzip: true,
-                replica: true
+                replica: true,
+                lazy_slru_download: false
            })
        );
        let cmd = PageServiceCmd::parse(&format!("fullbackup {tenant_id} {timeline_id}")).unwrap();
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -36,13 +36,15 @@ use tracing::{debug, info, info_span, trace, warn};
 use utils::bin_ser::{BeSer, DeserializeError};
 use utils::lsn::Lsn;
 use utils::pausable_failpoint;
-use wal_decoder::serialized_batch::SerializedValueBatch ;
+use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};

 use super::tenant::{PageReconstructError, Timeline};
 use crate::aux_file;
 use crate::context::{PerfInstrumentFutureExt, RequestContext};
 use crate::keyspace::{KeySpace, KeySpaceAccum};
-
+use crate::metrics::{
+    RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
+};
 use crate::span::{
    debug_assert_current_span_has_tenant_and_timeline_id,
    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
@@ -1030,16 +1032,19 @@ impl Timeline {
            )
            .await?;
        let mut result = HashMap::new();
-
+        let mut sz = 0;
        for (_, v) in kv {
            let v = v?;
            let v = aux_file::decode_file_value_bytes(&v)
                .context("value decode")
                .map_err(PageReconstructError::Other)?;
            for (fname, content) in v {
+                sz += fname.len();
+                sz += content.len();
                result.insert(fname, content);
            }
        }
+        self.aux_file_size_estimator.on_initial(sz);
        Ok(result)
    }

@@ -1310,12 +1315,12 @@ impl Timeline {
        let rel_size_cache = self.rel_size_cache.read().unwrap();
        if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
            if lsn >= *cached_lsn {
-                
+                RELSIZE_CACHE_HITS.inc();
                return Some(*nblocks);
            }
-            
+            RELSIZE_CACHE_MISSES_OLD.inc();
        }
-       
+        RELSIZE_CACHE_MISSES.inc();
        None
    }

@@ -1340,21 +1345,25 @@ impl Timeline {
            }
            hash_map::Entry::Vacant(entry) => {
                entry.insert((lsn, nblocks));
-               
+                RELSIZE_CACHE_ENTRIES.inc();
            }
        }
    }

    /// Store cached relation size
-    pub fn set_cached_rel_size(&self, _tag: RelTag, _lsn: Lsn, _nblocks: BlockNumber) {
-       
-        
+    pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
+        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
+        if rel_size_cache.map.insert(tag, (lsn, nblocks)).is_none() {
+            RELSIZE_CACHE_ENTRIES.inc();
+        }
    }

    /// Remove cached relation size
-    pub fn remove_cached_rel_size(&self, _tag: &RelTag) {
-        
-        
+    pub fn remove_cached_rel_size(&self, tag: &RelTag) {
+        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
+        if rel_size_cache.map.remove(tag).is_some() {
+            RELSIZE_CACHE_ENTRIES.dec();
+        }
    }
 }

@@ -1429,7 +1438,25 @@ impl DatadirModification<'_> {
            .is_some_and(|b| b.has_data())
    }

-    
+    /// Returns statistics about the currently pending modifications.
+    pub(crate) fn stats(&self) -> DatadirModificationStats {
+        let mut stats = DatadirModificationStats::default();
+        for (_, _, value) in self.pending_metadata_pages.values().flatten() {
+            match value {
+                Value::Image(_) => stats.metadata_images += 1,
+                Value::WalRecord(r) if r.will_init() => stats.metadata_images += 1,
+                Value::WalRecord(_) => stats.metadata_deltas += 1,
+            }
+        }
+        for valuemeta in self.pending_data_batch.iter().flat_map(|b| &b.metadata) {
+            match valuemeta {
+                ValueMeta::Serialized(s) if s.will_init => stats.data_images += 1,
+                ValueMeta::Serialized(_) => stats.data_deltas += 1,
+                ValueMeta::Observed(_) => {}
+            }
+        }
+        stats
+    }

    /// Set the current lsn
    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> {
@@ -2304,15 +2331,20 @@ impl DatadirModification<'_> {
        }
        let mut new_files = other_files;
        match (modifying_file, content.is_empty()) {
-            (Some(_old_content), false) => {
-                
+            (Some(old_content), false) => {
+                self.tline
+                    .aux_file_size_estimator
+                    .on_update(old_content.len(), content.len());
                new_files.push((path, content));
            }
-            (Some(_old_content), true) => {
-               
+            (Some(old_content), true) => {
+                self.tline
+                    .aux_file_size_estimator
+                    .on_remove(old_content.len());
                // not adding the file key to the final `new_files` vec.
            }
            (None, false) => {
+                self.tline.aux_file_size_estimator.on_add(content.len());
                new_files.push((path, content));
            }
            // Compute may request delete of old version of pgstat AUX file if new one exceeds size limit.
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -83,6 +83,11 @@ use crate::context::RequestContextBuilder;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
 use crate::l0_flush::L0FlushGlobalState;
+use crate::metrics::{
+    BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
+    INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_STATE_METRIC,
+    TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
+};
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::LocationMode;
 use crate::tenant::gc_result::GcResult;
@@ -1353,7 +1358,7 @@ impl Tenant {
                let starting_up = init_order.is_some();
                scopeguard::defer! {
                    if starting_up {
-                       
+                        TENANT.startup_complete.inc();
                    }
                }

@@ -1456,7 +1461,7 @@ impl Tenant {

                let preload = match &mode {
                    SpawnMode::Eager | SpawnMode::Lazy => {
-                      
+                        let _preload_timer = TENANT.preload.start_timer();
                        let res = tenant_clone
                            .preload(&remote_storage, task_mgr::shutdown_token())
                            .await;
@@ -1478,7 +1483,7 @@ impl Tenant {
                // We will time the duration of the attach phase unless this is a creation (attach will do no work)
                let attach_start = std::time::Instant::now();
                let attached = {
-                
+                    let _attach_timer = Some(TENANT.attach.start_timer());
                    tenant_clone.attach(preload, &ctx).await
                };
                let attach_duration = attach_start.elapsed();
@@ -3180,7 +3185,7 @@ impl Tenant {
        self.compaction_circuit_breaker
            .lock()
            .unwrap()
-            .success();
+            .success(&CIRCUIT_BREAKERS_UNBROKEN);

        match has_pending {
            true => Ok(CompactionOutcome::Pending),
@@ -3201,13 +3206,13 @@ impl Tenant {
                self.compaction_circuit_breaker
                    .lock()
                    .unwrap()
-                    .fail( err);
+                    .fail(&CIRCUIT_BREAKERS_BROKEN, err);
            }
            CompactionError::Other(err) => {
                self.compaction_circuit_breaker
                    .lock()
                    .unwrap()
-                    .fail( err);
+                    .fail(&CIRCUIT_BREAKERS_BROKEN, err);
            }
            CompactionError::AlreadyRunning(_) => {}
        }
@@ -3387,7 +3392,7 @@ impl Tenant {
                    "activation attempt finished"
                );

-              
+                TENANT.activation.observe(elapsed.as_secs_f64());
            });
        }
    }
@@ -3512,6 +3517,7 @@ impl Tenant {
        // Wait for any in-flight operations to complete
        self.gate.close().await;

+        remove_tenant_metrics(&self.tenant_shard_id);

        Ok(())
    }
@@ -3844,13 +3850,33 @@ impl Tenant {
    }

    pub(crate) fn get_sizes(&self) -> TopTenantShardItem {
-         TopTenantShardItem {
+        let mut result = TopTenantShardItem {
            id: self.tenant_shard_id,
            resident_size: 0,
            physical_size: 0,
            max_logical_size: 0,
            max_logical_size_per_shard: 0,
+        };
+
+        for timeline in self.timelines.lock().unwrap().values() {
+            result.resident_size += timeline.metrics.resident_physical_size_gauge.get();
+
+            result.physical_size += timeline
+                .remote_client
+                .metrics
+                .remote_physical_size_gauge
+                .get();
+            result.max_logical_size = std::cmp::max(
+                result.max_logical_size,
+                timeline.metrics.current_logical_size_gauge.get(),
+            );
        }
+
+        result.max_logical_size_per_shard = result
+            .max_logical_size
+            .div_ceil(self.tenant_shard_id.shard_count.count() as u64);
+
+        result
    }
 }

@@ -4217,15 +4243,55 @@ impl Tenant {
        let (state, mut rx) = watch::channel(state);

        tokio::spawn(async move {
-            
-            loop {
+            // reflect tenant state in metrics:
+            // - global per tenant state: TENANT_STATE_METRIC
+            // - "set" of broken tenants: BROKEN_TENANTS_SET
+            //
+            // set of broken tenants should not have zero counts so that it remains accessible for
+            // alerting.

+            let tid = tenant_shard_id.to_string();
+            let shard_id = tenant_shard_id.shard_slug().to_string();
+            let set_key = &[tid.as_str(), shard_id.as_str()][..];
+
+            fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
+                ([state.into()], matches!(state, TenantState::Broken { .. }))
+            }
+
+            let mut tuple = inspect_state(&rx.borrow_and_update());
+
+            let is_broken = tuple.1;
+            let mut counted_broken = if is_broken {
+                // add the id to the set right away, there should not be any updates on the channel
+                // after before tenant is removed, if ever
+                BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
+                true
+            } else {
+                false
+            };
+
+            loop {
+                let labels = &tuple.0;
+                let current = TENANT_STATE_METRIC.with_label_values(labels);
+                current.inc();

                if rx.changed().await.is_err() {
-                   
+                    // tenant has been dropped
+                    current.dec();
+                    drop(BROKEN_TENANTS_SET.remove_label_values(set_key));
                    break;
                }

+                current.dec();
+                tuple = inspect_state(&rx.borrow_and_update());
+
+                let is_broken = tuple.1;
+                if is_broken && !counted_broken {
+                    counted_broken = true;
+                    // insert the tenant_id (back) into the set while avoiding needless counter
+                    // access
+                    BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
+                }
            }
        });

@@ -4600,6 +4666,10 @@ impl Tenant {
                let now = SystemTime::now();
                target.leases.retain(|_, lease| !lease.is_expired(&now));

+                timeline
+                    .metrics
+                    .valid_lsn_lease_count_gauge
+                    .set(target.leases.len() as u64);

                // Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR
                if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
@@ -4609,6 +4679,22 @@ impl Tenant {
                    }
                }

+                // Update metrics that depend on GC state
+                timeline
+                    .metrics
+                    .archival_size
+                    .set(if target.within_ancestor_pitr {
+                        timeline.metrics.current_logical_size_gauge.get()
+                    } else {
+                        0
+                    });
+                timeline.metrics.pitr_history_size.set(
+                    timeline
+                        .get_last_record_lsn()
+                        .checked_sub(target.cutoffs.time)
+                        .unwrap_or(Lsn(0))
+                        .0,
+                );

                // Apply the cutoffs we found to the Timeline's GcInfo.  Why might we _not_ have cutoffs for a timeline?
                // - this timeline was created while we were finding cutoffs
@@ -5358,6 +5444,10 @@ impl Tenant {
        // Only shard zero should be calculating synthetic sizes
        debug_assert!(self.shard_identity.is_shard_zero());

+        TENANT_SYNTHETIC_SIZE_METRIC
+            .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
+            .unwrap()
+            .set(size);
    }

    pub fn cached_synthetic_size(&self) -> u64 {
@@ -5434,7 +5524,16 @@ impl Tenant {
    /// than they report here, due to layer eviction.  Tenants with many active branches may
    /// actually use more than they report here.
    pub(crate) fn local_storage_wanted(&self) -> u64 {
-        1000
+        let timelines = self.timelines.lock().unwrap();
+
+        // Heuristic: we use the max() of the timelines' visible sizes, rather than the sum.  This
+        // reflects the observation that on tenants with multiple large branches, typically only one
+        // of them is used actively enough to occupy space on disk.
+        timelines
+            .values()
+            .map(|t| t.metrics.visible_physical_size_gauge.get())
+            .max()
+            .unwrap_or(0)
    }

    /// Builds a new tenant manifest, and uploads it if it differs from the last-known tenant
@@ -5518,11 +5617,16 @@ async fn run_initdb(
    );

    let _permit = {
-       
+        let _timer = INITDB_SEMAPHORE_ACQUISITION_TIME.start_timer();
        INIT_DB_SEMAPHORE.acquire().await
    };

+    CONCURRENT_INITDBS.inc();
+    scopeguard::defer! {
+        CONCURRENT_INITDBS.dec();
+    }

+    let _timer = INITDB_RUN_TIME.start_timer();
    let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
        superuser: &conf.superuser,
        locale: &conf.locale,
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -37,63 +37,6 @@ pub struct CompressionInfo {
    pub compressed_size: Option<usize>,
 }

-/// A blob header, with header+data length and compression info.
-///
-/// TODO: use this more widely, and add an encode() method too.
-/// TODO: document the header format.
-#[derive(Clone, Copy, Default)]
-pub struct Header {
-    pub header_len: usize,
-    pub data_len: usize,
-    pub compression_bits: u8,
-}
-
-impl Header {
-    /// Decodes a header from a byte slice.
-    pub fn decode(bytes: &[u8]) -> Result<Self, std::io::Error> {
-        let Some(&first_header_byte) = bytes.first() else {
-            return Err(std::io::Error::new(
-                std::io::ErrorKind::InvalidData,
-                "zero-length blob header",
-            ));
-        };
-
-        // If the first bit is 0, this is just a 1-byte length prefix up to 128 bytes.
-        if first_header_byte < 0x80 {
-            return Ok(Self {
-                header_len: 1, // by definition
-                data_len: first_header_byte as usize,
-                compression_bits: BYTE_UNCOMPRESSED,
-            });
-        }
-
-        // Otherwise, this is a 4-byte header containing compression information and length.
-        const HEADER_LEN: usize = 4;
-        let mut header_buf: [u8; HEADER_LEN] = bytes[0..HEADER_LEN].try_into().map_err(|_| {
-            std::io::Error::new(
-                std::io::ErrorKind::InvalidData,
-                format!("blob header too short: {bytes:?}"),
-            )
-        })?;
-
-        // TODO: verify the compression bits and convert to an enum.
-        let compression_bits = header_buf[0] & LEN_COMPRESSION_BIT_MASK;
-        header_buf[0] &= !LEN_COMPRESSION_BIT_MASK;
-        let data_len = u32::from_be_bytes(header_buf) as usize;
-
-        Ok(Self {
-            header_len: HEADER_LEN,
-            data_len,
-            compression_bits,
-        })
-    }
-
-    /// Returns the total header+data length.
-    pub fn total_len(&self) -> usize {
-        self.header_len + self.data_len
-    }
-}
-
 impl BlockCursor<'_> {
    /// Read a blob into a new buffer.
    pub async fn read_blob(
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -44,7 +44,7 @@ use crate::controller_upcall_client::{
 };
 use crate::deletion_queue::DeletionQueueClient;
 use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
-use crate::metrics::TENANT_MANAGER as METRICS;
+use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
 use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind};
 use crate::tenant::config::{
    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
@@ -519,7 +519,7 @@ pub async fn init_tenant_mgr(
        tenant_configs.len(),
        conf.concurrent_tenant_warmup.initial_permits()
    );
-
+    TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);

    // Accumulate futures for writing tenant configs, so that we can execute in parallel
    let mut config_write_futs = Vec::new();
@@ -2177,7 +2177,9 @@ impl TenantManager {
                        // we would use if not doing any eviction.
                        progress.bytes_total
                    } else {
-                        42
+                        // In the absence of heatmap info, assume that the secondary location simply
+                        // needs as much space as it is currently using.
+                        secondary.resident_size_metric.get()
                    }
                }
            }
@@ -2528,7 +2530,7 @@ impl SlotGuard {
                Ok(())
            }
            None => {
-              
+                METRICS.unexpected_errors.inc();
                error!(
                    tenant_shard_id = %self.tenant_shard_id,
                    "Missing InProgress marker during tenant upsert, this is a bug."
@@ -2538,7 +2540,7 @@ impl SlotGuard {
                ))
            }
            Some(slot) => {
-               
+                METRICS.unexpected_errors.inc();
                error!(tenant_shard_id=%self.tenant_shard_id, "Unexpected contents of TenantSlot during upsert, this is a bug.  Contents: {:?}", slot);
                Err(TenantSlotUpsertError::InternalError(
                    "Unexpected contents of TenantSlot".into(),
@@ -2619,7 +2621,7 @@ impl Drop for SlotGuard {
        match m.entry(self.tenant_shard_id) {
            Entry::Occupied(mut entry) => {
                if !matches!(entry.get(), TenantSlot::InProgress(_)) {
-                    
+                    METRICS.unexpected_errors.inc();
                    error!(tenant_shard_id=%self.tenant_shard_id, "Unexpected contents of TenantSlot during drop, this is a bug.  Contents: {:?}", entry.get());
                }

@@ -2634,7 +2636,7 @@ impl Drop for SlotGuard {
                }
            }
            Entry::Vacant(_) => {
-                
+                METRICS.unexpected_errors.inc();
                error!(
                    tenant_shard_id = %self.tenant_shard_id,
                    "Missing InProgress marker during SlotGuard drop, this is a bug."
@@ -2694,7 +2696,7 @@ fn tenant_map_acquire_slot_impl(
    mode: TenantSlotAcquireMode,
 ) -> Result<SlotGuard, TenantSlotError> {
    use TenantSlotAcquireMode::*;
-  
+    METRICS.tenant_slot_writes.inc();

    let mut locked = tenants.write().unwrap();
    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug());
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -223,8 +223,9 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
 use crate::metrics::{
-    MeasureRemoteOp, 
-    RemoteOpFileKind, RemoteOpKind, 
+    MeasureRemoteOp, REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
+    RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
+    RemoteTimelineClientMetricsCallTrackSize,
 };
 use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind, shutdown_token};
 use crate::tenant::metadata::TimelineMetadata;
@@ -356,6 +357,8 @@ pub(crate) struct RemoteTimelineClient {

    upload_queue: Mutex<UploadQueue>,

+    pub(crate) metrics: Arc<RemoteTimelineClientMetrics>,
+
    storage_impl: GenericRemoteStorage,

    deletion_queue_client: DeletionQueueClient,
@@ -402,6 +405,10 @@ impl RemoteTimelineClient {
            storage_impl: remote_storage,
            deletion_queue_client,
            upload_queue: Mutex::new(UploadQueue::Uninitialized),
+            metrics: Arc::new(RemoteTimelineClientMetrics::new(
+                &tenant_shard_id,
+                &timeline_id,
+            )),
            config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(location_conf)),
            cancel: CancellationToken::new(),
        }
@@ -590,13 +597,21 @@ impl RemoteTimelineClient {
            .map_err(|_| UploadQueueNotReadyError)
    }

-    fn update_remote_physical_size_gauge(&self, _current_remote_index_part: Option<&IndexPart>) {
-
-        
+    fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
+        let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
+            current_remote_index_part
+                .layer_metadata
+                .values()
+                .map(|ilmd| ilmd.file_size)
+                .sum()
+        } else {
+            0
+        };
+        self.metrics.remote_physical_size_gauge.set(size);
    }

    pub fn get_remote_physical_size(&self) -> u64 {
-  0
+        self.metrics.remote_physical_size_gauge.get()
    }

    //
@@ -611,6 +626,13 @@ impl RemoteTimelineClient {
        &self,
        cancel: &CancellationToken,
    ) -> Result<MaybeDeletedIndexPart, DownloadError> {
+        let _unfinished_gauge_guard = self.metrics.call_begin(
+            &RemoteOpFileKind::Index,
+            &RemoteOpKind::Download,
+            crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
+                reason: "no need for a downloads gauge",
+            },
+        );

        let (index_part, index_generation, index_last_modified) = download::download_index_part(
            &self.storage_impl,
@@ -623,7 +645,7 @@ impl RemoteTimelineClient {
            Option::<TaskKind>::None,
            RemoteOpFileKind::Index,
            RemoteOpKind::Download,
-        
+            Arc::clone(&self.metrics),
        )
        .await?;

@@ -698,7 +720,13 @@ impl RemoteTimelineClient {
        ctx: &RequestContext,
    ) -> Result<u64, DownloadError> {
        let downloaded_size = {
-        
+            let _unfinished_gauge_guard = self.metrics.call_begin(
+                &RemoteOpFileKind::Layer,
+                &RemoteOpKind::Download,
+                crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
+                    reason: "no need for a downloads gauge",
+                },
+            );
            download::download_layer_file(
                self.conf,
                &self.storage_impl,
@@ -715,11 +743,13 @@ impl RemoteTimelineClient {
                Some(ctx.task_kind()),
                RemoteOpFileKind::Layer,
                RemoteOpKind::Download,
-          
+                Arc::clone(&self.metrics),
            )
            .await?
        };

+        REMOTE_ONDEMAND_DOWNLOADED_LAYERS.inc();
+        REMOTE_ONDEMAND_DOWNLOADED_BYTES.inc_by(downloaded_size);

        Ok(downloaded_size)
    }
@@ -997,6 +1027,7 @@ impl RemoteTimelineClient {
        let op = UploadOp::UploadMetadata {
            uploaded: Box::new(index_part.clone()),
        };
+        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
        upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;

@@ -1234,6 +1265,7 @@ impl RemoteTimelineClient {
        );

        let op = UploadOp::UploadLayer(layer, metadata, None);
+        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
    }

@@ -1410,6 +1442,7 @@ impl RemoteTimelineClient {
        let op = UploadOp::Delete(Delete {
            layers: with_metadata,
        });
+        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
    }

@@ -2147,7 +2180,7 @@ impl RemoteTimelineClient {
                        Some(TaskKind::RemoteUploadTask),
                        RemoteOpFileKind::Layer,
                        RemoteOpKind::Upload,
-                      
+                        Arc::clone(&self.metrics),
                    )
                    .await
                }
@@ -2164,7 +2197,7 @@ impl RemoteTimelineClient {
                        Some(TaskKind::RemoteUploadTask),
                        RemoteOpFileKind::Index,
                        RemoteOpKind::Upload,
-                    
+                        Arc::clone(&self.metrics),
                    )
                    .await;
                    if res.is_ok() {
@@ -2310,7 +2343,10 @@ impl RemoteTimelineClient {
                    upload_queue.clean.1 = Some(task.task_id);

                    let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
-                   
+                    self.metrics
+                        .projected_remote_consistent_lsn_gauge
+                        .set(lsn.0);
+
                    if self.generation.is_none() {
                        // Legacy mode: skip validating generation
                        upload_queue.visible_remote_consistent_lsn.store(lsn);
@@ -2351,6 +2387,64 @@ impl RemoteTimelineClient {
                .await;
        }

+        self.metric_end(&task.op);
+        for coalesced_op in &task.coalesced_ops {
+            self.metric_end(coalesced_op);
+        }
+    }
+
+    fn metric_impl(
+        &self,
+        op: &UploadOp,
+    ) -> Option<(
+        RemoteOpFileKind,
+        RemoteOpKind,
+        RemoteTimelineClientMetricsCallTrackSize,
+    )> {
+        use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize;
+        let res = match op {
+            UploadOp::UploadLayer(_, m, _) => (
+                RemoteOpFileKind::Layer,
+                RemoteOpKind::Upload,
+                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
+            ),
+            UploadOp::UploadMetadata { .. } => (
+                RemoteOpFileKind::Index,
+                RemoteOpKind::Upload,
+                DontTrackSize {
+                    reason: "metadata uploads are tiny",
+                },
+            ),
+            UploadOp::Delete(_delete) => (
+                RemoteOpFileKind::Layer,
+                RemoteOpKind::Delete,
+                DontTrackSize {
+                    reason: "should we track deletes? positive or negative sign?",
+                },
+            ),
+            UploadOp::Barrier(..) | UploadOp::Shutdown => {
+                // we do not account these
+                return None;
+            }
+        };
+        Some(res)
+    }
+
+    fn metric_begin(&self, op: &UploadOp) {
+        let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
+            Some(x) => x,
+            None => return,
+        };
+        let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes);
+        guard.will_decrement_manually(); // in metric_end(), see right below
+    }
+
+    fn metric_end(&self, op: &UploadOp) {
+        let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
+            Some(x) => x,
+            None => return,
+        };
+        self.metrics.call_end(&file_kind, &op_kind, track_bytes);
    }

    /// Close the upload queue for new operations and cancel queued operations.
@@ -2430,6 +2524,7 @@ impl RemoteTimelineClient {

                // Tear down queued ops
                for op in qi.queued_operations.into_iter() {
+                    self.metric_end(&op);
                    // Dropping UploadOp::Barrier() here will make wait_completion() return with an Err()
                    // which is exactly what we want to happen.
                    drop(op);
@@ -2739,6 +2834,10 @@ mod tests {
                storage_impl: self.harness.remote_storage.clone(),
                deletion_queue_client: self.harness.deletion_queue.new_client(),
                upload_queue: Mutex::new(UploadQueue::Uninitialized),
+                metrics: Arc::new(RemoteTimelineClientMetrics::new(
+                    &self.harness.tenant_shard_id,
+                    &TIMELINE_ID,
+                )),
                config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(&location_conf)),
                cancel: CancellationToken::new(),
            })
@@ -2965,7 +3064,99 @@ mod tests {
        );
    }

-    
+    #[tokio::test]
+    async fn bytes_unfinished_gauge_for_layer_file_uploads() {
+        // Setup
+
+        let TestSetup {
+            harness,
+            tenant: _tenant,
+            timeline,
+            ..
+        } = TestSetup::new("metrics").await.unwrap();
+        let client = &timeline.remote_client;
+
+        let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
+        let local_path = local_layer_path(
+            harness.conf,
+            &timeline.tenant_shard_id,
+            &timeline.timeline_id,
+            &layer_file_name_1,
+            &harness.generation,
+        );
+        let content_1 = dummy_contents("foo");
+        std::fs::write(&local_path, &content_1).unwrap();
+
+        let layer_file_1 = Layer::for_resident(
+            harness.conf,
+            &timeline,
+            local_path,
+            layer_file_name_1.clone(),
+            LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard),
+        );
+
+        #[derive(Debug, PartialEq, Clone, Copy)]
+        struct BytesStartedFinished {
+            started: Option<usize>,
+            finished: Option<usize>,
+        }
+        impl std::ops::Add for BytesStartedFinished {
+            type Output = Self;
+            fn add(self, rhs: Self) -> Self::Output {
+                Self {
+                    started: self.started.map(|v| v + rhs.started.unwrap_or(0)),
+                    finished: self.finished.map(|v| v + rhs.finished.unwrap_or(0)),
+                }
+            }
+        }
+        let get_bytes_started_stopped = || {
+            let started = client
+                .metrics
+                .get_bytes_started_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
+                .map(|v| v.try_into().unwrap());
+            let stopped = client
+                .metrics
+                .get_bytes_finished_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
+                .map(|v| v.try_into().unwrap());
+            BytesStartedFinished {
+                started,
+                finished: stopped,
+            }
+        };
+
+        // Test
+        tracing::info!("now doing actual test");
+
+        let actual_a = get_bytes_started_stopped();
+
+        client
+            .schedule_layer_file_upload(layer_file_1.clone())
+            .unwrap();
+
+        let actual_b = get_bytes_started_stopped();
+
+        client.wait_completion().await.unwrap();
+
+        let actual_c = get_bytes_started_stopped();
+
+        // Validate
+
+        let expected_b = actual_a
+            + BytesStartedFinished {
+                started: Some(content_1.len()),
+                // assert that the _finished metric is created eagerly so that subtractions work on first sample
+                finished: Some(0),
+            };
+        assert_eq!(actual_b, expected_b);
+
+        let expected_c = actual_a
+            + BytesStartedFinished {
+                started: Some(content_1.len()),
+                finished: Some(content_1.len()),
+            };
+        assert_eq!(actual_c, expected_c);
+    }
+
    async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart {
        // An empty IndexPart, just sufficient to ensure deserialization will succeed
        let example_index_part = IndexPart::example();
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -6,6 +6,7 @@ mod scheduler;
 use std::sync::Arc;
 use std::time::SystemTime;

+use metrics::UIntGauge;
 use pageserver_api::models;
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use remote_storage::GenericRemoteStorage;
@@ -25,6 +26,7 @@ use super::span::debug_assert_current_span_has_tenant_id;
 use super::storage_layer::LayerName;
 use crate::context::RequestContext;
 use crate::disk_usage_eviction_task::DiskUsageEvictionInfo;
+use crate::metrics::{SECONDARY_HEATMAP_TOTAL_SIZE, SECONDARY_RESIDENT_PHYSICAL_SIZE};
 use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind};

 enum DownloadCommand {
@@ -107,7 +109,12 @@ pub(crate) struct SecondaryTenant {

    // Public state indicating overall progress of downloads relative to the last heatmap seen
    pub(crate) progress: std::sync::Mutex<models::SecondaryProgress>,
-   
+
+    // Sum of layer sizes on local disk
+    pub(super) resident_size_metric: UIntGauge,
+
+    // Sum of layer sizes in the most recently downloaded heatmap
+    pub(super) heatmap_total_size_metric: UIntGauge,
 }

 impl SecondaryTenant {
@@ -117,8 +124,16 @@ impl SecondaryTenant {
        tenant_conf: pageserver_api::models::TenantConfig,
        config: &SecondaryLocationConfig,
    ) -> Arc<Self> {
-    
-    
+        let tenant_id = tenant_shard_id.tenant_id.to_string();
+        let shard_id = format!("{}", tenant_shard_id.shard_slug());
+        let resident_size_metric = SECONDARY_RESIDENT_PHYSICAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id])
+            .unwrap();
+
+        let heatmap_total_size_metric = SECONDARY_HEATMAP_TOTAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id])
+            .unwrap();
+
        Arc::new(Self {
            tenant_shard_id,
            // todo: shall we make this a descendent of the
@@ -135,10 +150,14 @@ impl SecondaryTenant {

            progress: std::sync::Mutex::default(),

+            resident_size_metric,
+            heatmap_total_size_metric,
        })
    }

-    
+    pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
+        self.tenant_shard_id
+    }

    pub(crate) async fn shutdown(&self) {
        self.cancel.cancel();
@@ -150,10 +169,15 @@ impl SecondaryTenant {

        // Metrics are subtracted from and/or removed eagerly.
        // Deletions are done in the background via [`BackgroundPurges::spawn`].
+        let tenant_id = self.tenant_shard_id.tenant_id.to_string();
+        let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
+        let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
+        let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
+
        self.detail
            .lock()
            .unwrap()
-            .drain_timelines(&self.tenant_shard_id);
+            .drain_timelines(&self.tenant_shard_id, &self.resident_size_metric);
    }

    pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) {
@@ -231,7 +255,7 @@ impl SecondaryTenant {
            // of the cache.
            let mut detail = this.detail.lock().unwrap();
            if let Some(removed) =
-                detail.evict_layer(name, &timeline_id, now)
+                detail.evict_layer(name, &timeline_id, now, &this.resident_size_metric)
            {
                // We might race with removal of the same layer during downloads, so finding the layer we
                // were trying to remove is optional.  Only issue the disk I/O to remove it if we found it.
@@ -245,9 +269,10 @@ impl SecondaryTenant {
    /// Exhaustive check that incrementally updated metrics match the actual state.
    #[cfg(feature = "testing")]
    fn validate_metrics(&self) {
-        
+        let detail = self.detail.lock().unwrap();
+        let resident_size = detail.total_resident_size();

-        
+        assert_eq!(resident_size, self.resident_size_metric.get());
    }

    #[cfg(not(feature = "testing"))]
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -4,9 +4,11 @@ use std::str::FromStr;
 use std::sync::Arc;
 use std::time::{Duration, Instant, SystemTime};

+use crate::metrics::{STORAGE_IO_SIZE, StorageIoSizeOperation};
 use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
 use futures::Future;
+use metrics::UIntGauge;
 use pageserver_api::models::SecondaryProgress;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::{DownloadError, DownloadKind, DownloadOpts, Etag, GenericRemoteStorage};
@@ -31,6 +33,7 @@ use crate::context::RequestContext;
 use crate::disk_usage_eviction_task::{
    DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, finite_f32,
 };
+use crate::metrics::SECONDARY_MODE;
 use crate::tenant::config::SecondaryLocationConfig;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::ephemeral_file::is_ephemeral_file;
@@ -117,6 +120,9 @@ impl OnDiskState {
            .fatal_err("Deleting secondary layer")
    }

+    pub(crate) fn file_size(&self) -> u64 {
+        self.metadata.file_size
+    }
 }

 pub(super) struct SecondaryDetailTimeline {
@@ -169,9 +175,13 @@ impl SecondaryDetailTimeline {
    pub(super) fn remove_layer(
        &mut self,
        name: &LayerName,
+        resident_metric: &UIntGauge,
    ) -> Option<OnDiskState> {
-        self.on_disk_layers.remove(name)
-        
+        let removed = self.on_disk_layers.remove(name);
+        if let Some(removed) = &removed {
+            resident_metric.sub(removed.file_size());
+        }
+        removed
    }

    /// `local_path`
@@ -181,6 +191,7 @@ impl SecondaryDetailTimeline {
        tenant_shard_id: &TenantShardId,
        timeline_id: &TimelineId,
        touched: &HeatMapLayer,
+        resident_metric: &UIntGauge,
        local_path: F,
    ) where
        F: FnOnce() -> Utf8PathBuf,
@@ -200,6 +211,7 @@ impl SecondaryDetailTimeline {
                    touched.access_time,
                    local_path(),
                ));
+                resident_metric.add(touched.metadata.file_size);
            }
        }
    }
@@ -255,16 +267,28 @@ impl SecondaryDetail {
        }
    }

+    #[cfg(feature = "testing")]
+    pub(crate) fn total_resident_size(&self) -> u64 {
+        self.timelines
+            .values()
+            .map(|tl| {
+                tl.on_disk_layers
+                    .values()
+                    .map(|v| v.metadata.file_size)
+                    .sum::<u64>()
+            })
+            .sum::<u64>()
+    }

    pub(super) fn evict_layer(
        &mut self,
        name: LayerName,
        timeline_id: &TimelineId,
        now: SystemTime,
-
+        resident_metric: &UIntGauge,
    ) -> Option<OnDiskState> {
        let timeline = self.timelines.get_mut(timeline_id)?;
-        let removed = timeline.remove_layer(&name);
+        let removed = timeline.remove_layer(&name, resident_metric);
        if removed.is_some() {
            timeline.evicted_at.insert(name, now);
        }
@@ -273,21 +297,52 @@ impl SecondaryDetail {

    pub(super) fn remove_timeline(
        &mut self,
-        _tenant_shard_id: &TenantShardId,
+        tenant_shard_id: &TenantShardId,
        timeline_id: &TimelineId,
+        resident_metric: &UIntGauge,
    ) {
-        self.timelines.remove(timeline_id);
-        
+        let removed = self.timelines.remove(timeline_id);
+        if let Some(removed) = removed {
+            Self::clear_timeline_metrics(tenant_shard_id, timeline_id, removed, resident_metric);
+        }
    }

    pub(super) fn drain_timelines(
        &mut self,
-        _tenant_shard_id: &TenantShardId,
-
+        tenant_shard_id: &TenantShardId,
+        resident_metric: &UIntGauge,
    ) {
-        
+        for (timeline_id, removed) in self.timelines.drain() {
+            Self::clear_timeline_metrics(tenant_shard_id, &timeline_id, removed, resident_metric);
+        }
    }

+    fn clear_timeline_metrics(
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+        detail: SecondaryDetailTimeline,
+        resident_metric: &UIntGauge,
+    ) {
+        resident_metric.sub(
+            detail
+                .on_disk_layers
+                .values()
+                .map(|l| l.metadata.file_size)
+                .sum(),
+        );
+
+        let shard_id = format!("{}", tenant_shard_id.shard_slug());
+        let tenant_id = tenant_shard_id.tenant_id.to_string();
+        let timeline_id = timeline_id.to_string();
+        for op in StorageIoSizeOperation::VARIANTS {
+            let _ = STORAGE_IO_SIZE.remove_label_values(&[
+                op,
+                tenant_id.as_str(),
+                shard_id.as_str(),
+                timeline_id.as_str(),
+            ]);
+        }
+    }

    /// Additionally returns the total number of layers, used for more stable relative access time
    /// based eviction.
@@ -742,6 +797,7 @@ impl<'a> TenantDownloader<'a> {
                        tenant_shard_id,
                        last_heatmap,
                        timeline,
+                        &self.secondary_state.resident_size_metric,
                        ctx,
                    )
                    .await;
@@ -864,7 +920,11 @@ impl<'a> TenantDownloader<'a> {
            bytes_downloaded: 0,
        };

-       
+        // Also expose heatmap bytes_total as a metric
+        self.secondary_state
+            .heatmap_total_size_metric
+            .set(heatmap_stats.bytes);
+
        // Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
        let mut delete_layers = Vec::new();
        let mut delete_timelines = Vec::new();
@@ -931,6 +991,7 @@ impl<'a> TenantDownloader<'a> {
                detail.remove_timeline(
                    self.secondary_state.get_tenant_shard_id(),
                    delete_timeline,
+                    &self.secondary_state.resident_size_metric,
                );
            }
        }
@@ -949,7 +1010,7 @@ impl<'a> TenantDownloader<'a> {
            let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else {
                continue;
            };
-            timeline_state.remove_layer(&layer_name);
+            timeline_state.remove_layer(&layer_name, &self.secondary_state.resident_size_metric);
        }

        for timeline_id in delete_timelines {
@@ -1016,7 +1077,7 @@ impl<'a> TenantDownloader<'a> {
        .await
        .ok_or_else(|| UpdateError::Cancelled)
        .and_then(|x| x)
-        .inspect(|_|{} )
+        .inspect(|_| SECONDARY_MODE.download_heatmap.inc())
    }

    /// Download heatmap layers that are not present on local disk, or update their
@@ -1191,6 +1252,7 @@ impl<'a> TenantDownloader<'a> {
                    tenant_shard_id,
                    &timeline_id,
                    &t,
+                    &self.secondary_state.resident_size_metric,
                    || {
                        local_layer_path(
                            self.conf,
@@ -1302,6 +1364,7 @@ impl<'a> TenantDownloader<'a> {
            progress.layers_downloaded += 1;
        }

+        SECONDARY_MODE.download_layer.inc();

        Ok(Some(layer))
    }
@@ -1313,6 +1376,7 @@ async fn init_timeline_state(
    tenant_shard_id: &TenantShardId,
    last_heatmap: Option<&HeatMapTimeline>,
    heatmap: &HeatMapTimeline,
+    resident_metric: &UIntGauge,
    ctx: &RequestContext,
 ) -> SecondaryDetailTimeline {
    let ctx = ctx.with_scope_secondary_timeline(tenant_shard_id, &heatmap.timeline_id);
@@ -1416,6 +1480,7 @@ async fn init_timeline_state(
                                tenant_shard_id,
                                &heatmap.timeline_id,
                                remote_meta,
+                                resident_metric,
                                || file_path,
                            );
                        }
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -20,6 +20,7 @@ use super::scheduler::{
 };
 use super::{CommandRequest, SecondaryTenantError, UploadCommand};
 use crate::TEMP_FILE_SUFFIX;
+use crate::metrics::SECONDARY_MODE;
 use crate::tenant::Tenant;
 use crate::tenant::config::AttachmentMode;
 use crate::tenant::mgr::{GetTenantError, TenantManager};
@@ -220,10 +221,14 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
            // Guard for the barrier in [`WriteInProgress`]
            let _completion = completion;

-            
+            let started_at = Instant::now();
            let uploaded = match upload_tenant_heatmap(remote_storage, &tenant, last_upload.clone()).await {
                Ok(UploadHeatmapOutcome::Uploaded(uploaded)) => {
-                    
+                    let duration = Instant::now().duration_since(started_at);
+                    SECONDARY_MODE
+                        .upload_heatmap_duration
+                        .observe(duration.as_secs_f64());
+                    SECONDARY_MODE.upload_heatmap.inc();
                    Some(uploaded)
                }
                Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_upload,
@@ -232,8 +237,11 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
                        "Failed to upload heatmap for tenant {}: {e:#}",
                        tenant.get_tenant_shard_id(),
                    );
-                   
-
+                    let duration = Instant::now().duration_since(started_at);
+                    SECONDARY_MODE
+                        .upload_heatmap_duration
+                        .observe(duration.as_secs_f64());
+                    SECONDARY_MODE.upload_heatmap_errors.inc();
                    last_upload
                }
                Err(UploadHeatmapError::Cancelled) => {
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -882,6 +882,14 @@ impl ImageLayerWriterInner {
    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
        let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32;

+        // Calculate compression ratio
+        let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
+        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
+        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED
+            .inc_by(self.uncompressed_bytes_eligible);
+        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen);
+        crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);
+
        let mut file = self.blob_writer.into_inner();

        // Write out the index
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -32,6 +32,7 @@ use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 // avoid binding to Write (conflicts with std::io::Write)
 // while being able to use std::fmt::Write's methods
+use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo};
 use crate::tenant::timeline::GetVectoredError;
@@ -306,7 +307,11 @@ impl GlobalResourceUnits {
            }
        };

-       
+        // This is a sloppy update: concurrent updates to the counter will race, and the exact
+        // value of the metric might not be the exact latest value of GLOBAL_RESOURCES::dirty_bytes.
+        // That's okay: as long as the metric contains some recent value, it doesn't have to always
+        // be literally the last update.
+        TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);

        self.dirty_bytes = size;

--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -231,7 +231,9 @@ impl Layer {

        debug_assert!(owner.0.needs_download_blocking().unwrap().is_none());

-        
+        timeline
+            .metrics
+            .resident_physical_size_add(metadata.file_size);

        ResidentLayer { downloaded, owner }
    }
@@ -524,6 +526,12 @@ impl Layer {
                }
            }

+            // Update the timeline's visible bytes count
+            if let Some(tl) = self.0.timeline.upgrade() {
+                tl.metrics
+                    .visible_physical_size_gauge
+                    .add(self.0.desc.file_size)
+            }
        }
    }

@@ -532,10 +540,23 @@ impl Layer {
        use LayerVisibilityHint::*;
        match (old_visibility, visibility) {
            (Visible, Covered) => {
-                
+                // Subtract this layer's contribution to the visible size metric
+                if let Some(tl) = self.0.timeline.upgrade() {
+                    debug_assert!(
+                        tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
+                    );
+                    tl.metrics
+                        .visible_physical_size_gauge
+                        .sub(self.0.desc.file_size)
+                }
            }
            (Covered, Visible) => {
-                
+                // Add this layer's contribution to the visible size metric
+                if let Some(tl) = self.0.timeline.upgrade() {
+                    tl.metrics
+                        .visible_physical_size_gauge
+                        .add(self.0.desc.file_size)
+                }
            }
            (Covered, Covered) | (Visible, Visible) => {
                // no change
@@ -588,6 +609,7 @@ impl ResidentOrWantedEvicted {
            ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
            ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
                Some(strong) => {
+                    LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();

                    *self = ResidentOrWantedEvicted::Resident(strong.clone());

@@ -719,8 +741,17 @@ enum Status {

 impl Drop for LayerInner {
    fn drop(&mut self) {
-        
-        let timeline: Option<Arc<Timeline>> = self.timeline.upgrade();
+        // if there was a pending eviction, mark it cancelled here to balance metrics
+        if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit()
+        {
+            // eviction has already been started
+            LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
+
+            // eviction request is intentionally not honored as no one is present to wait for it
+            // and we could be delaying shutdown for nothing.
+        }
+
+        let timeline = self.timeline.upgrade();

        if let Some(timeline) = timeline.as_ref() {
            // Only need to decrement metrics if the timeline still exists: otherwise
@@ -728,6 +759,13 @@ impl Drop for LayerInner {
            timeline.metrics.dec_layer(&self.desc);

            if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
+                debug_assert!(
+                    timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
+                );
+                timeline
+                    .metrics
+                    .visible_physical_size_gauge
+                    .sub(self.desc.file_size);
            }
        }

@@ -739,6 +777,7 @@ impl Drop for LayerInner {

        let path = std::mem::take(&mut self.path);
        let file_name = self.layer_desc().layer_name();
+        let file_size = self.layer_desc().file_size;
        let meta = self.metadata();
        let status = self.status.take();

@@ -747,13 +786,20 @@ impl Drop for LayerInner {

            // carry this until we are finished for [`Layer::wait_drop`] support
            let _status = status;
+
            let Some(timeline) = timeline else {
                // no need to nag that timeline is gone: under normal situation on
                // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
+                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
                return;
            };

-           match std::fs::remove_file(path) {
+            let Ok(_guard) = timeline.gate.enter() else {
+                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
+                return;
+            };
+
+            let removed = match std::fs::remove_file(path) {
                Ok(()) => true,
                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                    // until we no longer do detaches by removing all local files before removing the
@@ -764,16 +810,34 @@ impl Drop for LayerInner {
                    // layers.
                    false
                }
-                Err(_e) => {
+                Err(e) => {
+                    tracing::error!("failed to remove wanted deleted layer: {e}");
+                    LAYER_IMPL_METRICS.inc_delete_removes_failed();
                    false
                }
            };

-            
-            let _a=timeline
+            if removed {
+                timeline.metrics.resident_physical_size_sub(file_size);
+            }
+            let res = timeline
                .remote_client
                .schedule_deletion_of_unlinked(vec![(file_name, meta)]);

+            if let Err(e) = res {
+                // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
+                // demonstrating this deadlock (without spawn_blocking): stop will drop
+                // queued items, which will have ResidentLayer's, and those drops would try
+                // to re-entrantly lock the RemoteTimelineClient inner state.
+                if !timeline.is_active() {
+                    tracing::info!("scheduling deletion on drop failed: {e:#}");
+                } else {
+                    tracing::warn!("scheduling deletion on drop failed: {e:#}");
+                }
+                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
+            } else {
+                LAYER_IMPL_METRICS.inc_completed_deletes();
+            }
        });
    }
 }
@@ -804,6 +868,12 @@ impl LayerInner {
        // This object acts as a RAII guard on these metrics: increment on construction
        timeline.metrics.inc_layer(&desc);

+        // New layers are visible by default. This metric is later updated on drop or in set_visibility
+        timeline
+            .metrics
+            .visible_physical_size_gauge
+            .add(desc.file_size);
+
        LayerInner {
            conf,
            path: local_path,
@@ -824,9 +894,13 @@ impl LayerInner {
    }

    fn delete_on_drop(&self) {
-                    let _a=self.wanted_deleted
+        let res =
+            self.wanted_deleted
                .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);

+        if res.is_ok() {
+            LAYER_IMPL_METRICS.inc_started_deletes();
+        }
    }

    /// Cancellation safe, however dropping the future and calling this method again might result
@@ -864,6 +938,12 @@ impl LayerInner {
            // drop the DownloadedLayer outside of the holding the guard
            drop(strong);

+            // idea here is that only one evicter should ever get to witness a strong reference,
+            // which means whenever get_or_maybe_download upgrades a weak, it must mark up a
+            // cancelled eviction and signal us, like it currently does.
+            //
+            // a second concurrent evict_and_wait will not see a strong reference.
+            LAYER_IMPL_METRICS.inc_started_evictions();
        }

        let changed = rx.changed();
@@ -903,13 +983,15 @@ impl LayerInner {
            // get_or_init_detached can:
            // - be fast (mutex lock) OR uncontested semaphore permit acquire
            // - be slow (wait for semaphore permit or closing)
+            let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
+
            let locked = self
                .inner
                .get_or_init_detached_measured(Some(&mut wait_for_download_recorder))
                .await
                .map(|mut guard| guard.get_and_upgrade().ok_or(guard));

-
+            scopeguard::ScopeGuard::into_inner(init_cancelled);

            match locked {
                // this path could had been a RwLock::read
@@ -922,7 +1004,8 @@ impl LayerInner {
                    // note that we also have dropped the Guard; this is fine, because we just made
                    // a state change and are holding a strong reference to be returned.
                    self.status.as_ref().unwrap().send_replace(Status::Resident);
-        
+                    LAYER_IMPL_METRICS
+                        .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);

                    return Ok(strong);
                }
@@ -949,7 +1032,8 @@ impl LayerInner {
            .upgrade()
            .ok_or(DownloadError::TimelineShutdown)?;

-        
+        // count cancellations, which currently remain largely unexpected
+        let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());

        // check if we really need to be downloaded: this can happen if a read access won the
        // semaphore before eviction.
@@ -961,6 +1045,7 @@ impl LayerInner {
            .await
            .map_err(DownloadError::PreStatFailed);

+        scopeguard::ScopeGuard::into_inner(init_cancelled);

        let needs_download = needs_download?;

@@ -971,7 +1056,7 @@ impl LayerInner {
            self.failpoint(failpoints::FailpointKind::AfterDeterminingLayerNeedsNoDownload)
                .await?;

-            
+            LAYER_IMPL_METRICS.inc_init_needed_no_download();

            return Ok(self.initialize_after_layer_is_on_disk(permit));
        };
@@ -1012,13 +1097,13 @@ impl LayerInner {
        async move {
            tracing::info!(%reason, "downloading on-demand");

-            
+            let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
            let res = self
                .download_init_and_wait(timeline, permit, ctx.attached_child())
                .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
                .await?;

-      
+            scopeguard::ScopeGuard::into_inner(init_cancelled);
            Ok(res)
        }
        .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
@@ -1036,6 +1121,7 @@ impl LayerInner {
                    "unexpectedly on-demand downloading for task kind {:?}",
                    ctx.task_kind()
                );
+                crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();

                let really_error =
                    matches!(b, Error) && !self.conf.ondemand_download_behavior_treat_error_as_warn;
@@ -1087,7 +1173,20 @@ impl LayerInner {

                let res = this.download_and_init(timeline, permit, &ctx).await;

-                let _a =tx.send(res);
+                if let Err(res) = tx.send(res) {
+                    match res {
+                        Ok(_res) => {
+                            tracing::debug!("layer initialized, but caller has been cancelled");
+                            LAYER_IMPL_METRICS.inc_init_completed_without_requester();
+                        }
+                        Err(e) => {
+                            tracing::info!(
+                                "layer file download failed, and caller has been cancelled: {e:?}"
+                            );
+                            LAYER_IMPL_METRICS.inc_download_failed_without_requester();
+                        }
+                    }
+                }
            }
            .in_current_span(),
        );
@@ -1139,9 +1238,21 @@ impl LayerInner {
                    }
                };
                tracing::info!(size=%self.desc.file_size, %latency_millis, "on-demand download successful");
-    
+                timeline
+                    .metrics
+                    .resident_physical_size_add(self.desc.file_size);
                self.consecutive_failures.store(0, Ordering::Relaxed);

+                let since_last_eviction = self
+                    .last_evicted_at
+                    .lock()
+                    .unwrap()
+                    .take()
+                    .map(|ts| ts.elapsed());
+                if let Some(since_last_eviction) = since_last_eviction {
+                    LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
+                }
+
                self.access_stats.record_residence_event();

                Ok(self.initialize_after_layer_is_on_disk(permit))
@@ -1296,8 +1407,12 @@ impl LayerInner {

            tracing::debug!("eviction started");

-           let _a = self.wait_for_turn_and_evict(only_version).await;
-            
+            let res = self.wait_for_turn_and_evict(only_version).await;
+            // metrics: ignore the Ok branch, it is not done yet
+            if let Err(e) = res {
+                tracing::debug!(res=?Err::<(), _>(&e), "eviction completed");
+                LAYER_IMPL_METRICS.inc_eviction_cancelled(e);
+            }
        };

        Self::spawn(start_evicting.instrument(span));
@@ -1417,13 +1532,21 @@ impl LayerInner {
        Self::spawn_blocking(move || {
            let _span = span.entered();

-            let res = self.evict_blocking( &gate, &permit);
+            let res = self.evict_blocking(&timeline, &gate, &permit);

            let waiters = self.inner.initializer_count();

-            
+            if waiters > 0 {
+                LAYER_IMPL_METRICS.inc_evicted_with_waiters();
+            }
+
            let completed_in = spawned_at.elapsed();
-           
+            LAYER_IMPL_METRICS.record_time_to_evict(completed_in);
+
+            match res {
+                Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
+                Err(e) => LAYER_IMPL_METRICS.inc_eviction_cancelled(e),
+            }

            tracing::debug!(?res, elapsed_ms=%completed_in.as_millis(), %waiters, "eviction completed");
        });
@@ -1434,6 +1557,7 @@ impl LayerInner {
    /// This is blocking only to do just one spawn_blocking hop compared to multiple via tokio::fs.
    fn evict_blocking(
        &self,
+        timeline: &Timeline,
        _gate: &gate::GateGuard,
        _permit: &heavier_once_cell::InitPermit,
    ) -> Result<(), EvictionCancelled> {
@@ -1446,7 +1570,17 @@ impl LayerInner {
                    Ok(elapsed) => {
                        let accessed_and_visible = self.access_stats.accessed()
                            && self.access_stats.visibility() == LayerVisibilityHint::Visible;
-                        
+                        if accessed_and_visible {
+                            // Only layers used for reads contribute to our "low residence" metric that is used
+                            // to detect thrashing.  Layers promoted for other reasons (e.g. compaction) are allowed
+                            // to be rapidly evicted without contributing to this metric.
+                            timeline
+                                .metrics
+                                .evictions_with_low_residence_duration
+                                .read()
+                                .unwrap()
+                                .observe(elapsed);
+                        }

                        tracing::info!(
                            residence_millis = elapsed.as_millis(),
@@ -1458,6 +1592,10 @@ impl LayerInner {
                        tracing::info!("evicted layer after unknown residence period");
                    }
                }
+                timeline.metrics.evictions.inc();
+                timeline
+                    .metrics
+                    .resident_physical_size_sub(self.desc.file_size);
            }
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                tracing::error!(
@@ -1674,7 +1812,13 @@ impl DownloadedLayer {
            match res {
                Ok(layer) => Ok(layer),
                Err(err) => {
-                    
+                    LAYER_IMPL_METRICS.inc_permanent_loading_failures();
+                    // We log this message once over the lifetime of `Self`
+                    // => Ok and good to log backtrace and path here.
+                    tracing::error!(
+                        "layer load failed, assuming permanent failure: {}: {err:?}",
+                        owner.path
+                    );
                    Err(err)
                }
            }
@@ -1882,6 +2026,218 @@ impl From<ResidentLayer> for Layer {
    }
 }

+use metrics::IntCounter;
+
+pub(crate) struct LayerImplMetrics {
+    started_evictions: IntCounter,
+    completed_evictions: IntCounter,
+    cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,
+
+    started_deletes: IntCounter,
+    completed_deletes: IntCounter,
+    failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,
+
+    rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
+    inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
+    redownload_after: metrics::Histogram,
+    time_to_evict: metrics::Histogram,
+}
+
+impl Default for LayerImplMetrics {
+    fn default() -> Self {
+        use enum_map::Enum;
+
+        // reminder: these will be pageserver_layer_* with "_total" suffix
+
+        let started_evictions = metrics::register_int_counter!(
+            "pageserver_layer_started_evictions",
+            "Evictions started in the Layer implementation"
+        )
+        .unwrap();
+        let completed_evictions = metrics::register_int_counter!(
+            "pageserver_layer_completed_evictions",
+            "Evictions completed in the Layer implementation"
+        )
+        .unwrap();
+
+        let cancelled_evictions = metrics::register_int_counter_vec!(
+            "pageserver_layer_cancelled_evictions_count",
+            "Different reasons for evictions to have been cancelled or failed",
+            &["reason"]
+        )
+        .unwrap();
+
+        let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+            let reason = EvictionCancelled::from_usize(i);
+            let s = reason.as_str();
+            cancelled_evictions.with_label_values(&[s])
+        }));
+
+        let started_deletes = metrics::register_int_counter!(
+            "pageserver_layer_started_deletes",
+            "Deletions on drop pending in the Layer implementation"
+        )
+        .unwrap();
+        let completed_deletes = metrics::register_int_counter!(
+            "pageserver_layer_completed_deletes",
+            "Deletions on drop completed in the Layer implementation"
+        )
+        .unwrap();
+
+        let failed_deletes = metrics::register_int_counter_vec!(
+            "pageserver_layer_failed_deletes_count",
+            "Different reasons for deletions on drop to have failed",
+            &["reason"]
+        )
+        .unwrap();
+
+        let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+            let reason = DeleteFailed::from_usize(i);
+            let s = reason.as_str();
+            failed_deletes.with_label_values(&[s])
+        }));
+
+        let rare_counters = metrics::register_int_counter_vec!(
+            "pageserver_layer_assumed_rare_count",
+            "Times unexpected or assumed rare event happened",
+            &["event"]
+        )
+        .unwrap();
+
+        let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+            let event = RareEvent::from_usize(i);
+            let s = event.as_str();
+            rare_counters.with_label_values(&[s])
+        }));
+
+        let inits_cancelled = metrics::register_int_counter!(
+            "pageserver_layer_inits_cancelled_count",
+            "Times Layer initialization was cancelled",
+        )
+        .unwrap();
+
+        let redownload_after = {
+            let minute = 60.0;
+            let hour = 60.0 * minute;
+            metrics::register_histogram!(
+                "pageserver_layer_redownloaded_after",
+                "Time between evicting and re-downloading.",
+                vec![
+                    10.0,
+                    30.0,
+                    minute,
+                    5.0 * minute,
+                    15.0 * minute,
+                    30.0 * minute,
+                    hour,
+                    12.0 * hour,
+                ]
+            )
+            .unwrap()
+        };
+
+        let time_to_evict = metrics::register_histogram!(
+            "pageserver_layer_eviction_held_permit_seconds",
+            "Time eviction held the permit.",
+            vec![0.001, 0.010, 0.100, 0.500, 1.000, 5.000]
+        )
+        .unwrap();
+
+        Self {
+            started_evictions,
+            completed_evictions,
+            cancelled_evictions,
+
+            started_deletes,
+            completed_deletes,
+            failed_deletes,
+
+            rare_counters,
+            inits_cancelled,
+            redownload_after,
+            time_to_evict,
+        }
+    }
+}
+
+impl LayerImplMetrics {
+    fn inc_started_evictions(&self) {
+        self.started_evictions.inc();
+    }
+    fn inc_completed_evictions(&self) {
+        self.completed_evictions.inc();
+    }
+    fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
+        self.cancelled_evictions[reason].inc()
+    }
+
+    fn inc_started_deletes(&self) {
+        self.started_deletes.inc();
+    }
+    fn inc_completed_deletes(&self) {
+        self.completed_deletes.inc();
+    }
+    fn inc_deletes_failed(&self, reason: DeleteFailed) {
+        self.failed_deletes[reason].inc();
+    }
+
+    /// Counted separatedly from failed layer deletes because we will complete the layer deletion
+    /// attempt regardless of failure to delete local file.
+    fn inc_delete_removes_failed(&self) {
+        self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
+    }
+
+    /// Expected rare just as cancellations are rare, but we could have cancellations separate from
+    /// the single caller which can start the download, so use this counter to separte them.
+    fn inc_init_completed_without_requester(&self) {
+        self.rare_counters[RareEvent::InitCompletedWithoutRequester].inc();
+    }
+
+    /// Expected rare because cancellations are unexpected, and failures are unexpected
+    fn inc_download_failed_without_requester(&self) {
+        self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
+    }
+
+    /// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
+    ///
+    /// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
+    /// Option.
+    fn inc_raced_wanted_evicted_accesses(&self) {
+        self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
+    }
+
+    /// These are only expected for [`Self::inc_init_cancelled`] amount when
+    /// running with remote storage.
+    fn inc_init_needed_no_download(&self) {
+        self.rare_counters[RareEvent::InitWithoutDownload].inc();
+    }
+
+    /// Expected rare because all layer files should be readable and good
+    fn inc_permanent_loading_failures(&self) {
+        self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
+    }
+
+    fn inc_init_cancelled(&self) {
+        self.inits_cancelled.inc()
+    }
+
+    fn record_redownloaded_after(&self, duration: std::time::Duration) {
+        self.redownload_after.observe(duration.as_secs_f64())
+    }
+
+    /// This would be bad if it ever happened, or mean extreme disk pressure. We should probably
+    /// instead cancel eviction if we would have read waiters. We cannot however separate reads
+    /// from other evictions, so this could have noise as well.
+    fn inc_evicted_with_waiters(&self) {
+        self.rare_counters[RareEvent::EvictedWithWaiters].inc();
+    }
+
+    /// Recorded at least initially as the permit is now acquired in async context before
+    /// spawn_blocking action.
+    fn record_time_to_evict(&self, duration: std::time::Duration) {
+        self.time_to_evict.observe(duration.as_secs_f64())
+    }
+}

 #[derive(Debug, Clone, Copy, enum_map::Enum)]
 enum EvictionCancelled {
@@ -1898,6 +2254,21 @@ enum EvictionCancelled {
    UnexpectedEvictedState,
 }

+impl EvictionCancelled {
+    fn as_str(&self) -> &'static str {
+        match self {
+            EvictionCancelled::LayerGone => "layer_gone",
+            EvictionCancelled::TimelineGone => "timeline_gone",
+            EvictionCancelled::VersionCheckFailed => "version_check_fail",
+            EvictionCancelled::FileNotFound => "file_not_found",
+            EvictionCancelled::RemoveFailed => "remove_failed",
+            EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
+            EvictionCancelled::LostToDownload => "lost_to_download",
+            EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
+            EvictionCancelled::UnexpectedEvictedState => "unexpected_evicted_state",
+        }
+    }
+}

 #[derive(enum_map::Enum)]
 enum DeleteFailed {
@@ -1905,6 +2276,15 @@ enum DeleteFailed {
    DeleteSchedulingFailed,
 }

+impl DeleteFailed {
+    fn as_str(&self) -> &'static str {
+        match self {
+            DeleteFailed::TimelineGone => "timeline_gone",
+            DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
+        }
+    }
+}
+
 #[derive(enum_map::Enum)]
 enum RareEvent {
    RemoveOnDropFailed,
@@ -1916,3 +2296,21 @@ enum RareEvent {
    EvictedWithWaiters,
 }

+impl RareEvent {
+    fn as_str(&self) -> &'static str {
+        use RareEvent::*;
+
+        match self {
+            RemoveOnDropFailed => "remove_on_drop_failed",
+            InitCompletedWithoutRequester => "init_completed_without",
+            DownloadFailedWithoutRequester => "download_failed_without",
+            UpgradedWantedEvicted => "raced_wanted_evicted",
+            InitWithoutDownload => "init_needed_no_download",
+            PermanentLoadingFailure => "permanent_loading_failure",
+            EvictedWithWaiters => "evicted_with_waiters",
+        }
+    }
+}
+
+pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
+    once_cell::sync::Lazy::new(LayerImplMetrics::default);
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -238,7 +238,7 @@ async fn smoke_test() {
        rtc.get_remote_physical_size(),
        dummy_layer.metadata().file_size
    );
-   
+    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
 }

 /// This test demonstrates a previous hang when a eviction and deletion were requested at the same
@@ -311,6 +311,11 @@ async fn evict_and_wait_on_wanted_deleted() {

    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;

+    assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get());
+    assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get());
+    assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
+    assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
+    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
 }

 /// This test ensures we are able to read the layer while the layer eviction has been
@@ -361,7 +366,7 @@ fn read_wins_pending_eviction() {
        tokio::time::timeout(ADVANCE, &mut evict_and_wait)
            .await
            .expect_err("should had been a timeout since we are holding the layer resident");
-       
+        assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());

        let (completion, barrier) = utils::completion::channel();
        let (arrival, arrived_at_barrier) = utils::completion::channel();
@@ -393,7 +398,18 @@ fn read_wins_pending_eviction() {

        // works as intended: evictions lose to "downloads"
        assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
-        
+        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
+
+        // this is not wrong: the eviction is technically still "on the way" as it's still queued
+        // because of a failpoint
+        assert_eq!(
+            0,
+            LAYER_IMPL_METRICS
+                .cancelled_evictions
+                .values()
+                .map(|ctr| ctr.get())
+                .sum::<u64>()
+        );

        drop(completion);

@@ -401,9 +417,26 @@ fn read_wins_pending_eviction() {
        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1)
            .await;

-        
+        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());

-        
+        // now we finally can observe the original eviction failing
+        // it would had been possible to observe it earlier, but here it is guaranteed to have
+        // happened.
+        assert_eq!(
+            1,
+            LAYER_IMPL_METRICS
+                .cancelled_evictions
+                .values()
+                .map(|ctr| ctr.get())
+                .sum::<u64>()
+        );
+
+        assert_eq!(
+            1,
+            LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::AlreadyReinitialized].get()
+        );
+
+        assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
    });
 }

@@ -466,7 +499,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
        tokio::time::timeout(ADVANCE, &mut evict_and_wait)
            .await
            .expect_err("should had been a timeout since we are holding the layer resident");
-        
+        assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());

        let (completion1, barrier) = utils::completion::channel();
        let mut completion1 = Some(completion1);
@@ -501,9 +534,20 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {

        // works as intended: evictions lose to "downloads"
        assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
-       
+        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());

-       
+        // this is not wrong: the eviction is technically still "on the way" as it's still queued
+        // because of a failpoint
+        assert_eq!(
+            0,
+            LAYER_IMPL_METRICS
+                .cancelled_evictions
+                .values()
+                .map(|ctr| ctr.get())
+                .sum::<u64>()
+        );
+
+        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());

        // configure another failpoint for the second eviction -- evictions are per initialization,
        // so now that we've reinitialized the inner, we get to run two of them at the same time.
@@ -523,10 +567,13 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {

        arrived_at_barrier.wait().await;

-       
+        assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());

-        let mut release_earlier_eviction = |_expected_reason| {
-            
+        let mut release_earlier_eviction = |expected_reason| {
+            assert_eq!(
+                0,
+                LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
+            );

            drop(completion1.take().unwrap());

@@ -539,7 +586,10 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
                )
                .await;

-                
+                assert_eq!(
+                    1,
+                    LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
+                );
            }
        };

@@ -562,7 +612,19 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
            .expect("eviction goes through now that spawn_blocking is unclogged")
            .expect("eviction should succeed, because version matches");

-       
+        assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
+
+        // ensure the cancelled are unchanged
+        assert_eq!(
+            1,
+            LAYER_IMPL_METRICS
+                .cancelled_evictions
+                .values()
+                .map(|ctr| ctr.get())
+                .sum::<u64>()
+        );
+
+        assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
    });
 }

@@ -652,7 +714,8 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
        .unwrap_err();
    assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}");

-    
+    // failpoint is not counted as cancellation either
+    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
 }

 #[tokio::test(start_paused = true)]
@@ -829,7 +892,8 @@ async fn eviction_cancellation_on_drop() {
                .expect_err("should had been a timeout since we are holding the layer resident");
        }

-        
+        // 1 == we only evict one of the layers
+        assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());

        drop(resident);

@@ -838,7 +902,10 @@ async fn eviction_cancellation_on_drop() {

        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;

-        
+        assert_eq!(
+            1,
+            LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::LayerGone].get()
+        );
    }
 }

--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -10,7 +10,7 @@ use std::time::{Duration, Instant};
 use once_cell::sync::Lazy;
 use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD;
 use rand::Rng;
-
+use scopeguard::defer;
 use tokio::sync::{Semaphore, SemaphorePermit};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -19,6 +19,7 @@ use utils::completion::Barrier;
 use utils::pausable_failpoint;

 use crate::context::{DownloadBehavior, RequestContext};
+use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
 use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind};
 use crate::tenant::throttle::Stats;
 use crate::tenant::timeline::CompactionError;
@@ -84,15 +85,17 @@ pub(crate) enum BackgroundLoopKind {
    SecondaryDownload,
 }

-pub struct BackgroundLoopSemaphorePermit {
+pub struct BackgroundLoopSemaphorePermit<'a> {
    _permit: SemaphorePermit<'static>,
+    _recorder: BackgroundLoopSemaphoreMetricsRecorder<'a>,
 }

 /// Acquires a semaphore permit, to limit concurrent background jobs.
 pub(crate) async fn acquire_concurrency_permit(
    loop_kind: BackgroundLoopKind,
    _ctx: &RequestContext,
-) -> BackgroundLoopSemaphorePermit {
+) -> BackgroundLoopSemaphorePermit<'static> {
+    let mut recorder = metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind);

    if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation {
        pausable_failpoint!("initial-size-calculation-permit-pause");
@@ -105,9 +108,11 @@ pub(crate) async fn acquire_concurrency_permit(
    };
    let permit = semaphore.acquire().await.expect("should never close");

+    recorder.acquired();

    BackgroundLoopSemaphorePermit {
        _permit: permit,
+        _recorder: recorder,
    }
 }

@@ -130,7 +135,8 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
                    _ = cancel.cancelled() => return Ok(()),
                    _ = Barrier::maybe_wait(can_start) => {}
                };
-
+                TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
+                defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
                compaction_loop(tenant, cancel)
                    // If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py
                    .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
@@ -155,6 +161,8 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
                    _ = cancel.cancelled() => return Ok(()),
                    _ = Barrier::maybe_wait(can_start) => {}
                };
+                TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
+                defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
                gc_loop(tenant, cancel)
                    .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                    .await;
@@ -178,7 +186,8 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
                    _ = cancel.cancelled() => return Ok(()),
                    _ = Barrier::maybe_wait(can_start) => {}
                };
-
+                TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
+                defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
                tenant_housekeeping_loop(tenant, cancel)
                    .instrument(info_span!("tenant_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                    .await;
@@ -583,5 +592,8 @@ pub(crate) fn warn_when_period_overrun(
            ?task,
            "task iteration took longer than the configured period"
        );
+        metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
+            .with_label_values(&[task.into(), &format!("{}", period.as_secs())])
+            .inc();
    }
 }
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -45,8 +45,8 @@ pub struct Stats {
 }

 pub enum ThrottleResult {
-    NotThrottled {  },
-    Throttled {  },
+    NotThrottled { end: Instant },
+    Throttled { end: Instant },
 }

 impl Throttle {
@@ -114,7 +114,7 @@ impl Throttle {
        let inner = self.inner.load_full(); // clones the `Inner` Arc

        if !inner.enabled {
-            return ThrottleResult::NotThrottled { };
+            return ThrottleResult::NotThrottled { end: start };
        }

        self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
@@ -127,9 +127,9 @@ impl Throttle {
            let wait_time = end - start;
            self.sum_throttled_usecs
                .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
-            ThrottleResult::Throttled {  }
+            ThrottleResult::Throttled { end }
        } else {
-            ThrottleResult::NotThrottled { }
+            ThrottleResult::NotThrottled { end: start }
        }
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -96,6 +96,7 @@ use super::{
    AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded,
    debug_assert_current_span_has_tenant_and_timeline_id,
 };
+use crate::aux_file::AuxFileSizeEstimator;
 use crate::config::PageServerConf;
 use crate::context::{
    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
@@ -103,7 +104,10 @@ use crate::context::{
 use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, finite_f32};
 use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::l0_flush::{self, L0FlushGlobalState};
-use crate::metrics::TimelineMetrics;
+use crate::metrics::{
+    DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_AMORTIZED_GLOBAL, LAYERS_PER_READ_BATCH_GLOBAL,
+    LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics,
+};
 use crate::page_service::TenantManagerTypes;
 use crate::pgdatadir_mapping::{
    CalculateLogicalSizeError, CollectKeySpaceError, DirectoryKind, LsnForTimestamp,
@@ -411,6 +415,8 @@ pub struct Timeline {
    /// Cloned from [`super::Tenant::pagestream_throttle`] on construction.
    pub(crate) pagestream_throttle: Arc<crate::tenant::throttle::Throttle>,

+    /// Size estimator for aux file v2
+    pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,

    /// Some test cases directly place keys into the timeline without actually modifying the directory
    /// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that
@@ -1204,20 +1210,20 @@ impl Timeline {
            ctx.task_kind(),
        );

-        // let start = crate::metrics::GET_VECTORED_LATENCY
-        //     .for_task_kind(ctx.task_kind())
-        //     .map(|metric| (metric, Instant::now()));
+        let start = crate::metrics::GET_VECTORED_LATENCY
+            .for_task_kind(ctx.task_kind())
+            .map(|metric| (metric, Instant::now()));

-         self
+        let res = self
            .get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx)
-            .await
+            .await;

-        // if let Some((metric, start)) = start {
-        //     let elapsed = start.elapsed();
-        //     metric.observe(elapsed.as_secs_f64());
-        // }
+        if let Some((metric, start)) = start {
+            let elapsed = start.elapsed();
+            metric.observe(elapsed.as_secs_f64());
+        }

-        
+        res
    }

    /// Scan the keyspace and return all existing key-values in the keyspace. This currently uses vectored
@@ -1256,21 +1262,21 @@ impl Timeline {
            }
        }

-        // let start = crate::metrics::SCAN_LATENCY
-        //     .for_task_kind(ctx.task_kind())
-        //     .map(ScanLatencyOngoingRecording::start_recording);
+        let start = crate::metrics::SCAN_LATENCY
+            .for_task_kind(ctx.task_kind())
+            .map(ScanLatencyOngoingRecording::start_recording);

        let query = VersionedKeySpaceQuery::uniform(keyspace, lsn);

-         self
+        let vectored_res = self
            .get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx)
-            .await
+            .await;

-        // if let Some(recording) = start {
-        //     recording.observe();
-        // }
+        if let Some(recording) = start {
+            recording.observe();
+        }

-       
+        vectored_res
    }

    pub(super) async fn get_vectored_impl(
@@ -1379,7 +1385,7 @@ impl Timeline {
                            return (key, Err(err));
                        }
                    };
-                   
+                    DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64);

                    // The walredo module expects the records to be descending in terms of Lsn.
                    // And we submit the IOs in that order, so, there shuold be no need to sort here.
@@ -1417,21 +1423,42 @@ impl Timeline {
        // when they're missing. Instead they are omitted from the resulting btree
        // (this is a requirement, not a bug). Skip updating the metric in these cases
        // to avoid infinite results.
-        if !results.is_empty() && layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
-            let total_keyspace = query.total_keyspace();
-            let max_request_lsn = query.high_watermark_lsn().expect("Validated previously");
+        if !results.is_empty() {
+            if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
+                let total_keyspace = query.total_keyspace();
+                let max_request_lsn = query.high_watermark_lsn().expect("Validated previously");

-            static LOG_PACER: Lazy<Mutex<RateLimit>> =
-                Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
-            LOG_PACER.lock().unwrap().call(|| {
-                let num_keys = total_keyspace.total_raw_size();
-                let num_pages = results.len();
-                tracing::info!(
-                  shard_id = %self.tenant_shard_id.shard_slug(),
-                  lsn = %max_request_lsn,
-                  "Vectored read for {total_keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.",
-                );
-            });
+                static LOG_PACER: Lazy<Mutex<RateLimit>> =
+                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
+                LOG_PACER.lock().unwrap().call(|| {
+                    let num_keys = total_keyspace.total_raw_size();
+                    let num_pages = results.len();
+                    tracing::info!(
+                      shard_id = %self.tenant_shard_id.shard_slug(),
+                      lsn = %max_request_lsn,
+                      "Vectored read for {total_keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.",
+                    );
+                });
+            }
+
+            // Records the number of layers visited in a few different ways:
+            //
+            // * LAYERS_PER_READ: all layers count towards every read in the batch, because each
+            //   layer directly affects its observed latency.
+            //
+            // * LAYERS_PER_READ_BATCH: all layers count towards each batch, to get the per-batch
+            //   layer visits and access cost.
+            //
+            // * LAYERS_PER_READ_AMORTIZED: the average layer count per read, to get the amortized
+            //   read amplification after batching.
+            let layers_visited = layers_visited as f64;
+            let avg_layers_visited = layers_visited / results.len() as f64;
+            LAYERS_PER_READ_BATCH_GLOBAL.observe(layers_visited);
+            for _ in &results {
+                self.metrics.layers_per_read.observe(layers_visited);
+                LAYERS_PER_READ_GLOBAL.observe(layers_visited);
+                LAYERS_PER_READ_AMORTIZED_GLOBAL.observe(avg_layers_visited);
+            }
        }

        Ok(results)
@@ -1483,6 +1510,9 @@ impl Timeline {
        guard.layer_size_sum()
    }

+    pub(crate) fn resident_physical_size(&self) -> u64 {
+        self.metrics.resident_physical_size_get()
+    }

    pub(crate) fn get_directory_metrics(&self) -> [u64; DirectoryKind::KINDS_NUM] {
        array::from_fn(|idx| self.directory_metrics[idx].load(AtomicOrdering::Relaxed))
@@ -1545,6 +1575,8 @@ impl Timeline {
            WaitLsnTimeout::Default => self.conf.wait_lsn_timeout,
        };

+        let timer = crate::metrics::WAIT_LSN_TIME.start_timer();
+        let start_finish_counterpair_guard = self.metrics.wait_lsn_start_finish_counterpair.guard();

        let wait_for_timeout = self.last_record_lsn.wait_for_timeout(lsn, timeout);
        let wait_for_timeout = std::pin::pin!(wait_for_timeout);
@@ -1561,8 +1593,11 @@ impl Timeline {
                 ready,
                 is_slow,
                 elapsed_total,
-                 elapsed_since_last_callback: _,
+                 elapsed_since_last_callback,
             }| {
+                self.metrics
+                    .wait_lsn_in_progress_micros
+                    .inc_by(u64::try_from(elapsed_since_last_callback.as_micros()).unwrap());
                if !is_slow {
                    return;
                }
@@ -1592,6 +1627,8 @@ impl Timeline {
        let res = wait_for_timeout.await;
        // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
        drop(logging_permit);
+        drop(start_finish_counterpair_guard);
+        drop(timer);
        match res {
            Ok(()) => Ok(()),
            Err(e) => {
@@ -2453,12 +2490,11 @@ impl Timeline {
        tenant_conf.is_gc_blocked_by_lsn_lease_deadline()
    }

-    pub(crate) fn get_lazy_slru_download(&self) -> bool {
+    pub(crate) fn get_lazy_slru_download(&self, lazy_slru_download_enabled_by_cp: bool) -> bool {
        let tenant_conf = self.tenant_conf.load();
-        tenant_conf
-            .tenant_conf
-            .lazy_slru_download
-            .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
+        tenant_conf.tenant_conf.lazy_slru_download.unwrap_or(
+            lazy_slru_download_enabled_by_cp || self.conf.default_tenant_conf.lazy_slru_download,
+        )
    }

    /// Checks if a get page request should get perf tracing
@@ -2665,14 +2701,6 @@ impl Timeline {
            .clone()
    }

-    pub fn get_compaction_shard_ancestor(&self) -> bool {
-        let tenant_conf = self.tenant_conf.load();
-        tenant_conf
-            .tenant_conf
-            .compaction_shard_ancestor
-            .unwrap_or(self.conf.default_tenant_conf.compaction_shard_ancestor)
-    }
-
    fn get_eviction_policy(&self) -> EvictionPolicy {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
@@ -2681,6 +2709,15 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.eviction_policy)
    }

+    fn get_evictions_low_residence_duration_metric_threshold(
+        tenant_conf: &pageserver_api::models::TenantConfig,
+        default_tenant_conf: &pageserver_api::config::TenantConfigToml,
+    ) -> Duration {
+        tenant_conf
+            .evictions_low_residence_duration_metric_threshold
+            .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
+    }
+
    fn get_image_layer_creation_check_threshold(&self) -> u8 {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
@@ -2756,8 +2793,28 @@ impl Timeline {

        // The threshold is embedded in the metric. So, we need to update it.
        {
+            let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
+                &new_conf.tenant_conf,
+                &self.conf.default_tenant_conf,
+            );
+
+            let tenant_id_str = self.tenant_shard_id.tenant_id.to_string();
+            let shard_id_str = format!("{}", self.tenant_shard_id.shard_slug());
+
+            let timeline_id_str = self.timeline_id.to_string();
+
            self.remote_client.update_config(&new_conf.location);

+            self.metrics
+                .evictions_with_low_residence_duration
+                .write()
+                .unwrap()
+                .change_threshold(
+                    &tenant_id_str,
+                    &shard_id_str,
+                    &timeline_id_str,
+                    new_threshold,
+                );
        }
    }

@@ -2791,6 +2848,13 @@ impl Timeline {
        let (layer_flush_start_tx, _) = tokio::sync::watch::channel((0, disk_consistent_lsn));
        let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));

+        let evictions_low_residence_duration_metric_threshold = {
+            let loaded_tenant_conf = tenant_conf.load();
+            Self::get_evictions_low_residence_duration_metric_threshold(
+                &loaded_tenant_conf.tenant_conf,
+                &conf.default_tenant_conf,
+            )
+        };

        if let Some(ancestor) = &ancestor {
            let mut ancestor_gc_info = ancestor.gc_info.write().unwrap();
@@ -2803,7 +2867,12 @@ impl Timeline {
            let metrics = Arc::new(TimelineMetrics::new(
                &tenant_shard_id,
                &timeline_id,
+                crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
+                    "mtime",
+                    evictions_low_residence_duration_metric_threshold,
+                ),
            ));
+            let aux_file_metrics = metrics.aux_file_size_gauge.clone();

            let mut result = Timeline {
                conf,
@@ -2911,6 +2980,8 @@ impl Timeline {

                pagestream_throttle: resources.pagestream_throttle,

+                aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),
+
                #[cfg(test)]
                extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),

@@ -2936,6 +3007,10 @@ impl Timeline {
            result.repartition_threshold =
                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;

+            result
+                .metrics
+                .last_record_lsn_gauge
+                .set(disk_consistent_lsn.0 as i64);
            result
        })
    }
@@ -3091,6 +3166,8 @@ impl Timeline {

        let mut guard = self.layers.write().await;

+        let timer = self.metrics.load_layer_map_histo.start_timer();
+
        // Scan timeline directory and create ImageLayerName and DeltaFilename
        // structs representing all files on disk
        let timeline_path = self
@@ -3251,6 +3328,7 @@ impl Timeline {
            num_layers, disk_consistent_lsn, total_physical_size
        );

+        timer.stop_and_record();
        Ok(())
    }

@@ -3318,7 +3396,7 @@ impl Timeline {

        if let CurrentLogicalSize::Approximate(_) = &current_size {
            if ctx.task_kind() == TaskKind::WalReceiverConnectionHandler {
-                let _= self
+                let first = self
                    .current_logical_size
                    .did_return_approximate_to_walreceiver
                    .compare_exchange(
@@ -3326,8 +3404,11 @@ impl Timeline {
                        true,
                        AtomicOrdering::Relaxed,
                        AtomicOrdering::Relaxed,
-                    ).is_ok();
-                
+                    )
+                    .is_ok();
+                if first {
+                    crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE.inc();
+                }
            }
        }

@@ -3399,7 +3480,7 @@ impl Timeline {
            self.current_logical_size.initialized.add_permits(1);
        }

-        let try_once = |_attempt: usize| {
+        let try_once = |attempt: usize| {
            let background_ctx = &background_ctx;
            let self_ref = &self;
            let skip_concurrency_limiter = &skip_concurrency_limiter;
@@ -3410,7 +3491,7 @@ impl Timeline {
                );

                use crate::metrics::initial_logical_size::StartCircumstances;
-                let (_maybe_permit, _circumstances) = tokio::select! {
+                let (_maybe_permit, circumstances) = tokio::select! {
                    permit = wait_for_permit => {
                        (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit)
                    }
@@ -3427,6 +3508,12 @@ impl Timeline {
                    }
                };

+                let metrics_guard = if attempt == 1 {
+                    crate::metrics::initial_logical_size::START_CALCULATION.first(circumstances)
+                } else {
+                    crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances)
+                };
+
                let io_concurrency = IoConcurrency::spawn_from_conf(
                    self_ref.conf,
                    self_ref
@@ -3453,7 +3540,7 @@ impl Timeline {

                // TODO: add aux file size to logical size

-                Ok(calculated_size)
+                Ok((calculated_size, metrics_guard))
            }
        };

@@ -3490,14 +3577,27 @@ impl Timeline {
            }
        };

-        let calculated_size = match retrying.await {
+        let (calculated_size, metrics_guard) = match retrying.await {
            ControlFlow::Continue(calculated_size) => calculated_size,
            ControlFlow::Break(()) => return,
        };

+        // we cannot query current_logical_size.current_size() to know the current
+        // *negative* value, only truncated to u64.
+        let added = self
+            .current_logical_size
+            .size_added_after_initial
+            .load(AtomicOrdering::Relaxed);
+
+        let sum = calculated_size.saturating_add_signed(added);
+
+        // set the gauge value before it can be set in `update_current_logical_size`.
+        self.metrics.current_logical_size_gauge.set(sum);
+
        self.current_logical_size
            .initial_logical_size
-            .set((calculated_size,))
+            .set((calculated_size, metrics_guard.calculation_result_saved()))
+            .ok()
            .expect("only this task sets it");
    }

@@ -3562,7 +3662,7 @@ impl Timeline {
    async fn calculate_logical_size(
        &self,
        up_to_lsn: Lsn,
-        _cause: LogicalSizeCalculationCause,
+        cause: LogicalSizeCalculationCause,
        _guard: &GateGuard,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
@@ -3581,13 +3681,20 @@ impl Timeline {
        if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) {
            return Ok(size);
        }
-        
-       
+        let storage_time_metrics = match cause {
+            LogicalSizeCalculationCause::Initial
+            | LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize
+            | LogicalSizeCalculationCause::TenantSizeHandler => &self.metrics.logical_size_histo,
+            LogicalSizeCalculationCause::EvictionTaskImitation => {
+                &self.metrics.imitate_logical_size_histo
+            }
+        };
+        let timer = storage_time_metrics.start_timer();
        let logical_size = self
            .get_current_logical_size_non_incremental(up_to_lsn, ctx)
            .await?;
        debug!("calculated logical size: {logical_size}");
-      
+        timer.stop_and_record();
        Ok(logical_size)
    }

@@ -3596,6 +3703,21 @@ impl Timeline {
        let logical_size = &self.current_logical_size;
        logical_size.increment_size(delta);

+        // Also set the value in the prometheus gauge. Note that
+        // there is a race condition here: if this is is called by two
+        // threads concurrently, the prometheus gauge might be set to
+        // one value while current_logical_size is set to the
+        // other.
+        match logical_size.current_size() {
+            CurrentLogicalSize::Exact(ref new_current_size) => self
+                .metrics
+                .current_logical_size_gauge
+                .set(new_current_size.into()),
+            CurrentLogicalSize::Approximate(_) => {
+                // don't update the gauge yet, this allows us not to update the gauge back and
+                // forth between the initial size calculation task.
+            }
+        }
    }

    pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: MetricsUpdate) {
@@ -3633,8 +3755,26 @@ impl Timeline {
            }
        };

-        
-        
+        // TODO: remove this, there's no place in the code that updates this aux metrics.
+        let aux_metric =
+            self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed);
+
+        let sum_of_entries = self
+            .directory_metrics
+            .iter()
+            .map(|v| v.load(AtomicOrdering::Relaxed))
+            .sum();
+        // Set a high general threshold and a lower threshold for the auxiliary files,
+        // as we can have large numbers of relations in the db directory.
+        const SUM_THRESHOLD: u64 = 5000;
+        const AUX_THRESHOLD: u64 = 1000;
+        if sum_of_entries >= SUM_THRESHOLD || aux_metric >= AUX_THRESHOLD {
+            self.metrics
+                .directory_entries_count_gauge
+                .set(sum_of_entries);
+        } else if let Some(metric) = Lazy::get(&self.metrics.directory_entries_count_gauge) {
+            metric.set(sum_of_entries);
+        }
    }

    async fn find_layer(
@@ -4354,6 +4494,8 @@ impl Timeline {

    pub(crate) fn finish_write(&self, new_lsn: Lsn) {
        assert!(new_lsn.is_aligned());
+
+        self.metrics.last_record_lsn_gauge.set(new_lsn.0 as i64);
        self.last_record_lsn.advance(new_lsn);
    }

@@ -4473,10 +4615,17 @@ impl Timeline {
                            "stalling layer flushes for compaction backpressure at {l0_count} \
                            L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)"
                        );
-                        
+                        let stall_timer = self
+                            .metrics
+                            .flush_delay_histo
+                            .start_timer()
+                            .record_on_drop();
                        tokio::select! {
-                            _result = watch_l0.wait_for(|l0| *l0 < stall_threshold) => {
-                               
+                            result = watch_l0.wait_for(|l0| *l0 < stall_threshold) => {
+                                if let Ok(l0) = result.as_deref() {
+                                    let delay = stall_timer.elapsed().as_secs_f64();
+                                    info!("resuming layer flushes at {l0} L0 layers after {delay:.3}s");
+                                }
                            },
                            _ = self.cancel.cancelled() => {},
                        }
@@ -4485,7 +4634,7 @@ impl Timeline {
                }

                // Flush the layer.
-                let flush_timer = Instant::now();
+                let flush_timer = self.metrics.flush_time_histo.start_timer();
                match self.flush_frozen_layer(layer, ctx).await {
                    Ok(layer_lsn) => flushed_to_lsn = max(flushed_to_lsn, layer_lsn),
                    Err(FlushLayerError::Cancelled) => {
@@ -4501,7 +4650,7 @@ impl Timeline {
                        break err.map(|_| ());
                    }
                }
-                let flush_duration = flush_timer.elapsed();
+                let flush_duration = flush_timer.stop_and_record();

                // Notify the tenant compaction loop if L0 compaction is needed.
                let l0_count = *watch_l0.borrow();
@@ -4519,7 +4668,11 @@ impl Timeline {
                            "delaying layer flush by {delay:.3}s for compaction backpressure at \
                            {l0_count} L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)"
                        );
-                        
+                        let _delay_timer = self
+                            .metrics
+                            .flush_delay_histo
+                            .start_timer()
+                            .record_on_drop();
                        tokio::select! {
                            _ = tokio::time::sleep(flush_duration) => {},
                            _ = watch_l0.wait_for(|l0| *l0 < delay_threshold) => {},
@@ -4761,6 +4914,9 @@ impl Timeline {
            "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}"
        );

+        self.metrics
+            .disk_consistent_lsn_gauge
+            .set(new_value.0 as i64);
        new_value != old_value
    }

@@ -5246,7 +5402,7 @@ impl Timeline {
        last_status: LastImageLayerCreationStatus,
        yield_for_l0: bool,
    ) -> Result<(Vec<ResidentLayer>, LastImageLayerCreationStatus), CreateImageLayersError> {
-        
+        let timer = self.metrics.create_images_time_histo.start_timer();

        if partitioning.parts.is_empty() {
            warn!("no partitions to create image layers for");
@@ -5279,8 +5435,8 @@ impl Timeline {

        let mut all_generated = true;

-        
-        let total_partitions = partitioning.parts.len();
+        let mut partition_processed = 0;
+        let mut total_partitions = partitioning.parts.len();
        let mut last_partition_processed = None;
        let mut partition_parts = partitioning.parts.clone();

@@ -5305,7 +5461,7 @@ impl Timeline {
                        break; // with found=false
                    }
                    partition_parts = partition_parts.split_off(i + 1); // Remove the first i + 1 elements
-                
+                    total_partitions = partition_parts.len();
                    // Update the start key to the partition start.
                    start = partition_parts[0].start().unwrap();
                    found = true;
@@ -5322,6 +5478,7 @@ impl Timeline {
            if self.cancel.is_cancelled() {
                return Err(CreateImageLayersError::Cancelled);
            }
+            partition_processed += 1;
            let img_range = start..partition.ranges.last().unwrap().end;
            let compact_metadata = partition.overlaps(&Key::metadata_key_range());
            if compact_metadata {
@@ -5484,16 +5641,28 @@ impl Timeline {
            .open_mut()?
            .track_new_image_layers(&image_layers, &self.metrics);
        drop_wlock(guard);
-        
+        let duration = timer.stop_and_record();

        // Creating image layers may have caused some previously visible layers to be covered
        if !image_layers.is_empty() {
            self.update_layer_visibility().await?;
        }

-   
+        let total_layer_size = image_layers
+            .iter()
+            .map(|l| l.metadata().file_size)
+            .sum::<u64>();

-        
+        if !image_layers.is_empty() {
+            info!(
+                "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions",
+                image_layers.len(),
+                total_layer_size,
+                duration.as_secs_f64(),
+                partition_processed,
+                total_partitions
+            );
+        }

        Ok((
            image_layers,
@@ -6028,7 +6197,11 @@ impl Timeline {
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> Result<GcCutoffs, PageReconstructError> {
-        
+        let _timer = self
+            .metrics
+            .find_gc_cutoffs_histo
+            .start_timer()
+            .record_on_drop();

        pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");

@@ -6095,7 +6268,7 @@ impl Timeline {
            guard = self.gc_lock.lock() => guard,
            _ = self.cancel.cancelled() => return Ok(GcResult::default()),
        };
-        
+        let timer = self.metrics.garbage_collect_histo.start_timer();

        fail_point!("before-timeline-gc");

@@ -6152,7 +6325,9 @@ impl Timeline {
        // It is an easy way to unset it when standby disappears without adding
        // more conf options.
        self.standby_horizon.store(Lsn::INVALID);
-       
+        self.metrics
+            .standby_horizon_gauge
+            .set(Lsn::INVALID.0 as i64);

        let res = self
            .gc_timeline(
@@ -6167,7 +6342,8 @@ impl Timeline {
            )
            .await?;

-       
+        // only record successes
+        timer.stop_and_record();

        Ok(res)
    }
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1133,15 +1133,16 @@ impl Timeline {

        // 1. L0 Compact
        let l0_outcome = {
-            
-            self
+            let timer = self.metrics.compact_time_histo.start_timer();
+            let l0_outcome = self
                .compact_level0(
                    target_file_size,
                    options.flags.contains(CompactFlags::ForceL0Compaction),
                    ctx,
                )
-                .await?
-        
+                .await?;
+            timer.stop_and_record();
+            l0_outcome
        };

        if options.flags.contains(CompactFlags::OnlyL0Compaction) {
@@ -1238,7 +1239,8 @@ impl Timeline {
        let partition_count = self.partitioning.read().0.0.parts.len();

        // 4. Shard ancestor compaction
-        if self.get_compaction_shard_ancestor() && self.shard_identity.count >= ShardCount::new(2) {
+
+        if self.shard_identity.count >= ShardCount::new(2) {
            // Limit the number of layer rewrites to the number of partitions: this means its
            // runtime should be comparable to a full round of image layer creations, rather than
            // being potentially much longer.
@@ -4040,7 +4042,7 @@ impl TimelineAdaptor {
        key_range: &Range<Key>,
        ctx: &RequestContext,
    ) -> Result<(), CreateImageLayersError> {
-       
+        let timer = self.timeline.metrics.create_images_time_histo.start_timer();

        let image_layer_writer = ImageLayerWriter::new(
            self.timeline.conf,
@@ -4086,7 +4088,7 @@ impl TimelineAdaptor {
            self.new_images.push(image_layer);
        }

-        
+        timer.stop_and_record();

        Ok(())
    }
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -126,7 +126,7 @@ impl Timeline {
    ) -> ControlFlow<(), Instant> {
        debug!("eviction iteration: {policy:?}");
        let start = Instant::now();
-        let (period, _) = match policy {
+        let (period, threshold) = match policy {
            EvictionPolicy::NoEviction => {
                // check again in 10 seconds; XXX config watch mechanism
                return ControlFlow::Continue(Instant::now() + Duration::from_secs(10));
@@ -159,6 +159,16 @@ impl Timeline {
            period,
            BackgroundLoopKind::Eviction,
        );
+        // FIXME: if we were to mix policies on a pageserver, we would have no way to sense this. I
+        // don't think that is a relevant fear however, and regardless the imitation should be the
+        // most costly part.
+        crate::metrics::EVICTION_ITERATION_DURATION
+            .get_metric_with_label_values(&[
+                &format!("{}", period.as_secs()),
+                &format!("{}", threshold.as_secs()),
+            ])
+            .unwrap()
+            .observe(elapsed.as_secs_f64());

        ControlFlow::Continue(start + period)
    }
@@ -315,7 +325,7 @@ impl Timeline {
        &self,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> ControlFlow<(), BackgroundLoopSemaphorePermit> {
+    ) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> {
        let acquire_permit =
            crate::tenant::tasks::acquire_concurrency_permit(BackgroundLoopKind::Eviction, ctx);

@@ -357,7 +367,7 @@ impl Timeline {
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
        gate: &GateGuard,
-        permit: BackgroundLoopSemaphorePermit,
+        permit: BackgroundLoopSemaphorePermit<'static>,
        ctx: &RequestContext,
    ) -> ControlFlow<()> {
        if !self.tenant_shard_id.is_shard_zero() {
--- a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs
+++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs
@@ -59,7 +59,11 @@ impl HeatmapLayersDownloader {
                    return;
                };

-                
+                tracing::info!(
+                    resident_size=%timeline.resident_physical_size(),
+                    heatmap_layers=%heatmap.all_layers().count(),
+                    "Starting heatmap layers download"
+                );

                let stream = futures::stream::iter(heatmap.all_layers().cloned().filter_map(
                    |layer| {
@@ -89,7 +93,7 @@ impl HeatmapLayersDownloader {
                tokio::select! {
                    _ = stream.collect::<()>() => {
                        tracing::info!(
-                            
+                            resident_size=%timeline.resident_physical_size(),
                            "Heatmap layers download completed"
                        );
                    },
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -290,7 +290,7 @@ impl OpenLayerManager {
        lsn: Lsn,
        last_freeze_at: &AtomicLsn,
        write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
-        _metrics: &TimelineMetrics,
+        metrics: &TimelineMetrics,
    ) -> bool {
        let Lsn(last_record_lsn) = lsn;
        let end_lsn = Lsn(last_record_lsn + 1);
@@ -299,6 +299,10 @@ impl OpenLayerManager {
            let open_layer_rc = Arc::clone(open_layer);
            open_layer.freeze(end_lsn).await;

+            // Increment the frozen layer metrics. This is decremented in `finish_flush_l0_layer()`.
+            // TODO: It would be nicer to do this via `InMemoryLayer::drop()`, but it requires a
+            // reference to the timeline metrics. Other methods use a metrics borrow as well.
+            metrics.inc_frozen_layer(open_layer);

            // The layer is no longer open, update the layer map to reflect this.
            // We will replace it with on-disk historics below.
@@ -330,12 +334,16 @@ impl OpenLayerManager {
    pub(crate) fn track_new_image_layers(
        &mut self,
        image_layers: &[ResidentLayer],
-        _metrics: &TimelineMetrics,
+        metrics: &TimelineMetrics,
    ) {
        let mut updates = self.layer_map.batch_update();
        for layer in image_layers {
            Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);

+            // record these here instead of Layer::finish_creating because otherwise partial
+            // failure with create_image_layers would balloon up the physical size gauge. downside
+            // is that all layers need to be created before metrics are updated.
+            metrics.record_new_file_metrics(layer.layer_desc().file_size);
        }
        updates.flush();
    }
@@ -345,13 +353,14 @@ impl OpenLayerManager {
        &mut self,
        delta_layer: Option<&ResidentLayer>,
        frozen_layer_for_check: &Arc<InMemoryLayer>,
-        _metrics: &TimelineMetrics,
+        metrics: &TimelineMetrics,
    ) {
        let inmem = self
            .layer_map
            .frozen_layers
            .pop_front()
            .expect("there must be a inmem layer to flush");
+        metrics.dec_frozen_layer(&inmem);

        // Only one task may call this function at a time (for this
        // timeline). If two tasks tried to flush the same frozen
@@ -361,6 +370,7 @@ impl OpenLayerManager {
        if let Some(l) = delta_layer {
            let mut updates = self.layer_map.batch_update();
            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
+            metrics.record_new_file_metrics(l.layer_desc().file_size);
            updates.flush();
        }
    }
@@ -370,11 +380,12 @@ impl OpenLayerManager {
        &mut self,
        compact_from: &[Layer],
        compact_to: &[ResidentLayer],
-        _metrics: &TimelineMetrics,
+        metrics: &TimelineMetrics,
    ) {
        let mut updates = self.layer_map.batch_update();
        for l in compact_to {
            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
+            metrics.record_new_file_metrics(l.layer_desc().file_size);
        }
        for l in compact_from {
            Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
@@ -427,7 +438,7 @@ impl OpenLayerManager {
        rewrite_layers: &[(Layer, ResidentLayer)],
        drop_layers: &[Layer],
        add_layers: &[ResidentLayer],
-        _metrics: &TimelineMetrics,
+        metrics: &TimelineMetrics,
    ) {
        let mut updates = self.layer_map.batch_update();
        for (old_layer, new_layer) in rewrite_layers {
@@ -458,12 +469,14 @@ impl OpenLayerManager {
                &mut self.layer_fmgr,
            );

+            metrics.record_new_file_metrics(new_layer.layer_desc().file_size);
        }
        for l in drop_layers {
            Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
        }
        for l in add_layers {
            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
+            metrics.record_new_file_metrics(l.layer_desc().file_size);
        }
        updates.flush();
    }
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -23,6 +23,7 @@ pub(super) struct LogicalSize {
    /// the initial size at a different LSN.
    pub initial_logical_size: OnceCell<(
        u64,
+        crate::metrics::initial_logical_size::FinishedCalculationGuard,
    )>,

    /// Cancellation for the best-effort logical size calculation.
@@ -129,7 +130,11 @@ impl CurrentLogicalSize {
 impl LogicalSize {
    pub(super) fn empty_initial() -> Self {
        Self {
-            initial_logical_size: OnceCell::with_value((0,)),
+            initial_logical_size: OnceCell::with_value((0, {
+                crate::metrics::initial_logical_size::START_CALCULATION
+                    .first(crate::metrics::initial_logical_size::StartCircumstances::EmptyInitial)
+                    .calculation_result_saved()
+            })),
            cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
            initial_part_end: None,
            size_added_after_initial: AtomicI64::new(0),
@@ -154,7 +159,7 @@ impl LogicalSize {
        //                  ^^^ keep this type explicit so that the casts in this function break if
        //                  we change the type.
        match self.initial_logical_size.get() {
-            Some((initial_size, )) => {
+            Some((initial_size, _)) => {
                CurrentLogicalSize::Exact(Exact(initial_size.checked_add_signed(size_increment)
                    .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
                    .unwrap()))
@@ -176,7 +181,7 @@ impl LogicalSize {
    /// available for re-use. This doesn't contain the incremental part.
    pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
        match self.initial_part_end {
-            Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, )| *s),
+            Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, _)| *s),
            _ => None,
        }
    }
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -39,6 +39,10 @@ use utils::postgres_client::{
 use super::walreceiver_connection::{WalConnectionStatus, WalReceiverError};
 use super::{TaskEvent, TaskHandle, TaskStateUpdate, WalReceiverConf};
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::metrics::{
+    WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
+    WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
+};
 use crate::task_mgr::TaskKind;
 use crate::tenant::{Timeline, debug_assert_current_span_has_tenant_and_timeline_id};

@@ -72,6 +76,11 @@ pub(super) async fn connection_manager_loop_step(
        }
    }

+    WALRECEIVER_ACTIVE_MANAGERS.inc();
+    scopeguard::defer! {
+        WALRECEIVER_ACTIVE_MANAGERS.dec();
+    }
+
    let id = TenantTimelineId {
        tenant_id: connection_manager_state.timeline.tenant_shard_id.tenant_id,
        timeline_id: connection_manager_state.timeline.timeline_id,
@@ -517,6 +526,9 @@ impl ConnectionManagerState {

    /// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
    async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
+        WALRECEIVER_SWITCHES
+            .with_label_values(&[new_sk.reason.name()])
+            .inc();

        self.drop_old_connection(true).await;

@@ -719,6 +731,8 @@ impl ConnectionManagerState {
            }
        };

+        WALRECEIVER_BROKER_UPDATES.inc();
+
        trace!(
            "safekeeper info update: standby_horizon(cutoff)={}",
            timeline_update.standby_horizon
@@ -728,6 +742,10 @@ impl ConnectionManagerState {
            self.timeline
                .standby_horizon
                .store(Lsn(timeline_update.standby_horizon));
+            self.timeline
+                .metrics
+                .standby_horizon_gauge
+                .set(timeline_update.standby_horizon as i64);
        }

        let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
@@ -745,6 +763,7 @@ impl ConnectionManagerState {
                %new_safekeeper_id,
                "New SK node was added",
            );
+            WALRECEIVER_CANDIDATES_ADDED.inc();
        }
    }

@@ -1032,6 +1051,7 @@ impl ConnectionManagerState {
                    "Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections"
                );
                self.wal_connection_retries.remove(&node_id);
+                WALRECEIVER_CANDIDATES_REMOVED.inc();
            }
        }
    }
@@ -1058,7 +1078,6 @@ struct NewWalConnectionCandidate {
    safekeeper_id: NodeId,
    wal_source_connconf: PgConnectionConfig,
    availability_zone: Option<String>,
-    #[allow(dead_code)]
    reason: ReconnectReason,
 }

@@ -1087,6 +1106,18 @@ enum ReconnectReason {
    },
 }

+impl ReconnectReason {
+    fn name(&self) -> &str {
+        match self {
+            ReconnectReason::NoExistingConnection => "NoExistingConnection",
+            ReconnectReason::LaggingWal { .. } => "LaggingWal",
+            ReconnectReason::SwitchAvailabilityZone => "SwitchAvailabilityZone",
+            ReconnectReason::NoWalTimeout { .. } => "NoWalTimeout",
+            ReconnectReason::NoKeepAlives { .. } => "NoKeepAlives",
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL;
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -36,6 +36,7 @@ use wal_decoder::wire_format::FromWireFormat;

 use super::TaskStateUpdate;
 use crate::context::RequestContext;
+use crate::metrics::{LIVE_CONNECTIONS, WAL_INGEST, WALRECEIVER_STARTED_CONNECTIONS};
 use crate::pgdatadir_mapping::DatadirModification;
 use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME};
 use crate::tenant::{
@@ -136,7 +137,7 @@ pub(super) async fn handle_walreceiver_connection(
        GateError::GateClosed => WalReceiverError::ClosedGate,
    })?;

-  
+    WALRECEIVER_STARTED_CONNECTIONS.inc();

    // Connect to the database in replication mode.
    info!("connecting to {wal_source_connconf:?}");
@@ -222,6 +223,10 @@ pub(super) async fn handle_walreceiver_connection(
        .instrument(tracing::info_span!("poller")),
    );

+    let _guard = LIVE_CONNECTIONS
+        .with_label_values(&["wal_receiver"])
+        .guard();
+
    let identify = identify_system(&replication_client).await?;
    info!("{identify:?}");

@@ -339,7 +344,7 @@ pub(super) async fn handle_walreceiver_connection(

        let status_update = match replication_message {
            ReplicationMessage::RawInterpretedWalRecords(raw) => {
-               
+                WAL_INGEST.bytes_received.inc_by(raw.data().len() as u64);

                let mut uncommitted_records = 0;

@@ -412,13 +417,21 @@ pub(super) async fn handle_walreceiver_connection(
                    ctx: &RequestContext,
                    uncommitted: &mut u64,
                ) -> anyhow::Result<()> {
-                  
+                    let stats = modification.stats();
                    modification.commit(ctx).await?;
-                  
+                    WAL_INGEST.records_committed.inc_by(*uncommitted);
+                    WAL_INGEST.inc_values_committed(&stats);
                    *uncommitted = 0;
                    Ok(())
                }

+                if !records.is_empty() {
+                    timeline
+                        .metrics
+                        .wal_records_received
+                        .inc_by(records.len() as u64);
+                }
+
                for interpreted in records {
                    if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
                        && uncommitted_records > 0
@@ -428,6 +441,9 @@ pub(super) async fn handle_walreceiver_connection(

                    let local_next_record_lsn = interpreted.next_record_lsn;

+                    if interpreted.is_observed() {
+                        WAL_INGEST.records_observed.inc();
+                    }

                    walingest
                        .ingest_record(interpreted, &mut modification, &ctx)
@@ -499,9 +515,12 @@ pub(super) async fn handle_walreceiver_connection(
                    filtered: &mut u64,
                    ctx: &RequestContext,
                ) -> anyhow::Result<()> {
-                   
+                    let stats = modification.stats();
                    modification.commit(ctx).await?;
-                   
+                    WAL_INGEST
+                        .records_committed
+                        .inc_by(*uncommitted - *filtered);
+                    WAL_INGEST.inc_values_committed(&stats);
                    *uncommitted = 0;
                    *filtered = 0;
                    Ok(())
@@ -515,7 +534,7 @@ pub(super) async fn handle_walreceiver_connection(

                trace!("received XLogData between {startlsn} and {endlsn}");

-            
+                WAL_INGEST.bytes_received.inc_by(data.len() as u64);
                waldecoder.feed_bytes(data);

                {
@@ -557,6 +576,7 @@ pub(super) async fn handle_walreceiver_connection(
                        }

                        // Ingest the records without immediately committing them.
+                        timeline.metrics.wal_records_received.inc();
                        let ingested = walingest
                            .ingest_record(interpreted, &mut modification, &ctx)
                            .await
@@ -572,7 +592,7 @@ pub(super) async fn handle_walreceiver_connection(
                            })?;
                        if !ingested {
                            tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}");
-                          
+                            WAL_INGEST.records_filtered.inc();
                            filtered_records += 1;
                        }

--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -26,7 +26,7 @@ use utils::lsn::Lsn;
 use utils::vec_map::VecMap;

 use crate::context::RequestContext;
-use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, Header};
+use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
 use crate::virtual_file::{self, IoBufferMut, VirtualFile};

 /// Metadata bundled with the start and end offset of a blob.
@@ -111,20 +111,18 @@ impl From<Bytes> for BufView<'_> {
 pub struct VectoredBlob {
    /// Blob metadata.
    pub meta: BlobMeta,
-    /// Header start offset.
-    header_start: usize,
-    /// Data start offset.
-    data_start: usize,
+    /// Start offset.
+    start: usize,
    /// End offset.
    end: usize,
-    /// Compression used on the data, extracted from the header.
+    /// Compression used on the the blob.
    compression_bits: u8,
 }

 impl VectoredBlob {
    /// Reads a decompressed view of the blob.
    pub(crate) async fn read<'a>(&self, buf: &BufView<'a>) -> Result<BufView<'a>, std::io::Error> {
-        let view = buf.view(self.data_start..self.end);
+        let view = buf.view(self.start..self.end);

        match self.compression_bits {
            BYTE_UNCOMPRESSED => Ok(view),
@@ -142,19 +140,13 @@ impl VectoredBlob {
                    std::io::ErrorKind::InvalidData,
                    format!(
                        "Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}",
-                        self.meta.key, self.meta.lsn, self.data_start, self.end
+                        self.meta.key, self.meta.lsn, self.start, self.end
                    ),
                );
                Err(error)
            }
        }
    }
-
-    /// Returns the raw blob including header.
-    #[allow(unused)]
-    pub(crate) fn raw_with_header<'a>(&self, buf: &BufView<'a>) -> BufView<'a> {
-        buf.view(self.header_start..self.end)
-    }
 }

 impl std::fmt::Display for VectoredBlob {
@@ -162,7 +154,7 @@ impl std::fmt::Display for VectoredBlob {
        write!(
            f,
            "{}@{}, {}..{}",
-            self.meta.key, self.meta.lsn, self.data_start, self.end
+            self.meta.key, self.meta.lsn, self.start, self.end
        )
    }
 }
@@ -501,28 +493,50 @@ impl<'a> VectoredBlobReader<'a> {

        let blobs_at = read.blobs_at.as_slice();

-        let mut blobs = Vec::with_capacity(blobs_at.len());
+        let start_offset = read.start;
+
+        let mut metas = Vec::with_capacity(blobs_at.len());
        // Blobs in `read` only provide their starting offset. The end offset
        // of a blob is implicit: the start of the next blob if one exists
        // or the end of the read.

-        for (blob_start, meta) in blobs_at.iter().copied() {
-            let header_start = (blob_start - read.start) as usize;
-            let header = Header::decode(&buf[header_start..])?;
-            let data_start = header_start + header.header_len;
-            let end = data_start + header.data_len;
-            let compression_bits = header.compression_bits;
+        for (blob_start, meta) in blobs_at {
+            let blob_start_in_buf = blob_start - start_offset;
+            let first_len_byte = buf[blob_start_in_buf as usize];

-            blobs.push(VectoredBlob {
-                header_start,
-                data_start,
+            // Each blob is prefixed by a header containing its size and compression information.
+            // Extract the size and skip that header to find the start of the data.
+            // The size can be 1 or 4 bytes. The most significant bit is 0 in the
+            // 1 byte case and 1 in the 4 byte case.
+            let (size_length, blob_size, compression_bits) = if first_len_byte < 0x80 {
+                (1, first_len_byte as u64, BYTE_UNCOMPRESSED)
+            } else {
+                let mut blob_size_buf = [0u8; 4];
+                let offset_in_buf = blob_start_in_buf as usize;
+
+                blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]);
+                blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK;
+
+                let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK;
+                (
+                    4,
+                    u32::from_be_bytes(blob_size_buf) as u64,
+                    compression_bits,
+                )
+            };
+
+            let start = (blob_start_in_buf + size_length) as usize;
+            let end = start + blob_size as usize;
+
+            metas.push(VectoredBlob {
+                start,
                end,
-                meta,
+                meta: *meta,
                compression_bits,
            });
        }

-        Ok(VectoredBlobsBuf { buf, blobs })
+        Ok(VectoredBlobsBuf { buf, blobs: metas })
    }
 }

@@ -983,15 +997,6 @@ mod tests {
                &read_buf[..],
                "mismatch for idx={idx} at offset={offset}"
            );
-
-            // Check that raw_with_header returns a valid header.
-            let raw = read_blob.raw_with_header(&view);
-            let header = Header::decode(&raw)?;
-            if !compression || header.header_len == 1 {
-                assert_eq!(header.compression_bits, BYTE_UNCOMPRESSED);
-            }
-            assert_eq!(raw.len(), header.total_len());
-
            buf = result.buf;
        }
        Ok(())
--- a/pageserver/src/utilization.rs
+++ b/pageserver/src/utilization.rs
@@ -10,7 +10,7 @@ use pageserver_api::models::PageserverUtilization;
 use utils::serde_percent::Percent;

 use crate::config::PageServerConf;
-
+use crate::metrics::NODE_UTILIZATION_SCORE;
 use crate::tenant::mgr::TenantManager;

 pub(crate) fn regenerate(
@@ -53,7 +53,7 @@ pub(crate) fn regenerate(
    // Express a static value for how many shards we may schedule on one node
    const MAX_SHARDS: u32 = 5000;

-    let doc = PageserverUtilization {
+    let mut doc = PageserverUtilization {
        disk_usage_bytes: used,
        free_space_bytes: free,
        disk_wanted_bytes,
@@ -63,7 +63,10 @@ pub(crate) fn regenerate(
        utilization_score: None,
        captured_at: utils::serde_system_time::SystemTime(captured_at),
    };
-    
+
+    // Initialize `PageserverUtilization::utilization_score`
+    let score = doc.cached_score();
+    NODE_UTILIZATION_SCORE.set(score);

    Ok(doc)
 }
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -27,9 +27,12 @@ use owned_buffers_io::io_buf_ext::FullSlice;
 use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 pub use pageserver_api::models::virtual_file as api;
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
+use tokio::time::Instant;
 use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};

+use crate::assert_u64_eq_usize::UsizeIsU64;
 use crate::context::RequestContext;
+use crate::metrics::{STORAGE_IO_TIME_METRIC, StorageIoOperation};
 use crate::page_cache::{PAGE_SZ, PageWriteGuard};
 pub(crate) mod io_engine;
 pub use io_engine::{
@@ -428,7 +431,9 @@ impl OpenFiles {
        if let Some(old_file) = slot_guard.file.take() {
            // the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
            // distinguish the two.
-            drop(old_file);
+            STORAGE_IO_TIME_METRIC
+                .get(StorageIoOperation::CloseByReplace)
+                .observe_closure_duration(|| drop(old_file));
        }

        // Prepare the slot for reuse and return it
@@ -527,9 +532,13 @@ impl<T> MaybeFatalIo<T> for std::io::Result<T> {
 /// where "support" means that we measure wall clock time.
 macro_rules! observe_duration {
    ($op:expr, $($body:tt)*) => {{
-        
-        $($body)*
-        
+        let instant = Instant::now();
+        let result = $($body)*;
+        let elapsed = instant.elapsed().as_secs_f64();
+        STORAGE_IO_TIME_METRIC
+            .get($op)
+            .observe(elapsed);
+        result
    }}
 }

@@ -904,7 +913,7 @@ impl VirtualFileInner {
        &self,
        buf: tokio_epoll_uring::Slice<Buf>,
        offset: u64,
-        _ctx: &RequestContext,
+        ctx: &RequestContext,
    ) -> (tokio_epoll_uring::Slice<Buf>, Result<usize, Error>)
    where
        Buf: tokio_epoll_uring::IoBufMut + Send,
@@ -921,7 +930,9 @@ impl VirtualFileInner {
        observe_duration!(StorageIoOperation::Read, {
            let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
            let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at");
-            
+            if let Ok(size) = res {
+                ctx.io_size_metrics().read.add(size.into_u64());
+            }
            (buf, res)
        })
    }
@@ -942,7 +953,7 @@ impl VirtualFileInner {
        &self,
        buf: FullSlice<B>,
        offset: u64,
-        _ctx: &RequestContext,
+        ctx: &RequestContext,
    ) -> (FullSlice<B>, Result<usize, Error>) {
        let file_guard = match self.lock_file().await {
            Ok(file_guard) => file_guard,
@@ -951,7 +962,9 @@ impl VirtualFileInner {
        observe_duration!(StorageIoOperation::Write, {
            let ((_file_guard, buf), result) =
                io_engine::get().write_at(file_guard, offset, buf).await;
-
+            if let Ok(size) = result {
+                ctx.io_size_metrics().write.add(size.into_u64());
+            }
            (buf, result)
        })
    }
@@ -1250,7 +1263,9 @@ impl Drop for VirtualFileInner {
                // there is also operation "close-by-replace" for closes done on eviction for
                // comparison.
                if let Some(fd) = slot_guard.file.take() {
-                    drop(fd);
+                    STORAGE_IO_TIME_METRIC
+                        .get(StorageIoOperation::Close)
+                        .observe_closure_duration(|| drop(fd));
                }
            }
        }
@@ -1319,6 +1334,7 @@ pub fn init(num_slots: usize, engine: IoEngineKind, mode: IoMode, sync_mode: Syn
    set_io_mode(mode);
    io_engine::init(engine);
    SYNC_MODE.store(sync_mode as u8, std::sync::atomic::Ordering::Relaxed);
+    crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
 }

 const TEST_MAX_FILE_DESCRIPTORS: usize = 10;
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -54,7 +54,14 @@ static IO_ENGINE: AtomicU8 = AtomicU8::new(IoEngine::NotSet as u8);
 pub(crate) fn set(engine_kind: IoEngineKind) {
    let engine: IoEngine = engine_kind.into();
    IO_ENGINE.store(engine as u8, std::sync::atomic::Ordering::Relaxed);
-    
+    #[cfg(not(test))]
+    {
+        let metric = &crate::metrics::virtual_file_io_engine::KIND;
+        metric.reset();
+        metric
+            .with_label_values(&[&format!("{engine_kind}")])
+            .set(1);
+    }
 }

 #[cfg(not(test))]
--- a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
+++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
@@ -82,7 +82,7 @@ pub async fn thread_local_system() -> Handle {
                    match res {
                        Ok(system) => {
                            info!("successfully launched system");
-                          
+                            metrics::THREAD_LOCAL_LAUNCH_SUCCESSES.inc();
                            Ok(system)
                        }
                        Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => {
@@ -90,7 +90,7 @@ pub async fn thread_local_system() -> Handle {
                            info_span!("stats").in_scope(|| {
                                emit_launch_failure_process_stats();
                            });
-                            
+                            metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc();
                            metrics::THREAD_LOCAL_METRICS_STORAGE.remove_system(inner.thread_local_state_id);
                            Err(())
                        }
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -46,6 +46,7 @@ use wal_decoder::models::*;

 use crate::ZERO_PAGE;
 use crate::context::RequestContext;
+use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::{DatadirModification, Version};
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::{PageReconstructError, Timeline};
@@ -234,7 +235,7 @@ impl WalIngest {
        modification: &mut DatadirModification<'_>,
        ctx: &RequestContext,
    ) -> Result<bool, WalIngestError> {
-      
+        WAL_INGEST.records_received.inc();
        let prev_len = modification.len();

        modification.set_lsn(interpreted.next_record_lsn)?;
@@ -1442,6 +1443,9 @@ impl WalIngest {
                gap_blocks_filled += 1;
            }

+            WAL_INGEST
+                .gap_blocks_zeroed_on_rel_extend
+                .inc_by(gap_blocks_filled);

            // Log something when relation extends cause use to fill gaps
            // with zero pages. Logging is rate limited per pg version to
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -40,7 +40,10 @@ use utils::sync::gate::GateError;
 use utils::sync::heavier_once_cell;

 use crate::config::PageServerConf;
-
+use crate::metrics::{
+    WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
+    WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME,
+};

 /// The real implementation that uses a Postgres process to
 /// perform WAL replay.
@@ -350,7 +353,7 @@ impl PostgresRedoManager {
                }
            },
            Err(permit) => {
-                
+                let start = Instant::now();
                // acquire guard before spawning process, so that we don't spawn new processes
                // if the gate is already closed.
                let _launched_processes_guard = match self.launched_processes.enter() {
@@ -368,9 +371,13 @@ impl PostgresRedoManager {
                    .context("launch walredo process")?,
                    _launched_processes_guard,
                });
-                
-               
-                
+                let duration = start.elapsed();
+                WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
+                info!(
+                    elapsed_ms = duration.as_millis(),
+                    pid = proc.id(),
+                    "launched walredo process"
+                );
                self.redo_process
                    .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
                proc
@@ -464,7 +471,9 @@ impl PostgresRedoManager {
                        }
                });

-               
+                WAL_REDO_TIME.observe(duration.as_secs_f64());
+                WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
+                WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);

                debug!(
                    "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
@@ -529,7 +538,9 @@ impl PostgresRedoManager {
        }
        // Success!
        let duration = start_time.elapsed();
-       
+        // FIXME: using the same metric here creates a bimodal distribution by default, and because
+        // there could be multiple batch sizes this would be N+1 modal.
+        WAL_REDO_TIME.observe(duration.as_secs_f64());

        debug!(
            "neon applied {} WAL records in {} us to reconstruct page image at LSN {}",
--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -21,7 +21,7 @@ use utils::poison::Poison;

 use self::no_leak_child::NoLeakChild;
 use crate::config::PageServerConf;
-use crate::metrics:: WalRedoKillCause;
+use crate::metrics::{WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER, WalRedoKillCause};
 use crate::page_cache::PAGE_SZ;
 use crate::span::debug_assert_current_span_has_tenant_id;

@@ -97,6 +97,7 @@ impl WalRedoProcess {
            //    walredo request.
            .spawn_no_leak_child(tenant_shard_id)
            .context("spawn process")?;
+        WAL_REDO_PROCESS_COUNTERS.started.inc();
        let mut child = scopeguard::guard(child, |child| {
            error!("killing wal-redo-postgres process due to a problem during launch");
            child.kill_and_wait(WalRedoKillCause::Startup);
@@ -117,7 +118,12 @@ impl WalRedoProcess {

        tokio::spawn(
            async move {
+                scopeguard::defer! {
+                    debug!("wal-redo-postgres stderr_logger_task finished");
+                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
+                }
                debug!("wal-redo-postgres stderr_logger_task started");
+                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();

                use tokio::io::AsyncBufReadExt;
                let mut stderr_lines = tokio::io::BufReader::new(stderr);
@@ -225,7 +231,7 @@ impl WalRedoProcess {
            }
        }
        protocol::build_get_page_msg(tag, &mut writebuf);
-        
+        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);

        let Ok(res) =
            tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await
--- a/pageserver/src/walredo/process/no_leak_child.rs
+++ b/pageserver/src/walredo/process/no_leak_child.rs
@@ -5,7 +5,7 @@ use std::process::{Child, Command};
 use pageserver_api::shard::TenantShardId;
 use tracing::{error, info, instrument};

-use crate::metrics::WalRedoKillCause;
+use crate::metrics::{WAL_REDO_PROCESS_COUNTERS, WalRedoKillCause};

 /// Wrapper type around `std::process::Child` which guarantees that the child
 /// will be killed and waited-for by this process before being dropped.
@@ -47,6 +47,9 @@ impl NoLeakChild {

    #[instrument(skip_all, fields(pid=child.id(), ?cause))]
    pub(crate) fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) {
+        scopeguard::defer! {
+            WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc();
+        }
        let res = child.kill();
        if let Err(e) = res {
            // This branch is very unlikely because:
--- a/pgxn/neon/communicator.c
+++ b/pgxn/neon/communicator.c
@@ -95,7 +95,7 @@ static uint32 local_request_counter;
 * Various settings related to prompt (fast) handling of PageStream responses
 * at any CHECK_FOR_INTERRUPTS point.
 */
-int				readahead_getpage_pull_timeout_ms = 50;
+int				readahead_getpage_pull_timeout_ms = 0;
 static int		PS_TIMEOUT_ID = 0;
 static bool		timeout_set = false;
 static bool		timeout_signaled = false;
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -75,7 +75,7 @@ char	   *neon_auth_token;
 int			readahead_buffer_size = 128;
 int			flush_every_n_requests = 8;

-int         neon_protocol_version = 3;
+int         neon_protocol_version = 2;

 static int	neon_compute_mode = 0;
 static int	max_reconnect_attempts = 60;
@@ -1362,7 +1362,7 @@ pg_init_libpagestore(void)
 							   "",
 							   PGC_POSTMASTER,
 							   0,	/* no flags required */
-							   NULL, NULL, NULL);
+							   check_neon_id, NULL, NULL);
 	DefineCustomStringVariable("neon.branch_id",
 							   "Neon branch_id the server is running on",
 							   NULL,
@@ -1370,7 +1370,7 @@ pg_init_libpagestore(void)
 							   "",
 							   PGC_POSTMASTER,
 							   0,	/* no flags required */
-							   NULL, NULL, NULL);
+							   check_neon_id, NULL, NULL);
 	DefineCustomStringVariable("neon.endpoint_id",
 							   "Neon endpoint_id the server is running on",
 							   NULL,
@@ -1378,7 +1378,7 @@ pg_init_libpagestore(void)
 							   "",
 							   PGC_POSTMASTER,
 							   0,	/* no flags required */
-							   NULL, NULL, NULL);
+							   check_neon_id, NULL, NULL);

 	DefineCustomIntVariable("neon.stripe_size",
 							"sharding stripe size",
@@ -1432,7 +1432,7 @@ pg_init_libpagestore(void)
 							"PageStream connection when we have pages which "
 							"were read ahead but not yet received.",
 							&readahead_getpage_pull_timeout_ms,
-							50, 0, 5 * 60 * 1000,
+							0, 0, 5 * 60 * 1000,
 							PGC_USERSET,
 							GUC_UNIT_MS,
 							NULL, NULL, NULL);
@@ -1440,7 +1440,7 @@ pg_init_libpagestore(void)
 							"Version of compute<->page server protocol",
 							NULL,
 							&neon_protocol_version,
-							3,	/* use protocol version 3 */
+							2,	/* use protocol version 2 */
 							2,	/* min */
 							3,	/* max */
 							PGC_SU_BACKEND,
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -2040,7 +2040,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 /*
 * neon_end_unlogged_build() -- Finish an unlogged rel build.
 *
- * Call this after you have finished WAL-logging a relation that was
+ * Call this after you have finished WAL-logging an relation that was
 * first populated without WAL-logging.
 *
 * This removes the local copy of the rel, since it's now been fully
@@ -2059,35 +2059,14 @@ neon_end_unlogged_build(SMgrRelation reln)

 	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
 	{
-		XLogRecPtr recptr;
-		BlockNumber nblocks;
-
 		Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2);
 		Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);

-		/*
-		 * Update the last-written LSN cache.
-		 *
-		 * The relation is still on local disk so we can get the size by
-		 * calling mdnblocks() directly. For the LSN, GetXLogInsertRecPtr() is
-		 * very conservative. If we could assume that this function is called
-		 * from the same backend that WAL-logged the contents, we could use
-		 * XactLastRecEnd here. But better safe than sorry.
-		 */
-		nblocks = mdnblocks(reln, MAIN_FORKNUM);
-		recptr = GetXLogInsertRecPtr();
-
-		neon_set_lwlsn_block_range(recptr,
-								   InfoFromNInfoB(rinfob),
-								   MAIN_FORKNUM, 0, nblocks);
-		neon_set_lwlsn_relation(recptr,
-								InfoFromNInfoB(rinfob),
-								MAIN_FORKNUM);
-
 		/* Make the relation look permanent again */
 		reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;

 		/* Remove local copy */
+		rinfob = InfoBFromSMgrRel(reln);
 		for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
 		{
 			neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -890,7 +890,7 @@ libpqwp_connect_start(char *conninfo)
 	 * palloc will exit on failure though, so there's not much we could do if
 	 * it *did* fail.
 	 */
-	conn = (WalProposerConn*)MemoryContextAllocZero(TopMemoryContext, sizeof(WalProposerConn));
+	conn = palloc(sizeof(WalProposerConn));
 	conn->pg_conn = pg_conn;
 	conn->is_nonblocking = false;	/* connections always start in blocking
 									 * mode */
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -947,6 +947,8 @@ class NeonEnvBuilder:
                    continue
                if SMALL_DB_FILE_NAME_REGEX.fullmatch(test_file.name):
                    continue
+                if FINAL_METRICS_FILE_NAME == test_file.name:
+                    continue
                log.debug(f"Removing large database {test_file} file")
                test_file.unlink()
            elif test_entry.is_dir():
@@ -2987,7 +2989,7 @@ class NeonPageserver(PgProtocol, LogUtils):
            return

        metrics = self.http_client().get_metrics_str()
-        metrics_snapshot_path = self.workdir / "final_metrics.txt"
+        metrics_snapshot_path = self.workdir / FINAL_METRICS_FILE_NAME

        with open(metrics_snapshot_path, "w") as f:
            f.write(metrics)
@@ -5154,6 +5156,9 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern[str] = re.compile(
    r"config-v1|heatmap-v1|tenant-manifest|metadata|.+\.(?:toml|pid|json|sql|conf)"
 )

+FINAL_METRICS_FILE_NAME: str = "final_metrics.txt"
+
+
 SKIP_DIRS = frozenset(
    (
        "pg_wal",
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -258,7 +258,7 @@ def get_scale_for_db(size_mb: int) -> int:


 ATTACHMENT_NAME_REGEX: re.Pattern[str] = re.compile(
-    r"regression\.(diffs|out)|.+\.(?:log|stderr|stdout|filediff|metrics|html|walredo)|final_metrics.txt"
+    r"regression\.(diffs|out)|.+\.(?:log|stderr|stdout|filediff|metrics|html|walredo)"
 )


--- a/test_runner/performance/pageserver/util.py
+++ b/test_runner/performance/pageserver/util.py
@@ -29,7 +29,7 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):

    log.info("wait for all tenants to become active")
    wait_until_all_tenants_state(
-        ps_http, "Active", iterations=10 + n_tenants, period=10, http_error_ok=False
+        ps_http, "Active", iterations=10 + n_tenants, period=1, http_error_ok=False
    )

    # ensure all layers are resident for predictiable performance
@@ -40,7 +40,7 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
            for layer in info.historic_layers:
                assert not layer.remote

-    env.storage_controller.reconcile_until_idle(timeout_secs=160)
+    env.storage_controller.reconcile_until_idle(timeout_secs=60)

    log.info("ready")

--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -155,7 +155,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
        "compaction_algorithm": {
            "kind": "tiered",
        },
-        "compaction_shard_ancestor": False,
        "eviction_policy": {
            "kind": "LayerAccessThreshold",
            "period": "20s",
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -199,8 +199,6 @@ def test_pageserver_gc_compaction_preempt(
    conf = PREEMPT_GC_COMPACTION_TENANT_CONF.copy()
    env = neon_env_builder.init_start(initial_tenant_conf=conf)

-    env.pageserver.allowed_errors.append(".*The timeline or pageserver is shutting down.*")
-
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -14,7 +14,6 @@ from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    NeonPageserver,
    StorageControllerMigrationConfig,
-    flush_ep_to_pageserver,
 )
 from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.utils import (
@@ -998,6 +997,10 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
    ps_secondary.http_client().tenant_heatmap_upload(tenant_id)
    heatmap_after_migration = timeline_heatmap(timeline_id)

+    local_layers = ps_secondary.list_layers(tenant_id, timeline_id)
+    # We download 1 layer per second and give up within 5 seconds.
+    assert len(local_layers) < 10
+
    after_migration_heatmap_layers_count = len(heatmap_after_migration["layers"])
    log.info(f"Heatmap size after cold migration is {after_migration_heatmap_layers_count}")

@@ -1035,13 +1038,8 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
        .value
    )

-    assert before == after
-
-    # Stop the endpoint and wait until any finally written WAL propagates to
-    # the pageserver and is uploaded to remote storage.
-    flush_ep_to_pageserver(env, workload.endpoint(), tenant_id, timeline_id)
-    ps_secondary.http_client().timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
    workload.stop()
+    assert before == after

    # Now simulate the case where a child timeline is archived, parent layers
    # are evicted and the child is unarchived. When the child is unarchived,
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
  "v17": [
    "17.4",
-    "eab3a37834cac6ec0719bf817ac918a201712d66"
+    "66114c23bc61205b0e3fb1e77ee76a4abc1eb4b8"
  ],
  "v16": [
    "16.8",
-    "37496f87b5324af53c56127e278ee5b1e8435253"
+    "d56e79cd5d6136c159b1d8d98acb7981d4b69364"
  ],
  "v15": [
    "15.12",
-    "8ecb12f21d862dfa39f7204b8f5e1c00a2a225b3"
+    "aeb292eeace9072e07071254b6ffc7a74007d4d2"
  ],
  "v14": [
    "14.17",
-    "d3c9d61fb7a362a165dac7060819dd9d6ad68c28"
+    "a0391901a2af13aa029b905272a5b2024133c926"
  ]
 }
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -70,6 +70,7 @@ num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
 p256 = { version = "0.13", features = ["jwk"] }
 parquet = { version = "53", default-features = false, features = ["zstd"] }
+pkcs8 = { version = "0.10", default-features = false, features = ["pem", "std"] }
 prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
Author	SHA1	Message	Date
Konstantin Knizhnik	1c4bed27be	Resolve merge conflicts	2025-04-16 08:20:46 +03:00
Konstantin Knizhnik	fdf0f1bdc0	Fix rust formatting	2025-04-16 07:49:47 +03:00
Konstantin Knizhnik	0bdd388dd8	Make it possible to control lazy_sru_download through tenant config	2025-04-16 07:49:47 +03:00
Konstantin Knizhnik	712b4cf83c	Update compute_tools/src/compute.rs Co-authored-by: Heikki Linnakangas <heikki@neon.tech>	2025-04-16 07:49:46 +03:00
Konstantin Knizhnik	15b6bb5026	Update libs/compute_api/src/spec.rs Co-authored-by: Heikki Linnakangas <heikki@neon.tech>	2025-04-16 07:49:44 +03:00
Konstantin Knizhnik	61d642e541	Update pageserver/src/page_service.rs Co-authored-by: Heikki Linnakangas <heikki@neon.tech>	2025-04-16 07:48:59 +03:00
Konstantin Knizhnik	1d24b887b8	Refector construction of basebackup command	2025-04-16 07:48:58 +03:00
Konstantin Knizhnik	955175c791	Make clippy happy	2025-04-16 07:48:58 +03:00
Konstantin Knizhnik	5fb0bcdd6a	Make clippy happy	2025-04-16 07:48:58 +03:00
Konstantin Knizhnik	f146fa86f8	Use lazy SLRU download for all timelines is feature flag is set	2025-04-16 07:48:56 +03:00
Konstantin Knizhnik	961008116b	Use lazy SLRU download for all timelines is feature flag is set	2025-04-16 07:47:55 +03:00
Konstantin Knizhnik	42d2d3addc	Fix checking lazy SLRU download condition	2025-04-16 07:45:35 +03:00
Konstantin Knizhnik	06d0bed566	Always update lazy_slru_download flag during basebackup	2025-04-16 07:45:35 +03:00
Konstantin Knizhnik	aa367e5d82	Add lazy_slru_download_threshold parameter to page server config	2025-04-16 07:45:33 +03:00
Konstantin Knizhnik	6b76e1c526	Add lazy_slru_download compute feature flag	2025-04-16 07:42:32 +03:00