implement our own sharding for PageCache (16 HashMaps based on lower 4 bits of blkno)

use scc::HashMap instead of DashMap
replace HashMap in PageCache with concurrent, sharded DashMap
2026-05-14 11:40:38 +00:00 · 2025-04-24 15:16:26 +02:00 · 2025-04-24 14:59:02 +02:00 · 2025-04-24 10:57:32 +02:00 · 2025-04-23 17:14:29 +00:00 · 2025-04-23 16:31:04 +00:00
155 changed files with 5446 additions and 7653 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -6,6 +6,7 @@ self-hosted-runner:
    - small
    - small-metal
    - small-arm64
+    - unit-perf
    - us-east-2
 config-variables:
  - AWS_ECR_REGION
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -70,6 +70,7 @@ runs:

    - name: Install Allure
      shell: bash -euxo pipefail {0}
+      working-directory: /tmp
      run: |
        if ! which allure; then
          ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
--- a/.github/workflows/_create-release-pr.yml
+++ b/.github/workflows/_create-release-pr.yml
@@ -53,10 +53,13 @@ jobs:
            || inputs.component-name == 'Compute' && 'release-compute'
          }}
      run: |
-        today=$(date +'%Y-%m-%d')
-        echo "title=${COMPONENT_NAME} release ${today}" | tee -a ${GITHUB_OUTPUT}
-        echo "rc-branch=rc/${RELEASE_BRANCH}/${today}"  | tee -a ${GITHUB_OUTPUT}
-        echo "release-branch=${RELEASE_BRANCH}"         | tee -a ${GITHUB_OUTPUT}
+        now_date=$(date -u +'%Y-%m-%d')
+        now_time=$(date -u +'%H-%M-%Z')
+        {
+          echo "title=${COMPONENT_NAME} release ${now_date}"
+          echo "rc-branch=rc/${RELEASE_BRANCH}/${now_date}_${now_time}"
+          echo "release-branch=${RELEASE_BRANCH}"
+        } | tee -a ${GITHUB_OUTPUT}

    - name: Configure git
      run: |
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -284,7 +284,7 @@ jobs:
      statuses: write
      contents: write
      pull-requests: write
-    runs-on: [ self-hosted, small-metal ]
+    runs-on: [ self-hosted, unit-perf ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
@@ -1271,7 +1271,7 @@ jobs:
          exit 1

  deploy:
-    needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
+    needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, trigger-custom-extensions-build-and-wait ]
    # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod`
    if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }}
    permissions:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1416,6 +1416,7 @@ name = "control_plane"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "base64 0.13.1",
 "camino",
 "clap",
 "comfy-table",
@@ -1425,10 +1426,12 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "hyper 0.14.30",
+ "jsonwebtoken",
 "nix 0.27.1",
 "once_cell",
 "pageserver_api",
 "pageserver_client",
+ "pem",
 "postgres_backend",
 "postgres_connection",
 "regex",
@@ -1437,6 +1440,8 @@ dependencies = [
 "scopeguard",
 "serde",
 "serde_json",
+ "sha2",
+ "spki 0.7.3",
 "storage_broker",
 "thiserror 1.0.69",
 "tokio",
@@ -2817,6 +2822,7 @@ dependencies = [
 "hyper 0.14.30",
 "itertools 0.10.5",
 "jemalloc_pprof",
+ "jsonwebtoken",
 "metrics",
 "once_cell",
 "pprof",
@@ -2837,6 +2843,7 @@ dependencies = [
 "utils",
 "uuid",
 "workspace_hack",
+ "x509-cert",
 ]

 [[package]]
@@ -4268,6 +4275,7 @@ dependencies = [
 "hyper 0.14.30",
 "indoc",
 "itertools 0.10.5",
+ "jsonwebtoken",
 "md5",
 "metrics",
 "nix 0.27.1",
@@ -5684,9 +5692,9 @@ dependencies = [

 [[package]]
 name = "ring"
-version = "0.17.13"
+version = "0.17.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70ac5d832aa16abd7d1def883a8545280c20a60f523a370aa3a9617c2b8550ee"
+checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
 dependencies = [
 "cc",
 "cfg-if",
@@ -5987,6 +5995,7 @@ dependencies = [
 "humantime",
 "hyper 0.14.30",
 "itertools 0.10.5",
+ "jsonwebtoken",
 "metrics",
 "once_cell",
 "pageserver_api",
@@ -7871,6 +7880,7 @@ dependencies = [
 "metrics",
 "nix 0.27.1",
 "once_cell",
+ "pem",
 "pin-project-lite",
 "postgres_connection",
 "pprof",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -141,6 +141,7 @@ parking_lot = "0.12"
 parquet = { version = "53", default-features = false, features = ["zstd"] }
 parquet_derive = "53"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
+pem = "3.0.3"
 pin-project-lite = "0.2"
 pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
 procfs = "0.16"
@@ -174,6 +175,7 @@ signal-hook = "0.3"
 smallvec = "1.11"
 smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
+spki = "0.7.3"
 strum = "0.26"
 strum_macros = "0.26"
 "subtle"  = "2.5.0"
--- a/compute/patches/pgvector.patch
+++ b/compute/patches/pgvector.patch
@@ -15,7 +15,7 @@ index 7a4b88c..56678af 100644
 HEADERS = src/halfvec.h src/sparsevec.h src/vector.h
 
 diff --git a/src/hnswbuild.c b/src/hnswbuild.c
-index b667478..dc95d89 100644
+index b667478..1298aa1 100644
 --- a/src/hnswbuild.c
 +++ b/src/hnswbuild.c
@@ -843,9 +843,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
@@ -36,7 +36,7 @@ index b667478..dc95d89 100644
 	/* Close relations within worker */
 	index_close(indexRel, indexLockmode);
 	table_close(heapRel, heapLockmode);
-@@ -1100,12 +1108,39 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
+@@ -1100,13 +1108,25 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
 	SeedRandom(42);
 #endif
 
@@ -48,32 +48,17 @@ index b667478..dc95d89 100644
 
 	BuildGraph(buildstate, forkNum);
 
-	if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM)
 +#ifdef NEON_SMGR
 +	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
 +#endif
 +
-+	if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) {
+ 	if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM)
 		log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocksInFork(index, forkNum), true);
-+#ifdef NEON_SMGR
-+		{
-+#if PG_VERSION_NUM >= 160000
-+			RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
-+#else
-+			RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
-+#endif
-+			if (set_lwlsn_block_range_hook)
-+				set_lwlsn_block_range_hook(XactLastRecEnd, rlocator,
-+										   MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
-+			if (set_lwlsn_relation_hook)
-+				set_lwlsn_relation_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM);
-+		}
-+#endif
-+	}
-+
+ 
 +#ifdef NEON_SMGR
 +	smgr_end_unlogged_build(RelationGetSmgr(index));
 +#endif
- 
+
 	FreeBuildState(buildstate);
 }
+ 
--- a/compute/patches/rum.patch
+++ b/compute/patches/rum.patch
@@ -1,5 +1,5 @@
 diff --git a/src/ruminsert.c b/src/ruminsert.c
-index 255e616..7a2240f 100644
+index 255e616..1c6edb7 100644
 --- a/src/ruminsert.c
 +++ b/src/ruminsert.c
@@ -628,6 +628,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
@@ -24,24 +24,12 @@ index 255e616..7a2240f 100644
 	/*
 	 * Write index to xlog
 	 */
-@@ -713,6 +721,22 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
+@@ -713,6 +721,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
 		UnlockReleaseBuffer(buffer);
 	}
 
 +#ifdef NEON_SMGR
-+	{
-+#if PG_VERSION_NUM >= 160000
-+		RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
-+#else
-+		RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
-+#endif
-+		if (set_lwlsn_block_range_hook)
-+			set_lwlsn_block_range_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
-+		if (set_lwlsn_relation_hook)
-+			set_lwlsn_relation_hook(XactLastRecEnd, rlocator, MAIN_FORKNUM);
-+
-+		smgr_end_unlogged_build(index->rd_smgr);
-+	}
+	smgr_end_unlogged_build(index->rd_smgr);
 +#endif
 +
 	/*
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -29,13 +29,12 @@
 //! ```sh
 //! compute_ctl -D /var/db/postgres/compute \
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
-//!             -S /var/db/postgres/specs/current.json \
+//!             -c /var/db/postgres/configs/config.json \
 //!             -b /usr/local/bin/postgres \
 //!             -r http://pg-ext-s3-gateway \
 //! ```
 use std::ffi::OsString;
 use std::fs::File;
-use std::path::Path;
 use std::process::exit;
 use std::sync::mpsc;
 use std::thread;
@@ -43,8 +42,7 @@ use std::time::Duration;

 use anyhow::{Context, Result};
 use clap::Parser;
-use compute_api::responses::ComputeCtlConfig;
-use compute_api::spec::ComputeSpec;
+use compute_api::responses::ComputeConfig;
 use compute_tools::compute::{
    BUILD_TAG, ComputeNode, ComputeNodeParams, forward_termination_signal,
 };
@@ -118,8 +116,10 @@ struct Cli {
    #[arg(long)]
    pub set_disk_quota_for_fs: Option<String>,

-    #[arg(short = 'S', long, group = "spec-path")]
-    pub spec_path: Option<OsString>,
+    // TODO(tristan957): remove alias after compatibility tests are no longer
+    // an issue
+    #[arg(short = 'c', long, alias = "spec-path")]
+    pub config: Option<OsString>,

    #[arg(short = 'i', long, group = "compute-id")]
    pub compute_id: String,
@@ -127,8 +127,9 @@ struct Cli {
    #[arg(
        short = 'p',
        long,
-        conflicts_with = "spec-path",
-        value_name = "CONTROL_PLANE_API_BASE_URL"
+        conflicts_with = "config",
+        value_name = "CONTROL_PLANE_API_BASE_URL",
+        requires = "compute-id"
    )]
    pub control_plane_uri: Option<String>,
 }
@@ -138,7 +139,7 @@ fn main() -> Result<()> {

    let scenario = failpoint_support::init();

-    // For historical reasons, the main thread that processes the spec and launches postgres
+    // For historical reasons, the main thread that processes the config and launches postgres
    // is synchronous, but we always have this tokio runtime available and we "enter" it so
    // that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...)
    // from all parts of compute_ctl.
@@ -154,7 +155,7 @@ fn main() -> Result<()> {

    let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;

-    let cli_spec = try_spec_from_cli(&cli)?;
+    let config = get_config(&cli)?;

    let compute_node = ComputeNode::new(
        ComputeNodeParams {
@@ -175,8 +176,7 @@ fn main() -> Result<()> {
            #[cfg(target_os = "linux")]
            vm_monitor_addr: cli.vm_monitor_addr,
        },
-        cli_spec.spec,
-        cli_spec.compute_ctl_config,
+        config,
    )?;

    let exit_code = compute_node.run()?;
@@ -201,27 +201,17 @@ async fn init() -> Result<()> {
    Ok(())
 }

-fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
-    // First, read spec from the path if provided
-    if let Some(ref spec_path) = cli.spec_path {
-        let file = File::open(Path::new(spec_path))?;
-        return Ok(CliSpecParams {
-            spec: Some(serde_json::from_reader(file)?),
-            compute_ctl_config: ComputeCtlConfig::default(),
-        });
+fn get_config(cli: &Cli) -> Result<ComputeConfig> {
+    // First, read the config from the path if provided
+    if let Some(ref config) = cli.config {
+        let file = File::open(config)?;
+        return Ok(serde_json::from_reader(&file)?);
    }

-    if cli.control_plane_uri.is_none() {
-        panic!("must specify --control-plane-uri");
-    };
-
-    // If the spec wasn't provided in the CLI arguments, then retrieve it from
+    // If the config wasn't provided in the CLI arguments, then retrieve it from
    // the control plane
-    match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
-        Ok(resp) => Ok(CliSpecParams {
-            spec: resp.0,
-            compute_ctl_config: resp.1,
-        }),
+    match get_config_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
+        Ok(config) => Ok(config),
        Err(e) => {
            error!(
                "cannot get response from control plane: {}\n\
@@ -233,13 +223,6 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
    }
 }

-struct CliSpecParams {
-    /// If a spec was provided via CLI or file, the [`ComputeSpec`]
-    spec: Option<ComputeSpec>,
-    #[allow(dead_code)]
-    compute_ctl_config: ComputeCtlConfig,
-}
-
 fn deinit_and_exit(exit_code: Option<i32>) -> ! {
    // Shutdown trace pipeline gracefully, so that it has a chance to send any
    // pending traces before we exit. Shutting down OTEL tracing provider may
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -11,7 +11,7 @@ use std::{env, fs};
 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
 use compute_api::privilege::Privilege;
-use compute_api::responses::{ComputeCtlConfig, ComputeMetrics, ComputeStatus};
+use compute_api::responses::{ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus};
 use compute_api::spec::{
    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
 };
@@ -303,11 +303,7 @@ struct StartVmMonitorResult {
 }

 impl ComputeNode {
-    pub fn new(
-        params: ComputeNodeParams,
-        cli_spec: Option<ComputeSpec>,
-        compute_ctl_config: ComputeCtlConfig,
-    ) -> Result<Self> {
+    pub fn new(params: ComputeNodeParams, config: ComputeConfig) -> Result<Self> {
        let connstr = params.connstr.as_str();
        let conn_conf = postgres::config::Config::from_str(connstr)
            .context("cannot build postgres config from connstr")?;
@@ -315,8 +311,8 @@ impl ComputeNode {
            .context("cannot build tokio postgres config from connstr")?;

        let mut new_state = ComputeState::new();
-        if let Some(cli_spec) = cli_spec {
-            let pspec = ParsedSpec::try_from(cli_spec).map_err(|msg| anyhow::anyhow!(msg))?;
+        if let Some(spec) = config.spec {
+            let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
            new_state.pspec = Some(pspec);
        }

@@ -327,7 +323,7 @@ impl ComputeNode {
            state: Mutex::new(new_state),
            state_changed: Condvar::new(),
            ext_download_progress: RwLock::new(HashMap::new()),
-            compute_ctl_config,
+            compute_ctl_config: config.compute_ctl_config,
        })
    }

@@ -523,11 +519,14 @@ impl ComputeNode {

        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
-            "starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}",
+            "starting compute for project {}, operation {}, tenant {}, timeline {}, project {}, branch {}, endpoint {}, features {:?}, spec.remote_extensions {:?}",
            pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
            pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
            pspec.tenant_id,
            pspec.timeline_id,
+            pspec.spec.project_id.as_deref().unwrap_or("None"),
+            pspec.spec.branch_id.as_deref().unwrap_or("None"),
+            pspec.spec.endpoint_id.as_deref().unwrap_or("None"),
            pspec.spec.features,
            pspec.spec.remote_extensions,
        );
@@ -631,19 +630,23 @@ impl ComputeNode {
            });
        }

-        // Configure and start rsyslog for HIPAA if necessary
-        if let ComputeAudit::Hipaa = pspec.spec.audit_log_level {
-            let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
-            if remote_endpoint.is_empty() {
-                anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
+        // Configure and start rsyslog for compliance audit logging
+        match pspec.spec.audit_log_level {
+            ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
+                let remote_endpoint =
+                    std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
+                if remote_endpoint.is_empty() {
+                    anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
+                }
+
+                let log_directory_path = Path::new(&self.params.pgdata).join("log");
+                let log_directory_path = log_directory_path.to_string_lossy().to_string();
+                configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
+
+                // Launch a background task to clean up the audit logs
+                launch_pgaudit_gc(log_directory_path);
            }
-
-            let log_directory_path = Path::new(&self.params.pgdata).join("log");
-            let log_directory_path = log_directory_path.to_string_lossy().to_string();
-            configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
-
-            // Launch a background task to clean up the audit logs
-            launch_pgaudit_gc(log_directory_path);
+            _ => {}
        }

        // Configure and start rsyslog for Postgres logs export
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -89,6 +89,15 @@ pub fn write_postgres_conf(
            escape_conf_value(&s.to_string())
        )?;
    }
+    if let Some(s) = &spec.project_id {
+        writeln!(file, "neon.project_id={}", escape_conf_value(s))?;
+    }
+    if let Some(s) = &spec.branch_id {
+        writeln!(file, "neon.branch_id={}", escape_conf_value(s))?;
+    }
+    if let Some(s) = &spec.endpoint_id {
+        writeln!(file, "neon.endpoint_id={}", escape_conf_value(s))?;
+    }

    // tls
    if let Some(tls_config) = tls_config {
@@ -169,7 +178,7 @@ pub fn write_postgres_conf(
    // and don't allow the user or the control plane admin to change them.
    match spec.audit_log_level {
        ComputeAudit::Disabled => {}
-        ComputeAudit::Log => {
+        ComputeAudit::Log | ComputeAudit::Base => {
            writeln!(file, "# Managed by compute_ctl base audit settings: start")?;
            writeln!(file, "pgaudit.log='ddl,role'")?;
            // Disable logging of catalog queries to reduce the noise
@@ -193,16 +202,20 @@ pub fn write_postgres_conf(
            }
            writeln!(file, "# Managed by compute_ctl base audit settings: end")?;
        }
-        ComputeAudit::Hipaa => {
+        ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
            writeln!(
                file,
                "# Managed by compute_ctl compliance audit settings: begin"
            )?;
-            // This log level is very verbose
-            // but this is necessary for HIPAA compliance.
-            // Exclude 'misc' category, because it doesn't contain anythig relevant.
-            writeln!(file, "pgaudit.log='all, -misc'")?;
-            writeln!(file, "pgaudit.log_parameter=on")?;
+            // Log statement parameters only at the `Full` audit level:
+            // this is very verbose and may contain sensitive data.
+            if spec.audit_log_level == ComputeAudit::Full {
+                writeln!(file, "pgaudit.log_parameter=on")?;
+                writeln!(file, "pgaudit.log='all'")?;
+            } else {
+                writeln!(file, "pgaudit.log_parameter=off")?;
+                writeln!(file, "pgaudit.log='all, -misc'")?;
+            }
            // Disable logging of catalog queries
            // The catalog doesn't contain sensitive data, so we don't need to audit it.
            writeln!(file, "pgaudit.log_catalog=off")?;
--- a/compute_tools/src/http/middleware/authorize.rs
+++ b/compute_tools/src/http/middleware/authorize.rs
@@ -1,7 +1,7 @@
-use std::{collections::HashSet, net::SocketAddr};
+use std::collections::HashSet;

 use anyhow::{Result, anyhow};
-use axum::{RequestExt, body::Body, extract::ConnectInfo};
+use axum::{RequestExt, body::Body};
 use axum_extra::{
    TypedHeader,
    headers::{Authorization, authorization::Bearer},
@@ -11,7 +11,7 @@ use futures::future::BoxFuture;
 use http::{Request, Response, StatusCode};
 use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
 use tower_http::auth::AsyncAuthorizeRequest;
-use tracing::warn;
+use tracing::{debug, warn};

 use crate::http::{JsonResponse, extract::RequestId};

@@ -54,8 +54,8 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
        Box::pin(async move {
            let request_id = request.extract_parts::<RequestId>().await.unwrap();

-            // TODO: Remove this stanza after teaching neon_local and the
-            // regression tests to use a JWT + JWKS.
+            // TODO(tristan957): Remove this stanza after teaching neon_local
+            // and the regression tests to use a JWT + JWKS.
            //
            // https://github.com/neondatabase/neon/issues/11316
            if cfg!(feature = "testing") {
@@ -64,19 +64,6 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
                return Ok(request);
            }

-            let connect_info = request
-                .extract_parts::<ConnectInfo<SocketAddr>>()
-                .await
-                .unwrap();
-
-            // In the event the request is coming from the loopback interface,
-            // allow all requests
-            if connect_info.ip().is_loopback() {
-                warn!(%request_id, "Bypassed authorization because request is coming from the loopback interface");
-
-                return Ok(request);
-            }
-
            let TypedHeader(Authorization(bearer)) = request
                .extract_parts::<TypedHeader<Authorization<Bearer>>>()
                .await
@@ -92,7 +79,7 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
            if data.claims.compute_id != compute_id {
                return Err(JsonResponse::error(
                    StatusCode::UNAUTHORIZED,
-                    "invalid claims in authorization token",
+                    "invalid compute ID in authorization token claims",
                ));
            }

@@ -112,12 +99,16 @@ impl Authorize {
        token: &str,
        validation: &Validation,
    ) -> Result<TokenData<ComputeClaims>> {
+        debug_assert!(!jwks.keys.is_empty());
+
+        debug!("verifying token {}", token);
+
        for jwk in jwks.keys.iter() {
            let decoding_key = match DecodingKey::from_jwk(jwk) {
                Ok(key) => key,
                Err(e) => {
                    warn!(
-                        "Failed to construct decoding key from {}: {}",
+                        "failed to construct decoding key from {}: {}",
                        jwk.common.key_id.as_ref().unwrap(),
                        e
                    );
@@ -130,7 +121,7 @@ impl Authorize {
                Ok(data) => return Ok(data),
                Err(e) => {
                    warn!(
-                        "Failed to decode authorization token using {}: {}",
+                        "failed to decode authorization token using {}: {}",
                        jwk.common.key_id.as_ref().unwrap(),
                        e
                    );
@@ -140,6 +131,6 @@ impl Authorize {
            }
        }

-        Err(anyhow!("Failed to verify authorization token"))
+        Err(anyhow!("failed to verify authorization token"))
    }
 }
--- a/compute_tools/src/metrics.rs
+++ b/compute_tools/src/metrics.rs
@@ -19,13 +19,13 @@ pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
 // but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec.
 // And it's fair to call it a 'RPC' (Remote Procedure Call).
 pub enum CPlaneRequestRPC {
-    GetSpec,
+    GetConfig,
 }

 impl CPlaneRequestRPC {
    pub fn as_str(&self) -> &str {
        match self {
-            CPlaneRequestRPC::GetSpec => "GetSpec",
+            CPlaneRequestRPC::GetConfig => "GetConfig",
        }
    }
 }
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -3,9 +3,8 @@ use std::path::Path;

 use anyhow::{Result, anyhow, bail};
 use compute_api::responses::{
-    ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse,
+    ComputeConfig, ControlPlaneComputeStatus, ControlPlaneConfigResponse,
 };
-use compute_api::spec::ComputeSpec;
 use reqwest::StatusCode;
 use tokio_postgres::Client;
 use tracing::{error, info, instrument};
@@ -21,7 +20,7 @@ use crate::params::PG_HBA_ALL_MD5;
 fn do_control_plane_request(
    uri: &str,
    jwt: &str,
-) -> Result<ControlPlaneSpecResponse, (bool, String, String)> {
+) -> Result<ControlPlaneConfigResponse, (bool, String, String)> {
    let resp = reqwest::blocking::Client::new()
        .get(uri)
        .header("Authorization", format!("Bearer {}", jwt))
@@ -29,14 +28,14 @@ fn do_control_plane_request(
        .map_err(|e| {
            (
                true,
-                format!("could not perform spec request to control plane: {:?}", e),
+                format!("could not perform request to control plane: {:?}", e),
                UNKNOWN_HTTP_STATUS.to_string(),
            )
        })?;

    let status = resp.status();
    match status {
-        StatusCode::OK => match resp.json::<ControlPlaneSpecResponse>() {
+        StatusCode::OK => match resp.json::<ControlPlaneConfigResponse>() {
            Ok(spec_resp) => Ok(spec_resp),
            Err(e) => Err((
                true,
@@ -69,40 +68,35 @@ fn do_control_plane_request(
    }
 }

-/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN`
-/// env variable is set, it will be used for authorization.
-pub fn get_spec_from_control_plane(
-    base_uri: &str,
-    compute_id: &str,
-) -> Result<(Option<ComputeSpec>, ComputeCtlConfig)> {
+/// Request config from the control-plane by compute_id. If
+/// `NEON_CONTROL_PLANE_TOKEN` env variable is set, it will be used for
+/// authorization.
+pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result<ComputeConfig> {
    let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
-    let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
-        Ok(v) => v,
-        Err(_) => "".to_string(),
-    };
+    let jwt: String = std::env::var("NEON_CONTROL_PLANE_TOKEN").unwrap_or_default();
    let mut attempt = 1;

-    info!("getting spec from control plane: {}", cp_uri);
+    info!("getting config from control plane: {}", cp_uri);

    // Do 3 attempts to get spec from the control plane using the following logic:
    // - network error -> then retry
    // - compute id is unknown or any other error -> bail out
    // - no spec for compute yet (Empty state) -> return Ok(None)
-    // - got spec -> return Ok(Some(spec))
+    // - got config -> return Ok(config)
    while attempt < 4 {
        let result = match do_control_plane_request(&cp_uri, &jwt) {
-            Ok(spec_resp) => {
+            Ok(config_resp) => {
                CPLANE_REQUESTS_TOTAL
                    .with_label_values(&[
-                        CPlaneRequestRPC::GetSpec.as_str(),
+                        CPlaneRequestRPC::GetConfig.as_str(),
                        &StatusCode::OK.to_string(),
                    ])
                    .inc();
-                match spec_resp.status {
-                    ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)),
+                match config_resp.status {
+                    ControlPlaneComputeStatus::Empty => Ok(config_resp.into()),
                    ControlPlaneComputeStatus::Attached => {
-                        if let Some(spec) = spec_resp.spec {
-                            Ok((Some(spec), spec_resp.compute_ctl_config))
+                        if config_resp.spec.is_some() {
+                            Ok(config_resp.into())
                        } else {
                            bail!("compute is attached, but spec is empty")
                        }
@@ -111,7 +105,7 @@ pub fn get_spec_from_control_plane(
            }
            Err((retry, msg, status)) => {
                CPLANE_REQUESTS_TOTAL
-                    .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status])
+                    .with_label_values(&[CPlaneRequestRPC::GetConfig.as_str(), &status])
                    .inc();
                if retry {
                    Err(anyhow!(msg))
@@ -122,7 +116,7 @@ pub fn get_spec_from_control_plane(
        };

        if let Err(e) = &result {
-            error!("attempt {} to get spec failed with: {}", attempt, e);
+            error!("attempt {} to get config failed with: {}", attempt, e);
        } else {
            return result;
        }
@@ -133,13 +127,13 @@ pub fn get_spec_from_control_plane(

    // All attempts failed, return error.
    Err(anyhow::anyhow!(
-        "Exhausted all attempts to retrieve the spec from the control plane"
+        "Exhausted all attempts to retrieve the config from the control plane"
    ))
 }

 /// Check `pg_hba.conf` and update if needed to allow external connections.
 pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
-    // XXX: consider making it a part of spec.json
+    // XXX: consider making it a part of config.json
    let pghba_path = pgdata_path.join("pg_hba.conf");

    if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? {
@@ -153,7 +147,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {

 /// Create a standby.signal file
 pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
-    // XXX: consider making it a part of spec.json
+    // XXX: consider making it a part of config.json
    let signalfile = pgdata_path.join("standby.signal");

    if !signalfile.exists() {
--- a/compute_tools/src/spec_apply.rs
+++ b/compute_tools/src/spec_apply.rs
@@ -278,12 +278,12 @@ impl ComputeNode {
            // so that all config operations are audit logged.
            match spec.audit_log_level
            {
-                ComputeAudit::Hipaa => {
+                ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
                    phases.push(CreatePgauditExtension);
                    phases.push(CreatePgauditlogtofileExtension);
                    phases.push(DisablePostgresDBPgAudit);
                }
-                ComputeAudit::Log => {
+                ComputeAudit::Log | ComputeAudit::Base => {
                    phases.push(CreatePgauditExtension);
                    phases.push(DisablePostgresDBPgAudit);
                }
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -6,13 +6,16 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
+base64.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
 futures.workspace = true
 humantime.workspace = true
+jsonwebtoken.workspace = true
 nix.workspace = true
 once_cell.workspace = true
+pem.workspace = true
 humantime-serde.workspace = true
 hyper0.workspace = true
 regex.workspace = true
@@ -20,6 +23,8 @@ reqwest = { workspace = true, features = ["blocking", "json"] }
 scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
+sha2.workspace = true
+spki.workspace = true
 thiserror.workspace = true
 toml.workspace = true
 toml_edit.workspace = true
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -552,6 +552,7 @@ enum EndpointCmd {
    Start(EndpointStartCmdArgs),
    Reconfigure(EndpointReconfigureCmdArgs),
    Stop(EndpointStopCmdArgs),
+    GenerateJwt(EndpointGenerateJwtCmdArgs),
 }

 #[derive(clap::Args)]
@@ -699,6 +700,13 @@ struct EndpointStopCmdArgs {
    mode: String,
 }

+#[derive(clap::Args)]
+#[clap(about = "Generate a JWT for an endpoint")]
+struct EndpointGenerateJwtCmdArgs {
+    #[clap(help = "Postgres endpoint id")]
+    endpoint_id: String,
+}
+
 #[derive(clap::Subcommand)]
 #[clap(about = "Manage neon_local branch name mappings")]
 enum MappingsCmd {
@@ -1528,6 +1536,16 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
            endpoint.stop(&args.mode, args.destroy)?;
        }
+        EndpointCmd::GenerateJwt(args) => {
+            let endpoint_id = &args.endpoint_id;
+            let endpoint = cplane
+                .endpoints
+                .get(endpoint_id)
+                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
+            let jwt = endpoint.generate_jwt()?;
+
+            println!("{jwt}");
+        }
    }

    Ok(())
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -29,7 +29,7 @@
 //!     compute.log               - log output of `compute_ctl` and `postgres`
 //!     endpoint.json             - serialized `EndpointConf` struct
 //!     postgresql.conf           - postgresql settings
-//!     spec.json                 - passed to `compute_ctl`
+//!     config.json               - passed to `compute_ctl`
 //!     pgdata/
 //!         postgresql.conf       - copy of postgresql.conf created by `compute_ctl`
 //!         zenith.signal
@@ -42,20 +42,30 @@ use std::path::PathBuf;
 use std::process::Command;
 use std::str::FromStr;
 use std::sync::Arc;
-use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
+use std::time::{Duration, Instant};

 use anyhow::{Context, Result, anyhow, bail};
-use compute_api::requests::ConfigurationRequest;
-use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse};
+use compute_api::requests::{ComputeClaims, ConfigurationRequest};
+use compute_api::responses::{
+    ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TlsConfig,
+};
 use compute_api::spec::{
    Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
    RemoteExtSpec, Role,
 };
+use jsonwebtoken::jwk::{
+    AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
+    OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse,
+};
 use nix::sys::signal::{Signal, kill};
 use pageserver_api::shard::ShardStripeSize;
+use pem::Pem;
 use reqwest::header::CONTENT_TYPE;
 use safekeeper_api::membership::SafekeeperGeneration;
 use serde::{Deserialize, Serialize};
+use sha2::{Digest, Sha256};
+use spki::der::Decode;
+use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef};
 use tracing::debug;
 use url::Host;
 use utils::id::{NodeId, TenantId, TimelineId};
@@ -80,6 +90,7 @@ pub struct EndpointConf {
    drop_subscriptions_before_start: bool,
    features: Vec<ComputeFeature>,
    cluster: Option<Cluster>,
+    compute_ctl_config: ComputeCtlConfig,
 }

 //
@@ -135,6 +146,37 @@ impl ComputeControlPlane {
            .unwrap_or(self.base_port)
    }

+    /// Create a JSON Web Key Set. This ideally matches the way we create a JWKS
+    /// from the production control plane.
+    fn create_jwks_from_pem(pem: &Pem) -> Result<JwkSet> {
+        let spki: SubjectPublicKeyInfoRef = SubjectPublicKeyInfo::from_der(pem.contents())?;
+        let public_key = spki.subject_public_key.raw_bytes();
+
+        let mut hasher = Sha256::new();
+        hasher.update(public_key);
+        let key_hash = hasher.finalize();
+
+        Ok(JwkSet {
+            keys: vec![Jwk {
+                common: CommonParameters {
+                    public_key_use: Some(PublicKeyUse::Signature),
+                    key_operations: Some(vec![KeyOperations::Verify]),
+                    key_algorithm: Some(KeyAlgorithm::EdDSA),
+                    key_id: Some(base64::encode_config(key_hash, base64::URL_SAFE_NO_PAD)),
+                    x509_url: None::<String>,
+                    x509_chain: None::<Vec<String>>,
+                    x509_sha1_fingerprint: None::<String>,
+                    x509_sha256_fingerprint: None::<String>,
+                },
+                algorithm: AlgorithmParameters::OctetKeyPair(OctetKeyPairParameters {
+                    key_type: OctetKeyPairType::OctetKeyPair,
+                    curve: EllipticCurve::Ed25519,
+                    x: base64::encode_config(public_key, base64::URL_SAFE_NO_PAD),
+                }),
+            }],
+        })
+    }
+
    #[allow(clippy::too_many_arguments)]
    pub fn new_endpoint(
        &mut self,
@@ -152,6 +194,10 @@ impl ComputeControlPlane {
        let pg_port = pg_port.unwrap_or_else(|| self.get_port());
        let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1);
        let internal_http_port = internal_http_port.unwrap_or_else(|| external_http_port + 1);
+        let compute_ctl_config = ComputeCtlConfig {
+            jwks: Self::create_jwks_from_pem(&self.env.read_public_key()?)?,
+            tls: None::<TlsConfig>,
+        };
        let ep = Arc::new(Endpoint {
            endpoint_id: endpoint_id.to_owned(),
            pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), pg_port),
@@ -179,6 +225,7 @@ impl ComputeControlPlane {
            reconfigure_concurrency: 1,
            features: vec![],
            cluster: None,
+            compute_ctl_config: compute_ctl_config.clone(),
        });

        ep.create_endpoint_dir()?;
@@ -198,6 +245,7 @@ impl ComputeControlPlane {
                reconfigure_concurrency: 1,
                features: vec![],
                cluster: None,
+                compute_ctl_config,
            })?,
        )?;
        std::fs::write(
@@ -240,7 +288,6 @@ impl ComputeControlPlane {

 ///////////////////////////////////////////////////////////////////////////////

-#[derive(Debug)]
 pub struct Endpoint {
    /// used as the directory name
    endpoint_id: String,
@@ -269,6 +316,9 @@ pub struct Endpoint {
    features: Vec<ComputeFeature>,
    // Cluster settings
    cluster: Option<Cluster>,
+
+    /// The compute_ctl config for the endpoint's compute.
+    compute_ctl_config: ComputeCtlConfig,
 }

 #[derive(PartialEq, Eq)]
@@ -331,6 +381,7 @@ impl Endpoint {
            drop_subscriptions_before_start: conf.drop_subscriptions_before_start,
            features: conf.features,
            cluster: conf.cluster,
+            compute_ctl_config: conf.compute_ctl_config,
        })
    }

@@ -578,6 +629,13 @@ impl Endpoint {
        Ok(safekeeper_connstrings)
    }

+    /// Generate a JWT whose claims identify this endpoint (`compute_id` is the endpoint ID).
+    pub fn generate_jwt(&self) -> Result<String> {
+        self.env.generate_auth_token(&ComputeClaims {
+            compute_id: self.endpoint_id.clone(),
+        })
+    }
+
    #[allow(clippy::too_many_arguments)]
    pub async fn start(
        &self,
@@ -619,87 +677,101 @@ impl Endpoint {
            remote_extensions = None;
        };

-        // Create spec file
-        let mut spec = ComputeSpec {
-            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
-            format_version: 1.0,
-            operation_uuid: None,
-            features: self.features.clone(),
-            swap_size_bytes: None,
-            disk_quota_bytes: None,
-            disable_lfc_resizing: None,
-            cluster: Cluster {
-                cluster_id: None, // project ID: not used
-                name: None,       // project name: not used
-                state: None,
-                roles: if create_test_user {
-                    vec![Role {
+        // Create config file
+        let config = {
+            let mut spec = ComputeSpec {
+                skip_pg_catalog_updates: self.skip_pg_catalog_updates,
+                format_version: 1.0,
+                operation_uuid: None,
+                features: self.features.clone(),
+                swap_size_bytes: None,
+                disk_quota_bytes: None,
+                disable_lfc_resizing: None,
+                cluster: Cluster {
+                    cluster_id: None, // project ID: not used
+                    name: None,       // project name: not used
+                    state: None,
+                    roles: if create_test_user {
+                        vec![Role {
+                            name: PgIdent::from_str("test").unwrap(),
+                            encrypted_password: None,
+                            options: None,
+                        }]
+                    } else {
+                        Vec::new()
+                    },
+                    databases: if create_test_user {
+                        vec![Database {
+                            name: PgIdent::from_str("neondb").unwrap(),
+                            owner: PgIdent::from_str("test").unwrap(),
+                            options: None,
+                            restrict_conn: false,
+                            invalid: false,
+                        }]
+                    } else {
+                        Vec::new()
+                    },
+                    settings: None,
+                    postgresql_conf: Some(postgresql_conf.clone()),
+                },
+                delta_operations: None,
+                tenant_id: Some(self.tenant_id),
+                timeline_id: Some(self.timeline_id),
+                project_id: None,
+                branch_id: None,
+                endpoint_id: Some(self.endpoint_id.clone()),
+                mode: self.mode,
+                pageserver_connstring: Some(pageserver_connstring),
+                safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
+                safekeeper_connstrings,
+                storage_auth_token: auth_token.clone(),
+                remote_extensions,
+                pgbouncer_settings: None,
+                shard_stripe_size: Some(shard_stripe_size),
+                local_proxy_config: None,
+                reconfigure_concurrency: self.reconfigure_concurrency,
+                drop_subscriptions_before_start: self.drop_subscriptions_before_start,
+                audit_log_level: ComputeAudit::Disabled,
+                logs_export_host: None::<String>,
+            };
+
+            // this strange code is needed to support respec() in tests
+            if self.cluster.is_some() {
+                debug!("Cluster is already set in the endpoint spec, using it");
+                spec.cluster = self.cluster.clone().unwrap();
+
+                debug!("spec.cluster {:?}", spec.cluster);
+
+                // fill missing fields again
+                if create_test_user {
+                    spec.cluster.roles.push(Role {
                        name: PgIdent::from_str("test").unwrap(),
                        encrypted_password: None,
                        options: None,
-                    }]
-                } else {
-                    Vec::new()
-                },
-                databases: if create_test_user {
-                    vec![Database {
+                    });
+                    spec.cluster.databases.push(Database {
                        name: PgIdent::from_str("neondb").unwrap(),
                        owner: PgIdent::from_str("test").unwrap(),
                        options: None,
                        restrict_conn: false,
                        invalid: false,
-                    }]
-                } else {
-                    Vec::new()
-                },
-                settings: None,
-                postgresql_conf: Some(postgresql_conf.clone()),
-            },
-            delta_operations: None,
-            tenant_id: Some(self.tenant_id),
-            timeline_id: Some(self.timeline_id),
-            mode: self.mode,
-            pageserver_connstring: Some(pageserver_connstring),
-            safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
-            safekeeper_connstrings,
-            storage_auth_token: auth_token.clone(),
-            remote_extensions,
-            pgbouncer_settings: None,
-            shard_stripe_size: Some(shard_stripe_size),
-            local_proxy_config: None,
-            reconfigure_concurrency: self.reconfigure_concurrency,
-            drop_subscriptions_before_start: self.drop_subscriptions_before_start,
-            audit_log_level: ComputeAudit::Disabled,
-            logs_export_host: None::<String>,
+                    });
+                }
+                spec.cluster.postgresql_conf = Some(postgresql_conf);
+            }
+
+            ComputeConfig {
+                spec: Some(spec),
+                compute_ctl_config: self.compute_ctl_config.clone(),
+            }
        };

-        // this strange code is needed to support respec() in tests
-        if self.cluster.is_some() {
-            debug!("Cluster is already set in the endpoint spec, using it");
-            spec.cluster = self.cluster.clone().unwrap();
-
-            debug!("spec.cluster {:?}", spec.cluster);
-
-            // fill missing fields again
-            if create_test_user {
-                spec.cluster.roles.push(Role {
-                    name: PgIdent::from_str("test").unwrap(),
-                    encrypted_password: None,
-                    options: None,
-                });
-                spec.cluster.databases.push(Database {
-                    name: PgIdent::from_str("neondb").unwrap(),
-                    owner: PgIdent::from_str("test").unwrap(),
-                    options: None,
-                    restrict_conn: false,
-                    invalid: false,
-                });
-            }
-            spec.cluster.postgresql_conf = Some(postgresql_conf);
-        }
-
+        // TODO(tristan957): Remove the write to spec.json after compatibility
+        // tests work themselves out
        let spec_path = self.endpoint_path().join("spec.json");
-        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
+        std::fs::write(spec_path, serde_json::to_string_pretty(&config.spec)?)?;
+        let config_path = self.endpoint_path().join("config.json");
+        std::fs::write(config_path, serde_json::to_string_pretty(&config)?)?;

        // Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it.
        let logfile = std::fs::OpenOptions::new()
@@ -707,6 +779,16 @@ impl Endpoint {
            .append(true)
            .open(self.endpoint_path().join("compute.log"))?;

+        // TODO(tristan957): Remove when compatibility tests are no longer an
+        // issue
+        let old_compute_ctl = {
+            let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
+            let help_output = cmd.arg("--help").output()?;
+            let help_output = String::from_utf8_lossy(&help_output.stdout);
+
+            !help_output.contains("--config")
+        };
+
        // Launch compute_ctl
        let conn_str = self.connstr("cloud_admin", "postgres");
        println!("Starting postgres node at '{}'", conn_str);
@@ -725,9 +807,18 @@ impl Endpoint {
        ])
        .args(["--pgdata", self.pgdata().to_str().unwrap()])
        .args(["--connstr", &conn_str])
+        // TODO(tristan957): Change this to --config when compatibility tests
+        // are no longer an issue
        .args([
            "--spec-path",
-            self.endpoint_path().join("spec.json").to_str().unwrap(),
+            self.endpoint_path()
+                .join(if old_compute_ctl {
+                    "spec.json"
+                } else {
+                    "config.json"
+                })
+                .to_str()
+                .unwrap(),
        ])
        .args([
            "--pgbin",
@@ -739,16 +830,7 @@ impl Endpoint {
        ])
        // TODO: It would be nice if we generated compute IDs with the same
        // algorithm as the real control plane.
-        .args([
-            "--compute-id",
-            &format!(
-                "compute-{}",
-                SystemTime::now()
-                    .duration_since(UNIX_EPOCH)
-                    .unwrap()
-                    .as_secs()
-            ),
-        ])
+        .args(["--compute-id", &self.endpoint_id])
        .stdin(std::process::Stdio::null())
        .stderr(logfile.try_clone()?)
        .stdout(logfile);
@@ -846,6 +928,7 @@ impl Endpoint {
                    self.external_http_address.port()
                ),
            )
+            .bearer_auth(self.generate_jwt()?)
            .send()
            .await?;

@@ -870,10 +953,12 @@ impl Endpoint {
        stripe_size: Option<ShardStripeSize>,
        safekeepers: Option<Vec<NodeId>>,
    ) -> Result<()> {
-        let mut spec: ComputeSpec = {
-            let spec_path = self.endpoint_path().join("spec.json");
-            let file = std::fs::File::open(spec_path)?;
-            serde_json::from_reader(file)?
+        let (mut spec, compute_ctl_config) = {
+            let config_path = self.endpoint_path().join("config.json");
+            let file = std::fs::File::open(config_path)?;
+            let config: ComputeConfig = serde_json::from_reader(file)?;
+
+            (config.spec.unwrap(), config.compute_ctl_config)
        };

        let postgresql_conf = self.read_postgresql_conf()?;
@@ -920,10 +1005,11 @@ impl Endpoint {
                self.external_http_address.port()
            ))
            .header(CONTENT_TYPE.as_str(), "application/json")
+            .bearer_auth(self.generate_jwt()?)
            .body(
                serde_json::to_string(&ConfigurationRequest {
                    spec,
-                    compute_ctl_config: ComputeCtlConfig::default(),
+                    compute_ctl_config,
                })
                .unwrap(),
            )
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -12,6 +12,7 @@ use std::{env, fs};

 use anyhow::{Context, bail};
 use clap::ValueEnum;
+use pem::Pem;
 use postgres_backend::AuthType;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
@@ -56,6 +57,7 @@ pub struct LocalEnv {

    // used to issue tokens during e.g pg start
    pub private_key_path: PathBuf,
+    /// Path to environment's public key
    pub public_key_path: PathBuf,

    pub broker: NeonBroker,
@@ -758,11 +760,11 @@ impl LocalEnv {

    // this function is used only for testing purposes in CLI e g generate tokens during init
    pub fn generate_auth_token<S: Serialize>(&self, claims: &S) -> anyhow::Result<String> {
-        let private_key_path = self.get_private_key_path();
-        let key_data = fs::read(private_key_path)?;
-        encode_from_key_file(claims, &key_data)
+        let key = self.read_private_key()?;
+        encode_from_key_file(claims, &key)
    }

+    /// Get the path to the private key.
    pub fn get_private_key_path(&self) -> PathBuf {
        if self.private_key_path.is_absolute() {
            self.private_key_path.to_path_buf()
@@ -771,6 +773,29 @@ impl LocalEnv {
        }
    }

+    /// Get the path to the public key; a relative path is resolved against `base_data_dir`.
+    pub fn get_public_key_path(&self) -> PathBuf {
+        if self.public_key_path.is_absolute() {
+            self.public_key_path.to_path_buf()
+        } else {
+            self.base_data_dir.join(&self.public_key_path)
+        }
+    }
+
+    /// Read the private key file and parse it as PEM.
+    pub fn read_private_key(&self) -> anyhow::Result<Pem> {
+        let private_key_path = self.get_private_key_path();
+        let pem = pem::parse(fs::read(private_key_path)?)?;
+        Ok(pem)
+    }
+
+    /// Read the public key file and parse it as PEM.
+    pub fn read_public_key(&self) -> anyhow::Result<Pem> {
+        let public_key_path = self.get_public_key_path();
+        let pem = pem::parse(fs::read(public_key_path)?)?;
+        Ok(pem)
+    }
+
    /// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`].
    pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> {
        let base_path = base_path();
@@ -956,6 +981,7 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
            String::from_utf8_lossy(&keygen_output.stderr)
        );
    }
+
    // Extract the public key from the private key file
    //
    // openssl pkey -in auth_private_key.pem -pubout -out auth_public_key.pem
@@ -972,6 +998,7 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
            String::from_utf8_lossy(&keygen_output.stderr)
        );
    }
+
    Ok(())
 }

@@ -980,7 +1007,7 @@ fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()>
    // -out rootCA.crt -keyout rootCA.key
    let keygen_output = Command::new("openssl")
        .args([
-            "req", "-x509", "-newkey", "rsa:2048", "-nodes", "-days", "36500",
+            "req", "-x509", "-newkey", "ed25519", "-nodes", "-days", "36500",
        ])
        .args(["-subj", "/CN=Neon Local CA"])
        .args(["-out", cert_path.to_str().unwrap()])
@@ -1010,7 +1037,7 @@ fn generate_ssl_cert(
    // -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1"
    let keygen_output = Command::new("openssl")
        .args(["req", "-new", "-nodes"])
-        .args(["-newkey", "rsa:2048"])
+        .args(["-newkey", "ed25519"])
        .args(["-subj", "/CN=localhost"])
        .args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"])
        .args(["-keyout", key_path.to_str().unwrap()])
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -413,6 +413,11 @@ impl PageServerNode {
                .map(serde_json::from_str)
                .transpose()
                .context("Failed to parse 'compaction_algorithm' json")?,
+            compaction_shard_ancestor: settings
+                .remove("compaction_shard_ancestor")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'compaction_shard_ancestor' as a bool")?,
            compaction_l0_first: settings
                .remove("compaction_l0_first")
                .map(|x| x.parse::<bool>())
@@ -535,6 +540,11 @@ impl PageServerNode {
                .map(|x| x.parse::<bool>())
                .transpose()
                .context("Failed to parse 'gc_compaction_enabled' as bool")?,
+            gc_compaction_verification: settings
+                .remove("gc_compaction_verification")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'gc_compaction_verification' as bool")?,
            gc_compaction_initial_threshold_kb: settings
                .remove("gc_compaction_initial_threshold_kb")
                .map(|x| x.parse::<u64>())
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -13,9 +13,12 @@ use pageserver_api::controller_api::{
    NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
    TenantCreateResponse, TenantLocateResponse,
 };
-use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo};
+use pageserver_api::models::{
+    TenantConfig, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
+};
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
+use pem::Pem;
 use postgres_backend::AuthType;
 use reqwest::{Certificate, Method};
 use serde::de::DeserializeOwned;
@@ -32,8 +35,8 @@ use crate::local_env::{LocalEnv, NeonStorageControllerConf};

 pub struct StorageController {
    env: LocalEnv,
-    private_key: Option<Vec<u8>>,
-    public_key: Option<String>,
+    private_key: Option<Pem>,
+    public_key: Option<Pem>,
    client: reqwest::Client,
    config: NeonStorageControllerConf,

@@ -82,7 +85,8 @@ impl NeonStorageControllerStopArgs {
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
    pub node_id: Option<NodeId>,
-    pub generation_override: Option<i32>,
+    pub generation_override: Option<i32>, // only new tenants
+    pub config: Option<TenantConfig>,     // only new tenants
 }

 #[derive(Serialize, Deserialize)]
@@ -113,7 +117,9 @@ impl StorageController {
            AuthType::Trust => (None, None),
            AuthType::NeonJWT => {
                let private_key_path = env.get_private_key_path();
-                let private_key = fs::read(private_key_path).expect("failed to read private key");
+                let private_key =
+                    pem::parse(fs::read(private_key_path).expect("failed to read private key"))
+                        .expect("failed to parse PEM file");

                // If pageserver auth is enabled, this implicitly enables auth for this service,
                // using the same credentials.
@@ -135,9 +141,13 @@ impl StorageController {
                        .expect("Empty key dir")
                        .expect("Error reading key dir");

-                    std::fs::read_to_string(dent.path()).expect("Can't read public key")
+                    pem::parse(std::fs::read_to_string(dent.path()).expect("Can't read public key"))
+                        .expect("Failed to parse PEM file")
                } else {
-                    std::fs::read_to_string(&public_key_path).expect("Can't read public key")
+                    pem::parse(
+                        std::fs::read_to_string(&public_key_path).expect("Can't read public key"),
+                    )
+                    .expect("Failed to parse PEM file")
                };
                (Some(private_key), Some(public_key))
            }
@@ -805,6 +815,7 @@ impl StorageController {
            tenant_shard_id,
            node_id: Some(pageserver_id),
            generation_override: None,
+            config: None,
        };

        let response = self
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -11,8 +11,8 @@ generate_id() {

 PG_VERSION=${PG_VERSION:-14}

-SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
-SPEC_FILE=/tmp/spec.json
+CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
+CONFIG_FILE=/tmp/config.json

 echo "Waiting pageserver become ready."
 while ! nc -z pageserver 6400; do
@@ -20,7 +20,7 @@ while ! nc -z pageserver 6400; do
 done
 echo "Page server is ready."

-cp ${SPEC_FILE_ORG} ${SPEC_FILE}
+cp ${CONFIG_FILE_ORG} ${CONFIG_FILE}

 if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then
   tenant_id=${TENANT_ID}
@@ -73,17 +73,27 @@ else
  ulid_extension=ulid
 fi
 echo "Adding pgx_ulid"
-shared_libraries=$(jq -r '.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${SPEC_FILE})
-sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${SPEC_FILE}
+shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE})
+sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE}
 echo "Overwrite tenant id and timeline id in spec file"
-sed -i "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE}
-sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}
+sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE}
+sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE}

-cat ${SPEC_FILE}
+cat ${CONFIG_FILE}
+
+# TODO(tristan957): Remove these workarounds for backwards compatibility after
+# the next compute release. That includes these next few lines and the
+# --spec-path in the compute_ctl invocation.
+if compute_ctl --help | grep --quiet -- '--config'; then
+  SPEC_PATH="$CONFIG_FILE"
+else
+  jq '.spec' < "$CONFIG_FILE" > /tmp/spec.json
+  SPEC_PATH=/tmp/spec.json
+fi

 echo "Start compute node"
 /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
     -C "postgresql://cloud_admin@localhost:55433/postgres"  \
     -b /usr/local/bin/postgres                              \
     --compute-id "compute-$RANDOM"                          \
-     -S ${SPEC_FILE}
+     --spec-path "$SPEC_PATH"
--- a/docker-compose/compute_wrapper/var/db/postgres/configs/config.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json
@@ -0,0 +1,148 @@
+{
+    "spec": {
+        "format_version": 1.0,
+
+        "timestamp": "2022-10-12T18:00:00.000Z",
+        "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
+
+        "cluster": {
+            "cluster_id": "docker_compose",
+            "name": "docker_compose_test",
+            "state": "restarted",
+            "roles": [
+                {
+                    "name": "cloud_admin",
+                    "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
+                    "options": null
+                }
+            ],
+            "databases": [
+            ],
+            "settings": [
+                {
+                    "name": "fsync",
+                    "value": "off",
+                    "vartype": "bool"
+                },
+                {
+                    "name": "wal_level",
+                    "value": "logical",
+                    "vartype": "enum"
+                },
+                {
+                    "name": "wal_log_hints",
+                    "value": "on",
+                    "vartype": "bool"
+                },
+                {
+                    "name": "log_connections",
+                    "value": "on",
+                    "vartype": "bool"
+                },
+                {
+                    "name": "port",
+                    "value": "55433",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "shared_buffers",
+                    "value": "1MB",
+                    "vartype": "string"
+                },
+                {
+                    "name": "max_connections",
+                    "value": "100",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "listen_addresses",
+                    "value": "0.0.0.0",
+                    "vartype": "string"
+                },
+                {
+                    "name": "max_wal_senders",
+                    "value": "10",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "max_replication_slots",
+                    "value": "10",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "wal_sender_timeout",
+                    "value": "5s",
+                    "vartype": "string"
+                },
+                {
+                    "name": "wal_keep_size",
+                    "value": "0",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "password_encryption",
+                    "value": "md5",
+                    "vartype": "enum"
+                },
+                {
+                    "name": "restart_after_crash",
+                    "value": "off",
+                    "vartype": "bool"
+                },
+                {
+                    "name": "synchronous_standby_names",
+                    "value": "walproposer",
+                    "vartype": "string"
+                },
+                {
+                    "name": "shared_preload_libraries",
+                    "value": "neon,pg_cron,timescaledb,pg_stat_statements",
+                    "vartype": "string"
+                },
+                {
+                    "name": "neon.safekeepers",
+                    "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
+                    "vartype": "string"
+                },
+                {
+                    "name": "neon.timeline_id",
+                    "value": "TIMELINE_ID",
+                    "vartype": "string"
+                },
+                {
+                    "name": "neon.tenant_id",
+                    "value": "TENANT_ID",
+                    "vartype": "string"
+                },
+                {
+                    "name": "neon.pageserver_connstring",
+                    "value": "host=pageserver port=6400",
+                    "vartype": "string"
+                },
+                {
+                    "name": "max_replication_write_lag",
+                    "value": "500MB",
+                    "vartype": "string"
+                },
+                {
+                    "name": "max_replication_flush_lag",
+                    "value": "10GB",
+                    "vartype": "string"
+                },
+                {
+                    "name": "cron.database",
+                    "value": "postgres",
+                    "vartype": "string"
+                }
+            ]
+        },
+
+        "delta_operations": [
+        ]
+    },
+    "compute_ctl_config": {
+        "jwks": {
+            "keys": []
+        }
+    }
+}
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
@@ -1,141 +0,0 @@
-{
-    "format_version": 1.0,
-
-    "timestamp": "2022-10-12T18:00:00.000Z",
-    "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
-
-    "cluster": {
-        "cluster_id": "docker_compose",
-        "name": "docker_compose_test",
-        "state": "restarted",
-        "roles": [
-            {
-                "name": "cloud_admin",
-                "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
-                "options": null
-            }
-        ],
-        "databases": [
-        ],
-        "settings": [
-            {
-                "name": "fsync",
-                "value": "off",
-                "vartype": "bool"
-            },
-            {
-                "name": "wal_level",
-                "value": "logical",
-                "vartype": "enum"
-            },
-            {
-                "name": "wal_log_hints",
-                "value": "on",
-                "vartype": "bool"
-            },
-            {
-                "name": "log_connections",
-                "value": "on",
-                "vartype": "bool"
-            },
-            {
-                "name": "port",
-                "value": "55433",
-                "vartype": "integer"
-            },
-            {
-                "name": "shared_buffers",
-                "value": "1MB",
-                "vartype": "string"
-            },
-            {
-                "name": "max_connections",
-                "value": "100",
-                "vartype": "integer"
-            },
-            {
-                "name": "listen_addresses",
-                "value": "0.0.0.0",
-                "vartype": "string"
-            },
-            {
-                "name": "max_wal_senders",
-                "value": "10",
-                "vartype": "integer"
-            },
-            {
-                "name": "max_replication_slots",
-                "value": "10",
-                "vartype": "integer"
-            },
-            {
-                "name": "wal_sender_timeout",
-                "value": "5s",
-                "vartype": "string"
-            },
-            {
-                "name": "wal_keep_size",
-                "value": "0",
-                "vartype": "integer"
-            },
-            {
-                "name": "password_encryption",
-                "value": "md5",
-                "vartype": "enum"
-            },
-            {
-                "name": "restart_after_crash",
-                "value": "off",
-                "vartype": "bool"
-            },
-            {
-                "name": "synchronous_standby_names",
-                "value": "walproposer",
-                "vartype": "string"
-            },
-            {
-                "name": "shared_preload_libraries",
-                "value": "neon,pg_cron,timescaledb,pg_stat_statements",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.safekeepers",
-                "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.timeline_id",
-                "value": "TIMELINE_ID",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.tenant_id",
-                "value": "TENANT_ID",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.pageserver_connstring",
-                "value": "host=pageserver port=6400",
-                "vartype": "string"
-            },
-            {
-                "name": "max_replication_write_lag",
-                "value": "500MB",
-                "vartype": "string"
-            },
-            {
-                "name": "max_replication_flush_lag",
-                "value": "10GB",
-                "vartype": "string"
-            },
-            {
-                "name": "cron.database",
-                "value": "postgres",
-                "vartype": "string"
-            }
-        ]
-    },
-
-    "delta_operations": [
-    ]
-}
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -159,7 +159,7 @@ services:
      #- RUST_BACKTRACE=1
    # Mount the test files directly, for faster editing cycle.
    volumes:
-      - ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
+      - ./compute_wrapper/var/db/postgres/configs/:/var/db/postgres/configs/
      - ./compute_wrapper/shell/:/shell/
    ports:
      - 55433:55433 # pg protocol handler
--- a/docker-compose/ext-src/pg_jsonschema-src/Makefile
+++ b/docker-compose/ext-src/pg_jsonschema-src/Makefile
@@ -0,0 +1,8 @@
+EXTENSION = pg_jsonschema
+DATA = pg_jsonschema--1.0.sql
+REGRESS = jsonschema_valid_api  jsonschema_edge_cases
+REGRESS_OPTS = --load-extension=pg_jsonschema
+
+PG_CONFIG ?= pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
--- a/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_edge_cases.out
+++ b/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_edge_cases.out
@@ -0,0 +1,87 @@
+-- Schema with enums, nulls, extra properties disallowed
+SELECT jsonschema_is_valid('{
+  "type": "object",
+  "properties": {
+    "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+    "email": { "type": ["string", "null"], "format": "email" }
+  },
+  "required": ["status"],
+  "additionalProperties": false
+}'::json);
+ jsonschema_is_valid 
+---------------------
+ t
+(1 row)
+
+-- Valid enum and null email
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "active", "email": null}'::json
+);
+ jsonschema_validation_errors 
+------------------------------
+ {}
+(1 row)
+
+-- Invalid enum value
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "disabled", "email": null}'::json
+);
+                     jsonschema_validation_errors                     
+----------------------------------------------------------------------
+ {"\"disabled\" is not one of [\"active\",\"inactive\",\"pending\"]"}
+(1 row)
+
+-- Invalid email format (assuming format is validated)
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "active", "email": "not-an-email"}'::json
+);
+      jsonschema_validation_errors       
+-----------------------------------------
+ {"\"not-an-email\" is not a \"email\""}
+(1 row)
+
+-- Extra property not allowed
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "active", "extra": "should not be here"}'::json
+);
+                    jsonschema_validation_errors                    
+--------------------------------------------------------------------
+ {"Additional properties are not allowed ('extra' was unexpected)"}
+(1 row)
+
--- a/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_valid_api.out
+++ b/docker-compose/ext-src/pg_jsonschema-src/expected/jsonschema_valid_api.out
@@ -0,0 +1,65 @@
+-- Define schema
+SELECT jsonschema_is_valid('{
+  "type": "object",
+  "properties": {
+    "username": { "type": "string" },
+    "age": { "type": "integer" }
+  },
+  "required": ["username"]
+}'::json);
+ jsonschema_is_valid 
+---------------------
+ t
+(1 row)
+
+-- Valid instance
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "username": { "type": "string" },
+      "age": { "type": "integer" }
+    },
+    "required": ["username"]
+  }'::json,
+  '{"username": "alice", "age": 25}'::json
+);
+ jsonschema_validation_errors 
+------------------------------
+ {}
+(1 row)
+
+-- Invalid instance: missing required "username"
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "username": { "type": "string" },
+      "age": { "type": "integer" }
+    },
+    "required": ["username"]
+  }'::json,
+  '{"age": 25}'::json
+);
+      jsonschema_validation_errors       
+-----------------------------------------
+ {"\"username\" is a required property"}
+(1 row)
+
+-- Invalid instance: wrong type for "age"
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "username": { "type": "string" },
+      "age": { "type": "integer" }
+    },
+    "required": ["username"]
+  }'::json,
+  '{"username": "bob", "age": "twenty"}'::json
+);
+       jsonschema_validation_errors        
+-------------------------------------------
+ {"\"twenty\" is not of type \"integer\""}
+(1 row)
+
--- a/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_edge_cases.sql
+++ b/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_edge_cases.sql
@@ -0,0 +1,66 @@
+-- Schema with enums, nulls, extra properties disallowed
+SELECT jsonschema_is_valid('{
+  "type": "object",
+  "properties": {
+    "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+    "email": { "type": ["string", "null"], "format": "email" }
+  },
+  "required": ["status"],
+  "additionalProperties": false
+}'::json);
+
+-- Valid enum and null email
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "active", "email": null}'::json
+);
+
+-- Invalid enum value
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "disabled", "email": null}'::json
+);
+
+-- Invalid email format (assuming format is validated)
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "active", "email": "not-an-email"}'::json
+);
+
+-- Extra property not allowed
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "status": { "type": "string", "enum": ["active", "inactive", "pending"] },
+      "email": { "type": ["string", "null"], "format": "email" }
+    },
+    "required": ["status"],
+    "additionalProperties": false
+  }'::json,
+  '{"status": "active", "extra": "should not be here"}'::json
+);
--- a/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_valid_api.sql
+++ b/docker-compose/ext-src/pg_jsonschema-src/sql/jsonschema_valid_api.sql
@@ -0,0 +1,48 @@
+-- Define schema
+SELECT jsonschema_is_valid('{
+  "type": "object",
+  "properties": {
+    "username": { "type": "string" },
+    "age": { "type": "integer" }
+  },
+  "required": ["username"]
+}'::json);
+
+-- Valid instance
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "username": { "type": "string" },
+      "age": { "type": "integer" }
+    },
+    "required": ["username"]
+  }'::json,
+  '{"username": "alice", "age": 25}'::json
+);
+
+-- Invalid instance: missing required "username"
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "username": { "type": "string" },
+      "age": { "type": "integer" }
+    },
+    "required": ["username"]
+  }'::json,
+  '{"age": 25}'::json
+);
+
+-- Invalid instance: wrong type for "age"
+SELECT jsonschema_validation_errors(
+  '{
+    "type": "object",
+    "properties": {
+      "username": { "type": "string" },
+      "age": { "type": "integer" }
+    },
+    "required": ["username"]
+  }'::json,
+  '{"username": "bob", "age": "twenty"}'::json
+);
--- a/docker-compose/ext-src/pg_session_jwt-src/Makefile
+++ b/docker-compose/ext-src/pg_session_jwt-src/Makefile
@@ -0,0 +1,9 @@
+EXTENSION = pg_session_jwt
+
+REGRESS = basic_functions
+REGRESS_OPTS = --load-extension=$(EXTENSION)
+export PGOPTIONS = -c pg_session_jwt.jwk={"crv":"Ed25519","kty":"OKP","x":"R_Abz-63zJ00l-IraL5fQhwkhGVZCSooQFV5ntC3C7M"}
+
+PG_CONFIG ?= pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
--- a/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out
+++ b/docker-compose/ext-src/pg_session_jwt-src/expected/basic_functions.out
@@ -0,0 +1,35 @@
+-- Basic functionality tests for pg_session_jwt
+-- Test auth.init() function
+SELECT auth.init();
+ init 
+------
+ 
+(1 row)
+
+-- Test an invalid JWT
+SELECT auth.jwt_session_init('INVALID-JWT');
+ERROR:  invalid JWT encoding
+-- Test creating a session with an expired JWT
+SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw');
+ERROR:  Token used after it has expired
+-- Test creating a session with a valid JWT
+SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg');
+ jwt_session_init 
+------------------
+ 
+(1 row)
+
+-- Test auth.session() function
+SELECT auth.session();
+                                 session                                 
+-------------------------------------------------------------------------
+ {"exp": 4896164252, "iat": 1742564252, "jti": 434343, "sub": "user123"}
+(1 row)
+
+-- Test auth.user_id() function
+SELECT auth.user_id() AS user_id;
+ user_id 
+---------
+ user123
+(1 row)
+
--- a/docker-compose/ext-src/pg_session_jwt-src/sql/basic_functions.sql
+++ b/docker-compose/ext-src/pg_session_jwt-src/sql/basic_functions.sql
@@ -0,0 +1,19 @@
+-- Basic functionality tests for pg_session_jwt
+
+-- Test auth.init() function
+SELECT auth.init();
+
+-- Test an invalid JWT
+SELECT auth.jwt_session_init('INVALID-JWT');
+
+-- Test creating a session with an expired JWT
+SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjE3NDI1NjQ0MzIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MjQyNDIsInN1YiI6InVzZXIxMjMifQ.A6FwKuaSduHB9O7Gz37g0uoD_U9qVS0JNtT7YABGVgB7HUD1AMFc9DeyhNntWBqncg8k5brv-hrNTuUh5JYMAw');
+
+-- Test creating a session with a valid JWT
+SELECT auth.jwt_session_init('eyJhbGciOiJFZERTQSJ9.eyJleHAiOjQ4OTYxNjQyNTIsImlhdCI6MTc0MjU2NDI1MiwianRpIjo0MzQzNDMsInN1YiI6InVzZXIxMjMifQ.2TXVgjb6JSUq6_adlvp-m_SdOxZSyGS30RS9TLB0xu2N83dMSs2NybwE1NMU8Fb0tcAZR_ET7M2rSxbTrphfCg');
+
+-- Test auth.session() function
+SELECT auth.session();
+
+-- Test auth.user_id() function
+SELECT auth.user_id() AS user_id;
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -14,6 +14,32 @@ pub struct GenericAPIError {
    pub error: String,
 }

+/// All configuration parameters necessary for a compute. When
+/// [`ComputeConfig::spec`] is provided, it means that the compute is attached
+/// to a tenant. [`ComputeConfig::compute_ctl_config`] will always be provided
+/// and contains parameters necessary for operating `compute_ctl` independently
+/// of whether a tenant is attached to the compute or not.
+///
+/// This also happens to be the body of `compute_ctl`'s /configure request.
+#[derive(Debug, Deserialize, Serialize)]
+pub struct ComputeConfig {
+    /// The compute spec
+    pub spec: Option<ComputeSpec>,
+
+    /// The compute_ctl configuration
+    #[allow(dead_code)]
+    pub compute_ctl_config: ComputeCtlConfig,
+}
+
+impl From<ControlPlaneConfigResponse> for ComputeConfig {
+    fn from(value: ControlPlaneConfigResponse) -> Self {
+        Self {
+            spec: value.spec,
+            compute_ctl_config: value.compute_ctl_config,
+        }
+    }
+}
+
 #[derive(Debug, Clone, Serialize)]
 pub struct ExtensionInstallResponse {
    pub extension: PgIdent,
@@ -134,7 +160,7 @@ pub struct CatalogObjects {
    pub databases: Vec<Database>,
 }

-#[derive(Clone, Debug, Deserialize, Serialize)]
+#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
 pub struct ComputeCtlConfig {
    /// Set of JSON web keys that the compute can use to authenticate
    /// communication from the control plane.
@@ -153,7 +179,7 @@ impl Default for ComputeCtlConfig {
    }
 }

-#[derive(Clone, Debug, Deserialize, Serialize)]
+#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
 pub struct TlsConfig {
    pub key_path: String,
    pub cert_path: String,
@@ -161,7 +187,7 @@ pub struct TlsConfig {

 /// Response of the `/computes/{compute_id}/spec` control-plane API.
 #[derive(Deserialize, Debug)]
-pub struct ControlPlaneSpecResponse {
+pub struct ControlPlaneConfigResponse {
    pub spec: Option<ComputeSpec>,
    pub status: ControlPlaneComputeStatus,
    pub compute_ctl_config: ComputeCtlConfig,
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -1,8 +1,8 @@
-//! `ComputeSpec` represents the contents of the spec.json file.
-//!
-//! The spec.json file is used to pass information to 'compute_ctl'. It contains
-//! all the information needed to start up the right version of PostgreSQL,
-//! and connect it to the storage nodes.
+//! The ComputeSpec contains all the information needed to start up
+//! the right version of PostgreSQL, and connect it to the storage nodes.
+//! It can be passed as part of the `config.json`, or the control plane can
+//! provide it by calling the compute_ctl's `/configure` endpoint, or
+//! compute_ctl can fetch it by calling the control plane's API.
 use std::collections::HashMap;

 use indexmap::IndexMap;
@@ -104,6 +104,12 @@ pub struct ComputeSpec {
    pub timeline_id: Option<TimelineId>,
    pub pageserver_connstring: Option<String>,

+    // More neon ids that we expose to the compute_ctl
+    // and to postgres as neon extension GUCs.
+    pub project_id: Option<String>,
+    pub branch_id: Option<String>,
+    pub endpoint_id: Option<String>,
+
    /// Safekeeper membership config generation. It is put in
    /// neon.safekeepers GUC and serves two purposes:
    /// 1) Non zero value forces walproposer to use membership configurations.
@@ -159,13 +165,7 @@ pub struct ComputeSpec {
    #[serde(default)] // Default false
    pub drop_subscriptions_before_start: bool,

-    /// Log level for audit logging:
-    ///
-    /// Disabled - no audit logging. This is the default.
-    /// log - log masked statements to the postgres log using pgaudit extension
-    /// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension
-    ///
-    /// Extensions should be present in shared_preload_libraries
+    /// Log level for compute audit logging
    #[serde(default)]
    pub audit_log_level: ComputeAudit,

@@ -289,14 +289,25 @@ impl ComputeMode {
 }

 /// Log level for audit logging
-/// Disabled, log, hipaa
-/// Default is Disabled
 #[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeAudit {
    #[default]
    Disabled,
+    // Deprecated, use Base instead
    Log,
+    // (pgaudit.log = 'ddl', pgaudit.log_parameter='off')
+    // logged to the standard postgresql log stream
+    Base,
+    // Deprecated, use Full or Extended instead
    Hipaa,
+    // (pgaudit.log = 'all, -misc', pgaudit.log_parameter='off')
+    // logged to separate files collected by rsyslog
+    // into dedicated log storage with strict access
+    Extended,
+    // (pgaudit.log='all', pgaudit.log_parameter='on'),
+    // logged to separate files collected by rsyslog
+    // into dedicated log storage with strict access.
+    Full,
 }

 #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
--- a/libs/http-utils/Cargo.toml
+++ b/libs/http-utils/Cargo.toml
@@ -14,6 +14,7 @@ futures.workspace = true
 hyper0.workspace = true
 itertools.workspace = true
 jemalloc_pprof.workspace = true
+jsonwebtoken.workspace = true
 once_cell.workspace = true
 pprof.workspace = true
 regex.workspace = true
@@ -30,6 +31,7 @@ tokio.workspace = true
 tracing.workspace = true
 url.workspace = true
 uuid.workspace = true
+x509-cert.workspace = true

 # to use tokio channels as streams, this is faster to compile than async_stream
 # why is it only here? no other crate should use it, streams are rarely needed.
--- a/libs/http-utils/src/endpoint.rs
+++ b/libs/http-utils/src/endpoint.rs
@@ -8,6 +8,7 @@ use bytes::{Bytes, BytesMut};
 use hyper::header::{AUTHORIZATION, CONTENT_DISPOSITION, CONTENT_TYPE, HeaderName};
 use hyper::http::HeaderValue;
 use hyper::{Body, Method, Request, Response};
+use jsonwebtoken::TokenData;
 use metrics::{Encoder, IntCounter, TextEncoder, register_int_counter};
 use once_cell::sync::Lazy;
 use pprof::ProfilerGuardBuilder;
@@ -618,7 +619,7 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
                    })?;
                    let token = parse_token(header_value)?;

-                    let data = auth.decode(token).map_err(|err| {
+                    let data: TokenData<Claims> = auth.decode(token).map_err(|err| {
                        warn!("Authentication error: {err}");
                        // Rely on From<AuthError> for ApiError impl
                        err
--- a/libs/http-utils/src/server.rs
+++ b/libs/http-utils/src/server.rs
@@ -4,6 +4,8 @@ use futures::StreamExt;
 use futures::stream::FuturesUnordered;
 use hyper0::Body;
 use hyper0::server::conn::Http;
+use metrics::{IntCounterVec, register_int_counter_vec};
+use once_cell::sync::Lazy;
 use routerify::{RequestService, RequestServiceBuilder};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
@@ -26,6 +28,24 @@ pub struct Server {
    tls_acceptor: Option<TlsAcceptor>,
 }

+static CONNECTION_STARTED_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "http_server_connection_started_total",
+        "Number of established http/https connections",
+        &["scheme"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CONNECTION_ERROR_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "http_server_connection_errors_total",
+        "Number of occurred connection errors by type",
+        &["type"]
+    )
+    .expect("failed to define a metric")
+});
+
 impl Server {
    pub fn new(
        request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
@@ -60,6 +80,15 @@ impl Server {
            false
        }

+        let tcp_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tcp"]);
+        let tls_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tls"]);
+        let http_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["http"]);
+        let https_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["https"]);
+        let panic_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["panic"]);
+
+        let http_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["http"]);
+        let https_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["https"]);
+
        let mut connections = FuturesUnordered::new();
        loop {
            tokio::select! {
@@ -67,6 +96,7 @@ impl Server {
                    let (tcp_stream, remote_addr) = match stream {
                        Ok(stream) => stream,
                        Err(err) => {
+                            tcp_error_cnt.inc();
                            if !suppress_io_error(&err) {
                                info!("Failed to accept TCP connection: {err:#}");
                            }
@@ -78,11 +108,18 @@ impl Server {
                    let tls_acceptor = self.tls_acceptor.clone();
                    let cancel = cancel.clone();

+                    let tls_error_cnt = tls_error_cnt.clone();
+                    let http_error_cnt = http_error_cnt.clone();
+                    let https_error_cnt = https_error_cnt.clone();
+                    let http_connection_cnt = http_connection_cnt.clone();
+                    let https_connection_cnt = https_connection_cnt.clone();
+
                    connections.push(tokio::spawn(
                        async move {
                            match tls_acceptor {
                                Some(tls_acceptor) => {
                                    // Handle HTTPS connection.
+                                    https_connection_cnt.inc();
                                    let tls_stream = tokio::select! {
                                        tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream,
                                        _ = cancel.cancelled() => return,
@@ -90,6 +127,7 @@ impl Server {
                                    let tls_stream = match tls_stream {
                                        Ok(tls_stream) => tls_stream,
                                        Err(err) => {
+                                            tls_error_cnt.inc();
                                            if !suppress_io_error(&err) {
                                                info!(%remote_addr, "Failed to accept TLS connection: {err:#}");
                                            }
@@ -97,6 +135,7 @@ impl Server {
                                        }
                                    };
                                    if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
+                                        https_error_cnt.inc();
                                        if !suppress_hyper_error(&err) {
                                            info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}");
                                        }
@@ -104,7 +143,9 @@ impl Server {
                                }
                                None => {
                                    // Handle HTTP connection.
+                                    http_connection_cnt.inc();
                                    if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
+                                        http_error_cnt.inc();
                                        if !suppress_hyper_error(&err) {
                                            info!(%remote_addr, "Failed to serve HTTP connection: {err:#}");
                                        }
@@ -115,6 +156,7 @@ impl Server {
                 }
                Some(conn) = connections.next() => {
                    if let Err(err) = conn {
+                        panic_error_cnt.inc();
                        error!("Connection panicked: {err:#}");
                    }
                }
@@ -122,6 +164,7 @@ impl Server {
                    // Wait for graceful shutdown of all connections.
                    while let Some(conn) = connections.next().await {
                        if let Err(err) = conn {
+                            panic_error_cnt.inc();
                            error!("Connection panicked: {err:#}");
                        }
                    }
--- a/libs/http-utils/src/tls_certs.rs
+++ b/libs/http-utils/src/tls_certs.rs
@@ -3,11 +3,14 @@ use std::{sync::Arc, time::Duration};
 use anyhow::Context;
 use arc_swap::ArcSwap;
 use camino::Utf8Path;
+use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
+use once_cell::sync::Lazy;
 use rustls::{
-    pki_types::{CertificateDer, PrivateKeyDer},
+    pki_types::{CertificateDer, PrivateKeyDer, UnixTime},
    server::{ClientHello, ResolvesServerCert},
    sign::CertifiedKey,
 };
+use x509_cert::der::Reader;

 pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result<Vec<CertificateDer<'static>>> {
    let cert_data = tokio::fs::read(filename)
@@ -53,6 +56,76 @@ pub async fn load_certified_key(
    Ok(certified_key)
 }

+/// rustls's CertifiedKey with extra parsed fields used for metrics.
+struct ParsedCertifiedKey {
+    certified_key: CertifiedKey,
+    expiration_time: UnixTime,
+}
+
+/// Parse expiration time from an X509 certificate.
+fn parse_expiration_time(cert: &CertificateDer<'_>) -> anyhow::Result<UnixTime> {
+    let parsed_cert = x509_cert::der::SliceReader::new(cert)
+        .context("Failed to parse certificate")?
+        .decode::<x509_cert::Certificate>()
+        .context("Failed to parse certificate")?;
+
+    Ok(UnixTime::since_unix_epoch(
+        parsed_cert
+            .tbs_certificate
+            .validity
+            .not_after
+            .to_unix_duration(),
+    ))
+}
+
+async fn load_and_parse_certified_key(
+    key_filename: &Utf8Path,
+    cert_filename: &Utf8Path,
+) -> anyhow::Result<ParsedCertifiedKey> {
+    let certified_key = load_certified_key(key_filename, cert_filename).await?;
+    let expiration_time = parse_expiration_time(certified_key.end_entity_cert()?)?;
+    Ok(ParsedCertifiedKey {
+        certified_key,
+        expiration_time,
+    })
+}
+
+static CERT_EXPIRATION_TIME: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "tls_certs_expiration_time_seconds",
+        "Expiration time of the loaded certificate since unix epoch in seconds",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_started_total",
+        "Number of certificate reload loop iterations started",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_UPDATED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_updated_total",
+        "Number of times the certificate was updated to the new one",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_FAILED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_failed_total",
+        "Number of times the certificate reload failed",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
 /// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from
 /// the disk periodically.
 #[derive(Debug)]
@@ -63,16 +136,28 @@ pub struct ReloadingCertificateResolver {
 impl ReloadingCertificateResolver {
    /// Creates a new Resolver by loading certificate and private key from FS and
    /// creating tokio::task to reload them with provided reload_period.
+    /// `resolver_name` is used as the value of the `resolver_name` label on this resolver's metrics.
    pub async fn new(
+        resolver_name: &str,
        key_filename: &Utf8Path,
        cert_filename: &Utf8Path,
        reload_period: Duration,
    ) -> anyhow::Result<Arc<Self>> {
+        // Create metrics for current resolver.
+        let cert_expiration_time = CERT_EXPIRATION_TIME.with_label_values(&[resolver_name]);
+        let cert_reload_started_counter =
+            CERT_RELOAD_STARTED_COUNTER.with_label_values(&[resolver_name]);
+        let cert_reload_updated_counter =
+            CERT_RELOAD_UPDATED_COUNTER.with_label_values(&[resolver_name]);
+        let cert_reload_failed_counter =
+            CERT_RELOAD_FAILED_COUNTER.with_label_values(&[resolver_name]);
+
+        let parsed_key = load_and_parse_certified_key(key_filename, cert_filename).await?;
+
        let this = Arc::new(Self {
-            certified_key: ArcSwap::from_pointee(
-                load_certified_key(key_filename, cert_filename).await?,
-            ),
+            certified_key: ArcSwap::from_pointee(parsed_key.certified_key),
        });
+        cert_expiration_time.set(parsed_key.expiration_time.as_secs());

        tokio::spawn({
            let weak_this = Arc::downgrade(&this);
@@ -88,17 +173,22 @@ impl ReloadingCertificateResolver {
                        Some(this) => this,
                        None => break, // Resolver has been destroyed, exit.
                    };
-                    match load_certified_key(&key_filename, &cert_filename).await {
-                        Ok(new_certified_key) => {
-                            if new_certified_key.cert == this.certified_key.load().cert {
+                    cert_reload_started_counter.inc();
+
+                    match load_and_parse_certified_key(&key_filename, &cert_filename).await {
+                        Ok(parsed_key) => {
+                            if parsed_key.certified_key.cert == this.certified_key.load().cert {
                                tracing::debug!("Certificate has not changed since last reloading");
                            } else {
                                tracing::info!("Certificate has been reloaded");
-                                this.certified_key.store(Arc::new(new_certified_key));
+                                this.certified_key.store(Arc::new(parsed_key.certified_key));
+                                cert_expiration_time.set(parsed_key.expiration_time.as_secs());
+                                cert_reload_updated_counter.inc();
                            }
                            last_reload_failed = false;
                        }
                        Err(err) => {
+                            cert_reload_failed_counter.inc();
                            // Note: Reloading certs may fail if it conflicts with the script updating
                            // the files at the same time. Warn only if the error is persistent.
                            if last_reload_failed {
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -76,14 +76,7 @@ pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
    mfs
 }

-static DISK_IO_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
-    register_int_gauge_vec!(
-        "libmetrics_disk_io_bytes_total",
-        "Bytes written and read from disk, grouped by the operation (read|write)",
-        &["io_operation"]
-    )
-    .expect("Failed to register disk i/o bytes int gauge vec")
-});
+

 static MAXRSS_KB: Lazy<IntGauge> = Lazy::new(|| {
    register_int_gauge!(
@@ -261,12 +254,7 @@ const BYTES_IN_BLOCK: i64 = 512;
 fn update_rusage_metrics() {
    let rusage_stats = get_rusage_stats();

-    DISK_IO_BYTES
-        .with_label_values(&["read"])
-        .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
-    DISK_IO_BYTES
-        .with_label_values(&["write"])
-        .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
+    

    // On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669
    #[cfg(target_os = "macos")]
@@ -357,10 +345,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
        self.get_metric_with_label_values(vals).unwrap()
    }

-    pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) {
-        res[0] = self.inc.remove_label_values(vals);
-        res[1] = self.dec.remove_label_values(vals);
-    }
+    
 }

 impl<P: Atomic> GenericCounterPair<P> {
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -180,6 +180,7 @@ pub struct ConfigToml {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub generate_unarchival_heatmap: Option<bool>,
    pub tracing: Option<Tracing>,
+    pub enable_tls_page_service_api: bool,
 }

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -206,6 +207,10 @@ pub struct PageServicePipeliningConfigPipelined {
    /// Causes runtime errors if larger than max get_vectored batch size.
    pub max_batch_size: NonZeroUsize,
    pub execution: PageServiceProtocolPipelinedExecutionStrategy,
+    // The default below is such that new versions of the software can start
+    // with the old configuration.
+    #[serde(default)]
+    pub batching: PageServiceProtocolPipelinedBatchingStrategy,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -215,6 +220,19 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy {
    Tasks,
 }

+#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(rename_all = "kebab-case")]
+pub enum PageServiceProtocolPipelinedBatchingStrategy {
+    /// All get page requests in a batch will be at the same LSN
+    #[default]
+    UniformLsn,
+    /// Get page requests in a batch may be at different LSNs
+    ///
+    /// One key cannot be present more than once at different LSNs in
+    /// the same batch.
+    ScatteredLsn,
+}
+
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case")]
 pub enum GetVectoredConcurrentIo {
@@ -361,6 +379,8 @@ pub struct TenantConfigToml {
    /// size exceeds `compaction_upper_limit * checkpoint_distance`.
    pub compaction_upper_limit: usize,
    pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
+    /// If true, enable shard ancestor compaction (enabled by default).
+    pub compaction_shard_ancestor: bool,
    /// If true, compact down L0 across all tenant timelines before doing regular compaction. L0
    /// compaction must be responsive to avoid read amp during heavy ingestion. Defaults to true.
    pub compaction_l0_first: bool,
@@ -451,6 +471,8 @@ pub struct TenantConfigToml {
    // gc-compaction related configs
    /// Enable automatic gc-compaction trigger on this tenant.
    pub gc_compaction_enabled: bool,
+    /// Enable verification of gc-compaction results.
+    pub gc_compaction_verification: bool,
    /// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold,
    /// gc-compaction will be triggered.
    pub gc_compaction_initial_threshold_kb: u64,
@@ -612,9 +634,12 @@ impl Default for ConfigToml {
            page_service_pipelining: if !cfg!(test) {
                PageServicePipeliningConfig::Serial
            } else {
+                // Do not turn this into the default until scattered reads have been
+                // validated and fully rolled out.
                PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
                    max_batch_size: NonZeroUsize::new(32).unwrap(),
                    execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
+                    batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn,
                })
            },
            get_vectored_concurrent_io: if !cfg!(test) {
@@ -631,6 +656,7 @@ impl Default for ConfigToml {
            load_previous_heatmap: None,
            generate_unarchival_heatmap: None,
            tracing: None,
+            enable_tls_page_service_api: false,
        }
    }
 }
@@ -653,6 +679,7 @@ pub mod tenant_conf_defaults {

    pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
    pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
+    pub const DEFAULT_COMPACTION_SHARD_ANCESTOR: bool = true;

    // This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's
    // 3/4*16=9 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could
@@ -690,6 +717,7 @@ pub mod tenant_conf_defaults {
    // image layers should be created.
    pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
    pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
+    pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
    pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
    pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
 }
@@ -709,6 +737,7 @@ impl Default for TenantConfigToml {
            compaction_algorithm: crate::models::CompactionAlgorithmSettings {
                kind: DEFAULT_COMPACTION_ALGORITHM,
            },
+            compaction_shard_ancestor: DEFAULT_COMPACTION_SHARD_ANCESTOR,
            compaction_l0_first: DEFAULT_COMPACTION_L0_FIRST,
            compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE,
            l0_flush_delay_threshold: None,
@@ -744,6 +773,7 @@ impl Default for TenantConfigToml {
            wal_receiver_protocol_override: None,
            rel_size_v2_enabled: false,
            gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
+            gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION,
            gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
            gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
            sampling_ratio: None,
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -7,7 +7,8 @@ use std::time::{Duration, Instant};
 /// API (`/control/v1` prefix).  Implemented by the server
 /// in [`storage_controller::http`]
 use serde::{Deserialize, Serialize};
-use utils::id::{NodeId, TenantId};
+use utils::id::{NodeId, TenantId, TimelineId};
+use utils::lsn::Lsn;

 use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
 use crate::shard::{ShardStripeSize, TenantShardId};
@@ -499,6 +500,15 @@ pub struct SafekeeperSchedulingPolicyRequest {
    pub scheduling_policy: SkSchedulingPolicy,
 }

+/// Import request for safekeeper timelines.
+#[derive(Serialize, Deserialize, Clone)]
+pub struct TimelineImportRequest {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub start_lsn: Lsn,
+    pub sk_set: Vec<NodeId>,
+}
+
 #[cfg(test)]
 mod test {
    use serde_json;
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -927,7 +927,7 @@ impl Key {

    /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
    #[inline(always)]
-    pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
+    pub fn to_rel_block(self) -> Result<(RelTag, BlockNumber), ToRelBlockError> {
        Ok(match self.field1 {
            0x00 => (
                RelTag {
@@ -938,7 +938,7 @@ impl Key {
                },
                self.field6,
            ),
-            _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
+            _ => return Err(ToRelBlockError(self.field1)),
        })
    }
 }
@@ -951,6 +951,17 @@ impl std::str::FromStr for Key {
    }
 }

+#[derive(Debug)]
+pub struct ToRelBlockError(u8);
+
+impl fmt::Display for ToRelBlockError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "unexpected value kind 0x{:02x}", self.0)
+    }
+}
+
+impl std::error::Error for ToRelBlockError {}
+
 #[cfg(test)]
 mod tests {
    use std::str::FromStr;
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -526,6 +526,8 @@ pub struct TenantConfigPatch {
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub compaction_algorithm: FieldPatch<CompactionAlgorithmSettings>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub compaction_shard_ancestor: FieldPatch<bool>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub compaction_l0_first: FieldPatch<bool>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub compaction_l0_semaphore: FieldPatch<bool>,
@@ -576,6 +578,8 @@ pub struct TenantConfigPatch {
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub gc_compaction_enabled: FieldPatch<bool>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub gc_compaction_verification: FieldPatch<bool>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub gc_compaction_initial_threshold_kb: FieldPatch<u64>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub gc_compaction_ratio_percent: FieldPatch<u64>,
@@ -613,6 +617,9 @@ pub struct TenantConfig {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub compaction_algorithm: Option<CompactionAlgorithmSettings>,

+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub compaction_shard_ancestor: Option<bool>,
+
    #[serde(skip_serializing_if = "Option::is_none")]
    pub compaction_l0_first: Option<bool>,

@@ -696,6 +703,9 @@ pub struct TenantConfig {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gc_compaction_enabled: Option<bool>,

+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub gc_compaction_verification: Option<bool>,
+
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gc_compaction_initial_threshold_kb: Option<u64>,

@@ -719,6 +729,7 @@ impl TenantConfig {
            mut compaction_threshold,
            mut compaction_upper_limit,
            mut compaction_algorithm,
+            mut compaction_shard_ancestor,
            mut compaction_l0_first,
            mut compaction_l0_semaphore,
            mut l0_flush_delay_threshold,
@@ -744,6 +755,7 @@ impl TenantConfig {
            mut wal_receiver_protocol_override,
            mut rel_size_v2_enabled,
            mut gc_compaction_enabled,
+            mut gc_compaction_verification,
            mut gc_compaction_initial_threshold_kb,
            mut gc_compaction_ratio_percent,
            mut sampling_ratio,
@@ -766,6 +778,9 @@ impl TenantConfig {
            .compaction_upper_limit
            .apply(&mut compaction_upper_limit);
        patch.compaction_algorithm.apply(&mut compaction_algorithm);
+        patch
+            .compaction_shard_ancestor
+            .apply(&mut compaction_shard_ancestor);
        patch.compaction_l0_first.apply(&mut compaction_l0_first);
        patch
            .compaction_l0_semaphore
@@ -835,6 +850,9 @@ impl TenantConfig {
        patch
            .gc_compaction_enabled
            .apply(&mut gc_compaction_enabled);
+        patch
+            .gc_compaction_verification
+            .apply(&mut gc_compaction_verification);
        patch
            .gc_compaction_initial_threshold_kb
            .apply(&mut gc_compaction_initial_threshold_kb);
@@ -851,6 +869,7 @@ impl TenantConfig {
            compaction_threshold,
            compaction_upper_limit,
            compaction_algorithm,
+            compaction_shard_ancestor,
            compaction_l0_first,
            compaction_l0_semaphore,
            l0_flush_delay_threshold,
@@ -876,6 +895,7 @@ impl TenantConfig {
            wal_receiver_protocol_override,
            rel_size_v2_enabled,
            gc_compaction_enabled,
+            gc_compaction_verification,
            gc_compaction_initial_threshold_kb,
            gc_compaction_ratio_percent,
            sampling_ratio,
@@ -910,6 +930,9 @@ impl TenantConfig {
                .as_ref()
                .unwrap_or(&global_conf.compaction_algorithm)
                .clone(),
+            compaction_shard_ancestor: self
+                .compaction_shard_ancestor
+                .unwrap_or(global_conf.compaction_shard_ancestor),
            compaction_l0_first: self
                .compaction_l0_first
                .unwrap_or(global_conf.compaction_l0_first),
@@ -974,6 +997,9 @@ impl TenantConfig {
            gc_compaction_enabled: self
                .gc_compaction_enabled
                .unwrap_or(global_conf.gc_compaction_enabled),
+            gc_compaction_verification: self
+                .gc_compaction_verification
+                .unwrap_or(global_conf.gc_compaction_verification),
            gc_compaction_initial_threshold_kb: self
                .gc_compaction_initial_threshold_kb
                .unwrap_or(global_conf.gc_compaction_initial_threshold_kb),
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -23,7 +23,6 @@ use futures::future::Either;
 use futures::stream::Stream;
 use futures_util::{StreamExt, TryStreamExt};
 use http_types::{StatusCode, Url};
-use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
 use tracing::debug;
 use utils::backoff;
@@ -32,7 +31,7 @@ use utils::backoff::exponential_backoff_duration_seconds;
 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use crate::config::AzureConfig;
 use crate::error::Cancelled;
-use crate::metrics::{AttemptOutcome, RequestKind, start_measuring_requests};
+use crate::metrics::RequestKind;
 use crate::{
    ConcurrencyLimiter, Download, DownloadError, DownloadKind, DownloadOpts, Listing, ListingMode,
    ListingObject, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
@@ -165,7 +164,7 @@ impl AzureBlobStorage {
        let mut last_modified = None;
        let mut metadata = HashMap::new();

-        let started_at = start_measuring_requests(kind);
+      

        let download = async {
            let response = builder
@@ -237,19 +236,8 @@ impl AzureBlobStorage {
                TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled),
            },
        };
-        let started_at = ScopeGuard::into_inner(started_at);
-        let outcome = match &download {
-            Ok(_) => AttemptOutcome::Ok,
-            // At this level in the stack 404 and 304 responses do not indicate an error.
-            // There's expected cases when a blob may not exist or hasn't been modified since
-            // the last get (e.g. probing for timeline indices and heatmap downloads).
-            // Callers should handle errors if they are unexpected.
-            Err(DownloadError::NotFound | DownloadError::Unmodified) => AttemptOutcome::Ok,
-            Err(_) => AttemptOutcome::Err,
-        };
-        crate::metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, outcome, started_at);
+        
+      
        download
    }

@@ -431,7 +419,7 @@ impl RemoteStorage for AzureBlobStorage {
        let kind = RequestKind::Head;
        let _permit = self.permit(kind, cancel).await?;

-        let started_at = start_measuring_requests(kind);
+      

        let blob_client = self.client.blob_client(self.relative_path_to_name(key));
        let properties_future = blob_client.get_properties().into_future();
@@ -443,12 +431,9 @@ impl RemoteStorage for AzureBlobStorage {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-        if let Ok(inner) = &res {
-            // do not incl. timeouts as errors in metrics but cancellations
-            let started_at = ScopeGuard::into_inner(started_at);
-            crate::metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, inner, started_at);
+        if let Ok(_inner) = &res {
+            
+         
        }

        let data = match res {
@@ -476,7 +461,7 @@ impl RemoteStorage for AzureBlobStorage {
        let kind = RequestKind::Put;
        let _permit = self.permit(kind, cancel).await?;

-        let started_at = start_measuring_requests(kind);
+      

        let op = async {
            let blob_client = self.client.blob_client(self.relative_path_to_name(to));
@@ -509,14 +494,7 @@ impl RemoteStorage for AzureBlobStorage {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-        let outcome = match res {
-            Ok(_) => AttemptOutcome::Ok,
-            Err(_) => AttemptOutcome::Err,
-        };
-        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, outcome, started_at);
+      

        res
    }
@@ -562,7 +540,7 @@ impl RemoteStorage for AzureBlobStorage {
    ) -> anyhow::Result<()> {
        let kind = RequestKind::Delete;
        let _permit = self.permit(kind, cancel).await?;
-        let started_at = start_measuring_requests(kind);
+    

        let op = async {
            // TODO batch requests are not supported by the SDK
@@ -628,10 +606,8 @@ impl RemoteStorage for AzureBlobStorage {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
+       
+        
        res
    }

@@ -647,7 +623,7 @@ impl RemoteStorage for AzureBlobStorage {
    ) -> anyhow::Result<()> {
        let kind = RequestKind::Copy;
        let _permit = self.permit(kind, cancel).await?;
-        let started_at = start_measuring_requests(kind);
+  

        let timeout = tokio::time::sleep(self.timeout);

@@ -701,10 +677,8 @@ impl RemoteStorage for AzureBlobStorage {
            },
        };

-        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
+        
+      
        res
    }

--- a/libs/remote_storage/src/metrics.rs
+++ b/libs/remote_storage/src/metrics.rs
@@ -1,9 +1,7 @@
-use metrics::{
-    Histogram, IntCounter, register_histogram_vec, register_int_counter, register_int_counter_vec,
-};
-use once_cell::sync::Lazy;

-pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);
+
+
+

 #[derive(Clone, Copy, Debug)]
 pub(crate) enum RequestKind {
@@ -16,62 +14,9 @@ pub(crate) enum RequestKind {
    Head = 6,
 }

-use RequestKind::*;
+
 use scopeguard::ScopeGuard;

-impl RequestKind {
-    const fn as_str(&self) -> &'static str {
-        match self {
-            Get => "get_object",
-            Put => "put_object",
-            Delete => "delete_object",
-            List => "list_objects",
-            Copy => "copy_object",
-            TimeTravel => "time_travel_recover",
-            Head => "head_object",
-        }
-    }
-    const fn as_index(&self) -> usize {
-        *self as usize
-    }
-}
-
-const REQUEST_KIND_COUNT: usize = 7;
-pub(crate) struct RequestTyped<C>([C; REQUEST_KIND_COUNT]);
-
-impl<C> RequestTyped<C> {
-    pub(crate) fn get(&self, kind: RequestKind) -> &C {
-        &self.0[kind.as_index()]
-    }
-
-    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
-        use RequestKind::*;
-        let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter();
-        let arr = std::array::from_fn::<C, REQUEST_KIND_COUNT, _>(|index| {
-            let next = it.next().unwrap();
-            assert_eq!(index, next.as_index());
-            f(next)
-        });
-
-        if let Some(next) = it.next() {
-            panic!("unexpected {next:?}");
-        }
-
-        RequestTyped(arr)
-    }
-}
-
-impl RequestTyped<Histogram> {
-    pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
-        self.get(kind).observe(started_at.elapsed().as_secs_f64())
-    }
-}
-
-pub(crate) struct PassFailCancelledRequestTyped<C> {
-    success: RequestTyped<C>,
-    fail: RequestTyped<C>,
-    cancelled: RequestTyped<C>,
-}

 #[derive(Debug, Clone, Copy)]
 pub(crate) enum AttemptOutcome {
@@ -89,138 +34,22 @@ impl<T, E> From<&Result<T, E>> for AttemptOutcome {
    }
 }

-impl AttemptOutcome {
-    pub(crate) fn as_str(&self) -> &'static str {
-        match self {
-            AttemptOutcome::Ok => "ok",
-            AttemptOutcome::Err => "err",
-            AttemptOutcome::Cancelled => "cancelled",
-        }
-    }
-}

-impl<C> PassFailCancelledRequestTyped<C> {
-    pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
-        let target = match outcome {
-            AttemptOutcome::Ok => &self.success,
-            AttemptOutcome::Err => &self.fail,
-            AttemptOutcome::Cancelled => &self.cancelled,
-        };
-        target.get(kind)
-    }

-    fn build_with(mut f: impl FnMut(RequestKind, AttemptOutcome) -> C) -> Self {
-        let success = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Ok));
-        let fail = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Err));
-        let cancelled = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Cancelled));

-        PassFailCancelledRequestTyped {
-            success,
-            fail,
-            cancelled,
-        }
-    }
-}

-impl PassFailCancelledRequestTyped<Histogram> {
-    pub(crate) fn observe_elapsed(
-        &self,
-        kind: RequestKind,
-        outcome: impl Into<AttemptOutcome>,
-        started_at: std::time::Instant,
-    ) {
-        self.get(kind, outcome.into())
-            .observe(started_at.elapsed().as_secs_f64())
-    }
-}

-/// On drop (cancellation) count towards [`BucketMetrics::cancelled_waits`].
-pub(crate) fn start_counting_cancelled_wait(
-    kind: RequestKind,
-) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
-    scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
-        crate::metrics::BUCKET_METRICS
-            .cancelled_waits
-            .get(kind)
-            .inc()
-    })
-}
+
+

 /// On drop (cancellation) add time to [`BucketMetrics::req_seconds`].
 pub(crate) fn start_measuring_requests(
-    kind: RequestKind,
+    _kind: RequestKind,
 ) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
-    scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
-        crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-            kind,
-            AttemptOutcome::Cancelled,
-            started_at,
-        )
+    scopeguard::guard_on_success(std::time::Instant::now(), move |_started_at| {
+        
    })
 }

-pub(crate) struct BucketMetrics {
-    /// Full request duration until successful completion, error or cancellation.
-    pub(crate) req_seconds: PassFailCancelledRequestTyped<Histogram>,
-    /// Total amount of seconds waited on queue.
-    pub(crate) wait_seconds: RequestTyped<Histogram>,

-    /// Track how many semaphore awaits were cancelled per request type.
-    ///
-    /// This is in case cancellations are happening more than expected.
-    pub(crate) cancelled_waits: RequestTyped<IntCounter>,

-    /// Total amount of deleted objects in batches or single requests.
-    pub(crate) deleted_objects_total: IntCounter,
-}
-
-impl Default for BucketMetrics {
-    fn default() -> Self {
-        // first bucket 100 microseconds to count requests that do not need to wait at all
-        // and get a permit immediately
-        let buckets = [0.0001, 0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
-
-        let req_seconds = register_histogram_vec!(
-            "remote_storage_s3_request_seconds",
-            "Seconds to complete a request",
-            &["request_type", "result"],
-            buckets.to_vec(),
-        )
-        .unwrap();
-        let req_seconds = PassFailCancelledRequestTyped::build_with(|kind, outcome| {
-            req_seconds.with_label_values(&[kind.as_str(), outcome.as_str()])
-        });
-
-        let wait_seconds = register_histogram_vec!(
-            "remote_storage_s3_wait_seconds",
-            "Seconds rate limited",
-            &["request_type"],
-            buckets.to_vec(),
-        )
-        .unwrap();
-        let wait_seconds =
-            RequestTyped::build_with(|kind| wait_seconds.with_label_values(&[kind.as_str()]));
-
-        let cancelled_waits = register_int_counter_vec!(
-            "remote_storage_s3_cancelled_waits_total",
-            "Times a semaphore wait has been cancelled per request type",
-            &["request_type"],
-        )
-        .unwrap();
-        let cancelled_waits =
-            RequestTyped::build_with(|kind| cancelled_waits.with_label_values(&[kind.as_str()]));
-
-        let deleted_objects_total = register_int_counter!(
-            "remote_storage_s3_deleted_objects_total",
-            "Amount of deleted objects in total",
-        )
-        .unwrap();
-
-        Self {
-            req_seconds,
-            wait_seconds,
-            cancelled_waits,
-            deleted_objects_total,
-        }
-    }
-}
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -41,7 +41,7 @@ use super::StorageMetadata;
 use crate::config::S3Config;
 use crate::error::Cancelled;
 pub(super) use crate::metrics::RequestKind;
-use crate::metrics::{AttemptOutcome, start_counting_cancelled_wait, start_measuring_requests};
+use crate::metrics::{AttemptOutcome, start_measuring_requests};
 use crate::support::PermitCarrying;
 use crate::{
    ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject,
@@ -199,7 +199,7 @@ impl S3Bucket {
        kind: RequestKind,
        cancel: &CancellationToken,
    ) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
-        let started_at = start_counting_cancelled_wait(kind);
+       
        let acquire = self.concurrency_limiter.acquire(kind);

        let permit = tokio::select! {
@@ -207,10 +207,8 @@ impl S3Bucket {
            _ = cancel.cancelled() => return Err(Cancelled),
        };

-        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
-            .wait_seconds
-            .observe_elapsed(kind, started_at);
+       
+        

        Ok(permit)
    }
@@ -220,7 +218,7 @@ impl S3Bucket {
        kind: RequestKind,
        cancel: &CancellationToken,
    ) -> Result<tokio::sync::OwnedSemaphorePermit, Cancelled> {
-        let started_at = start_counting_cancelled_wait(kind);
+       
        let acquire = self.concurrency_limiter.acquire_owned(kind);

        let permit = tokio::select! {
@@ -228,10 +226,8 @@ impl S3Bucket {
            _ = cancel.cancelled() => return Err(Cancelled),
        };

-        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
-            .wait_seconds
-            .observe_elapsed(kind, started_at);
+       
+        
        Ok(permit)
    }

@@ -273,11 +269,7 @@ impl S3Bucket {
                // Count this in the AttemptOutcome::Ok bucket, because 404 is not
                // an error: we expect to sometimes fetch an object and find it missing,
                // e.g. when probing for timeline indices.
-                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-                    kind,
-                    AttemptOutcome::Ok,
-                    started_at,
-                );
+                
                return Err(DownloadError::NotFound);
            }
            Err(SdkError::ServiceError(e))
@@ -287,19 +279,11 @@ impl S3Bucket {
                if e.raw().status().as_u16() == StatusCode::NotModified =>
            {
                // Count an unmodified file as a success.
-                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-                    kind,
-                    AttemptOutcome::Ok,
-                    started_at,
-                );
+               
                return Err(DownloadError::Unmodified);
            }
            Err(e) => {
-                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-                    kind,
-                    AttemptOutcome::Err,
-                    started_at,
-                );
+                

                return Err(DownloadError::Other(
                    anyhow::Error::new(e).context("download s3 object"),
@@ -346,11 +330,11 @@ impl S3Bucket {
        delete_objects: &[ObjectIdentifier],
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
-        let kind = RequestKind::Delete;
+   
        let mut cancel = std::pin::pin!(cancel.cancelled());

        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE_S3) {
-            let started_at = start_measuring_requests(kind);
+           

            let req = self
                .client
@@ -370,15 +354,10 @@ impl S3Bucket {
                _ = &mut cancel => return Err(TimeoutOrCancel::Cancel.into()),
            };

-            let started_at = ScopeGuard::into_inner(started_at);
-            crate::metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, &resp, started_at);
-
+          
+            
            let resp = resp.context("request deletion")?;
-            crate::metrics::BUCKET_METRICS
-                .deleted_objects_total
-                .inc_by(chunk.len() as u64);
+            

            if let Some(errors) = resp.errors {
                // Log a bounded number of the errors within the response:
@@ -445,8 +424,8 @@ pin_project_lite::pin_project! {
    }

    impl<S> PinnedDrop for TimedDownload<S> {
-        fn drop(mut this: Pin<&mut Self>) {
-            crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
+        fn drop(mut _this: Pin<&mut Self>) {
+           
        }
    }
 }
@@ -511,7 +490,7 @@ impl RemoteStorage for S3Bucket {

            let mut continuation_token = None;
            'outer: loop {
-                let started_at = start_measuring_requests(kind);
+           

                // min of two Options, returning Some if one is value and another is
                // None (None is smaller than anything, so plain min doesn't work).
@@ -544,11 +523,9 @@ impl RemoteStorage for S3Bucket {
                    .context("Failed to list S3 prefixes")
                    .map_err(DownloadError::Other);

-                let started_at = ScopeGuard::into_inner(started_at);
+               

-                crate::metrics::BUCKET_METRICS
-                    .req_seconds
-                    .observe_elapsed(kind, &response, started_at);
+                

                let response = match response {
                    Ok(response) => response,
@@ -629,7 +606,7 @@ impl RemoteStorage for S3Bucket {
        let kind = RequestKind::Head;
        let _permit = self.permit(kind, cancel).await?;

-        let started_at = start_measuring_requests(kind);
+     

        let head_future = self
            .client
@@ -648,30 +625,18 @@ impl RemoteStorage for S3Bucket {
        let res = res.map_err(|_e| DownloadError::Timeout)?;

        // do not incl. timeouts as errors in metrics but cancellations
-        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
-
+  
+        
        let data = match res {
            Ok(object_output) => object_output,
            Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => {
                // Count this in the AttemptOutcome::Ok bucket, because 404 is not
                // an error: we expect to sometimes fetch an object and find it missing,
                // e.g. when probing for timeline indices.
-                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-                    kind,
-                    AttemptOutcome::Ok,
-                    started_at,
-                );
                return Err(DownloadError::NotFound);
            }
            Err(e) => {
-                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-                    kind,
-                    AttemptOutcome::Err,
-                    started_at,
-                );
+                

                return Err(DownloadError::Other(
                    anyhow::Error::new(e).context("s3 head object"),
@@ -704,7 +669,7 @@ impl RemoteStorage for S3Bucket {
        let kind = RequestKind::Put;
        let _permit = self.permit(kind, cancel).await?;

-        let started_at = start_measuring_requests(kind);
+      

        let body = StreamBody::new(from.map(|x| x.map(Frame::data)));
        let bytes_stream = ByteStream::new(SdkBody::from_body_1_x(body));
@@ -727,12 +692,4 @@ impl RemoteStorage for S3Bucket {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-        if let Ok(inner) = &res {
-            // do not incl. timeouts as errors in metrics but cancellations
-            let started_at = ScopeGuard::into_inner(started_at);
-            crate::metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, inner, started_at);
-        }
-
        match res {
@@ -753,7 +716,7 @@ impl RemoteStorage for S3Bucket {

        let timeout = tokio::time::sleep(self.timeout);

-        let started_at = start_measuring_requests(kind);
+       

        // we need to specify bucket_name as a prefix
        let copy_source = format!(
@@ -777,10 +740,8 @@ impl RemoteStorage for S3Bucket {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-        let started_at = ScopeGuard::into_inner(started_at);
-        crate::metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
+       
+        

        res?;

--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -29,6 +29,7 @@ futures = { workspace = true }
 jsonwebtoken.workspace = true
 nix = { workspace = true, features = ["ioctl"] }
 once_cell.workspace = true
+pem.workspace = true
 pin-project-lite.workspace = true
 regex.workspace = true
 serde.workspace = true
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -11,7 +11,8 @@ use camino::Utf8Path;
 use jsonwebtoken::{
    Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, decode, encode,
 };
-use serde::{Deserialize, Serialize};
+use pem::Pem;
+use serde::{Deserialize, Serialize, de::DeserializeOwned};

 use crate::id::TenantId;

@@ -73,7 +74,10 @@ impl SwappableJwtAuth {
    pub fn swap(&self, jwt_auth: JwtAuth) {
        self.0.swap(Arc::new(jwt_auth));
    }
-    pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
+    pub fn decode<D: DeserializeOwned>(
+        &self,
+        token: &str,
+    ) -> std::result::Result<TokenData<D>, AuthError> {
        self.0.load().decode(token)
    }
 }
@@ -148,7 +152,10 @@ impl JwtAuth {
    /// The function tries the stored decoding keys in succession,
    /// and returns the first yielding a successful result.
    /// If there is no working decoding key, it returns the last error.
-    pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
+    pub fn decode<D: DeserializeOwned>(
+        &self,
+        token: &str,
+    ) -> std::result::Result<TokenData<D>, AuthError> {
        let mut res = None;
        for decoding_key in &self.decoding_keys {
            res = Some(decode(token, decoding_key, &self.validation));
@@ -173,8 +180,8 @@ impl std::fmt::Debug for JwtAuth {
 }

 // this function is used only for testing purposes in CLI e g generate tokens during init
-pub fn encode_from_key_file<S: Serialize>(claims: &S, key_data: &[u8]) -> Result<String> {
-    let key = EncodingKey::from_ed_pem(key_data)?;
+pub fn encode_from_key_file<S: Serialize>(claims: &S, pem: &Pem) -> Result<String> {
+    let key = EncodingKey::from_ed_der(pem.contents());
    Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?)
 }

@@ -188,13 +195,13 @@ mod tests {
    //
    // openssl genpkey -algorithm ed25519 -out ed25519-priv.pem
    // openssl pkey -in ed25519-priv.pem -pubout -out ed25519-pub.pem
-    const TEST_PUB_KEY_ED25519: &[u8] = br#"
+    const TEST_PUB_KEY_ED25519: &str = r#"
 -----BEGIN PUBLIC KEY-----
 MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w=
 -----END PUBLIC KEY-----
 "#;

-    const TEST_PRIV_KEY_ED25519: &[u8] = br#"
+    const TEST_PRIV_KEY_ED25519: &str = r#"
 -----BEGIN PRIVATE KEY-----
 MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
 -----END PRIVATE KEY-----
@@ -222,9 +229,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH

        // Check it can be validated with the public key
        let auth = JwtAuth::new(vec![
-            DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap(),
+            DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519.as_bytes()).unwrap(),
        ]);
-        let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims;
+        let claims_from_token: Claims = auth.decode(encoded_eddsa).unwrap().claims;
        assert_eq!(claims_from_token, expected_claims);
    }

@@ -235,13 +242,14 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
            scope: Scope::Tenant,
        };

-        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap();
+        let pem = pem::parse(TEST_PRIV_KEY_ED25519).unwrap();
+        let encoded = encode_from_key_file(&claims, &pem).unwrap();

        // decode it back
        let auth = JwtAuth::new(vec![
-            DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap(),
+            DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519.as_bytes()).unwrap(),
        ]);
-        let decoded = auth.decode(&encoded).unwrap();
+        let decoded: TokenData<Claims> = auth.decode(&encoded).unwrap();

        assert_eq!(decoded.claims, claims);
    }
--- a/libs/utils/src/circuit_breaker.rs
+++ b/libs/utils/src/circuit_breaker.rs
@@ -1,7 +1,5 @@
 use std::fmt::Display;
 use std::time::{Duration, Instant};

-use metrics::IntCounter;
-
 /// Circuit breakers are for operations that are expensive and fallible.
 ///
@@ -54,7 +52,7 @@ impl CircuitBreaker {
        }
    }

-    pub fn fail<E>(&mut self, metric: &IntCounter, error: E)
+    pub fn fail<E>(&mut self, error: E)
    where
        E: Display,
    {
@@ -64,18 +62,17 @@ impl CircuitBreaker {

        self.fail_count += 1;
        if self.broken_at.is_none() && self.fail_count >= self.fail_threshold {
-            self.break_circuit(metric, error);
+            self.break_circuit(error);
        }
    }

    /// Call this after successfully executing an operation
-    pub fn success(&mut self, metric: &IntCounter) {
+    pub fn success(&mut self) {
        self.fail_count = 0;
        if let Some(broken_at) = &self.broken_at {
            tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})",
                humantime::format_duration(broken_at.elapsed()));
            self.broken_at = None;
-            metric.inc();
        }
    }

@@ -98,13 +95,12 @@ impl CircuitBreaker {
        }
    }

-    fn break_circuit<E>(&mut self, metric: &IntCounter, error: E)
+    fn break_circuit<E>(&mut self, error: E)
    where
        E: Display,
    {
        self.broken_at = Some(Instant::now());
        tracing::error!(breaker=%self.name, "Circuit breaker broken!  Last error: {error}");
-        metric.inc();
    }

    fn reset_circuit(&mut self) {
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -10,6 +10,8 @@ default = []
 # which adds some runtime cost to run tests on outage conditions
 testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"]

+fuzz-read-path = ["testing"]
+
 [dependencies]
 anyhow.workspace = true
 arc-swap.workspace = true
@@ -33,6 +35,7 @@ humantime.workspace = true
 humantime-serde.workspace = true
 hyper0.workspace = true
 itertools.workspace = true
+jsonwebtoken.workspace = true
 md5.workspace = true
 nix.workspace = true
 # hack to get the number of worker threads tokio uses
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -126,7 +126,7 @@ async fn ingest(
            max_concurrency: NonZeroUsize::new(1).unwrap(),
        });
        let (_desc, path) = layer
-            .write_to_disk(&ctx, None, l0_flush_state.inner())
+            .write_to_disk(&ctx, None, l0_flush_state.inner(), &gate, cancel.clone())
            .await?
            .unwrap();
        tokio::fs::remove_file(path).await?;
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -34,7 +34,7 @@ use utils::lsn::Lsn;
 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::Version;
 use crate::tenant::storage_layer::IoConcurrency;
-use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery};
 use crate::tenant::{PageReconstructError, Timeline};

 #[derive(Debug, thiserror::Error)]
@@ -353,9 +353,10 @@ where
            let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);

            for part in slru_partitions.parts {
+                let query = VersionedKeySpaceQuery::uniform(part, self.lsn);
                let blocks = self
                    .timeline
-                    .get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
+                    .get_vectored(query, self.io_concurrency.clone(), self.ctx)
                    .await?;

                for (key, block) in blocks {
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -20,7 +20,6 @@ use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields};
 use pageserver::controller_upcall_client::StorageControllerUpcallClient;
 use pageserver::deletion_queue::DeletionQueue;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
-use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::{
    BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
 };
@@ -321,10 +320,9 @@ where
    }
 }

-fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
+fn startup_checkpoint(started_at: Instant, _phase: &str, human_phase: &str) {
    let elapsed = started_at.elapsed();
    let secs = elapsed.as_secs_f64();
-    STARTUP_DURATION.with_label_values(&[phase]).set(secs);

    info!(
        elapsed_ms = elapsed.as_millis(),
@@ -355,10 +353,7 @@ fn start_pageserver(
    set_launch_timestamp_metric(launch_ts);
    #[cfg(target_os = "linux")]
    metrics::register_internal(Box::new(metrics::more_process_metrics::Collector::new())).unwrap();
-    metrics::register_internal(Box::new(
-        pageserver::metrics::tokio_epoll_uring::Collector::new(),
-    ))
-    .unwrap();
+    
    pageserver::preinitialize_metrics(conf, ignored);

    // If any failpoints were set from FAILPOINTS environment variable,
@@ -452,6 +447,24 @@ fn start_pageserver(
    info!("Using auth for http API: {:#?}", conf.http_auth_type);
    info!("Using auth for pg connections: {:#?}", conf.pg_auth_type);

+    let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api
+    {
+        let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new(
+            "main",
+            &conf.ssl_key_file,
+            &conf.ssl_cert_file,
+            conf.ssl_cert_reload_period,
+        ))?;
+
+        let server_config = rustls::ServerConfig::builder()
+            .with_no_client_auth()
+            .with_cert_resolver(resolver);
+
+        Some(Arc::new(server_config))
+    } else {
+        None
+    };
+
    match var("NEON_AUTH_TOKEN") {
        Ok(v) => {
            info!("Loaded JWT token for authentication with Safekeeper");
@@ -484,7 +497,6 @@ fn start_pageserver(
    // Up to this point no significant I/O has been done: this should have been fast.  Record
    // duration prior to starting I/O intensive phase of startup.
    startup_checkpoint(started_startup_at, "initial", "Starting loading tenants");
-    STARTUP_IS_LOADING.set(1);

    // Startup staging or optimizing:
    //
@@ -560,7 +572,6 @@ fn start_pageserver(
                    "initial_tenant_load",
                    "Initial load completed",
                );
-                STARTUP_IS_LOADING.set(0);
            });

            let WaitForPhaseResult {
@@ -670,17 +681,11 @@ fn start_pageserver(

        let https_task = match https_listener {
            Some(https_listener) => {
-                let resolver = MGMT_REQUEST_RUNTIME.block_on(ReloadingCertificateResolver::new(
-                    &conf.ssl_key_file,
-                    &conf.ssl_cert_file,
-                    conf.ssl_cert_reload_period,
-                ))?;
+                let tls_server_config = tls_server_config
+                    .clone()
+                    .expect("tls_server_config is set earlier if https is enabled");

-                let server_config = rustls::ServerConfig::builder()
-                    .with_no_client_auth()
-                    .with_cert_resolver(resolver);
-
-                let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config));
+                let tls_acceptor = tokio_rustls::TlsAcceptor::from(tls_server_config);

                let server =
                    http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?;
@@ -736,6 +741,11 @@ fn start_pageserver(
            tokio::net::TcpListener::from_std(pageserver_listener)
                .context("create tokio listener")?
        },
+        if conf.enable_tls_page_service_api {
+            tls_server_config
+        } else {
+            None
+        },
    );

    // All started up! Now just sit and wait for shutdown signal.
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -219,6 +219,11 @@ pub struct PageServerConf {
    pub generate_unarchival_heatmap: bool,

    pub tracing: Option<pageserver_api::config::Tracing>,
+
+    /// Enable TLS in page service API.
+    /// Does not force TLS: the client negotiates TLS usage during the handshake.
+    /// Uses key and certificate from ssl_key_file/ssl_cert_file.
+    pub enable_tls_page_service_api: bool,
 }

 /// Token for authentication to safekeepers
@@ -391,6 +396,7 @@ impl PageServerConf {
            load_previous_heatmap,
            generate_unarchival_heatmap,
            tracing,
+            enable_tls_page_service_api,
        } = config_toml;

        let mut conf = PageServerConf {
@@ -441,6 +447,7 @@ impl PageServerConf {
            page_service_pipelining,
            get_vectored_concurrent_io,
            tracing,
+            enable_tls_page_service_api,

            // ------------------------------------------------------------
            // fields that require additional validation or custom handling
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -261,7 +261,9 @@ where
    let mut tenants = std::pin::pin!(tenants);

    while let Some((tenant_id, tenant)) = tenants.next().await {
-        let mut tenant_resident_size = 0;
+        // NOTE(review): the per-timeline accumulation is removed below, so every tenant
+        // snapshot is now collected with a resident size of 0; confirm this is intended.
+        let tenant_resident_size = 0;

        for timeline in tenant.list_timelines() {
            let timeline_id = timeline.timeline_id;
@@ -286,7 +288,5 @@ where
                }
            }
-
-            tenant_resident_size += timeline.resident_physical_size();
        }

        let snap = TenantSnapshot::collect(&tenant, tenant_resident_size);
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -91,12 +91,12 @@

 use std::{sync::Arc, time::Duration};

-use once_cell::sync::Lazy;
+
 use tracing::warn;
 use utils::{id::TimelineId, shard::TenantShardId};

 use crate::{
-    metrics::{StorageIoSizeMetrics, TimelineMetrics},
+    metrics::TimelineMetrics,
    task_mgr::TaskKind,
    tenant::Timeline,
 };
@@ -122,38 +122,24 @@ pub struct RequestContext {
 #[derive(Clone)]
 pub(crate) enum Scope {
-    Global {
-        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
-    },
-    SecondaryTenant {
-        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
-    },
-    SecondaryTimeline {
-        io_size_metrics: crate::metrics::StorageIoSizeMetrics,
-    },
+    Global {},
+    SecondaryTenant {},
+    SecondaryTimeline {},
    Timeline {
        // We wrap the `Arc<TimelineMetrics>`s inside another Arc to avoid child
        // context creation contending for the ref counters of the Arc<TimelineMetrics>,
        // which are shared among all tasks that operate on the timeline, especially
        // concurrent page_service connections.
        #[allow(clippy::redundant_allocation)]
+        #[allow(dead_code)]
        arc_arc: Arc<Arc<TimelineMetrics>>,
    },
    #[cfg(test)]
-    UnitTest {
-        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
-    },
-    DebugTools {
-        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
-    },
+    UnitTest {},
+    DebugTools {},
 }

-static GLOBAL_IO_SIZE_METRICS: Lazy<crate::metrics::StorageIoSizeMetrics> =
-    Lazy::new(|| crate::metrics::StorageIoSizeMetrics::new("*", "*", "*"));
-
 impl Scope {
    pub(crate) fn new_global() -> Self {
-        Scope::Global {
-            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
-        }
+        Scope::Global {}
    }
    /// NB: this allocates, so, use only at relatively long-lived roots, e.g., at start
@@ -173,18 +170,12 @@ impl Scope {
        }
    }
    pub(crate) fn new_secondary_timeline(
-        tenant_shard_id: &TenantShardId,
-        timeline_id: &TimelineId,
+        _tenant_shard_id: &TenantShardId,
+        _timeline_id: &TimelineId,
    ) -> Self {
        // TODO(https://github.com/neondatabase/neon/issues/11156): secondary timelines have no infrastructure for metrics lifecycle.

-        let tenant_id = tenant_shard_id.tenant_id.to_string();
-        let shard_id = tenant_shard_id.shard_slug().to_string();
-        let timeline_id = timeline_id.to_string();
-
-        let io_size_metrics =
-            crate::metrics::StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id);
-        Scope::SecondaryTimeline { io_size_metrics }
+        Scope::SecondaryTimeline {}
    }
    pub(crate) fn new_secondary_tenant(_tenant_shard_id: &TenantShardId) -> Self {
        // Before propagating metrics via RequestContext, the labels were inferred from file path.
@@ -197,19 +188,13 @@ impl Scope {
        // like we do for attached timelines. (We don't have attached-tenant-scoped usage of VirtualFile
        // at this point, so, we were able to completely side-step tenant-scoped stuff there).
-        Scope::SecondaryTenant {
-            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
-        }
+        Scope::SecondaryTenant {}
    }
    #[cfg(test)]
    pub(crate) fn new_unit_test() -> Self {
-        Scope::UnitTest {
-            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
-        }
+        Scope::UnitTest {}
    }

    pub(crate) fn new_debug_tools() -> Self {
-        Scope::DebugTools {
-            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
-        }
+        Scope::DebugTools {}
    }
 }
@@ -523,58 +515,18 @@ impl RequestContext {
        self.access_stats_behavior
    }

-    pub(crate) fn page_content_kind(&self) -> PageContentKind {
-        self.page_content_kind
-    }

    pub(crate) fn read_path_debug(&self) -> bool {
        self.read_path_debug
    }

-    pub(crate) fn io_size_metrics(&self) -> &StorageIoSizeMetrics {
-        match &self.scope {
-            Scope::Global { io_size_metrics } => {
-                let is_unit_test = cfg!(test);
-                let is_regress_test_build = cfg!(feature = "testing");
-                if is_unit_test || is_regress_test_build {
-                    panic!("all VirtualFile instances are timeline-scoped");
-                } else {
-                    use once_cell::sync::Lazy;
-                    use std::sync::Mutex;
-                    use std::time::Duration;
-                    use utils::rate_limit::RateLimit;
-                    static LIMIT: Lazy<Mutex<RateLimit>> =
-                        Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1))));
-                    let mut guard = LIMIT.lock().unwrap();
-                    guard.call2(|rate_limit_stats| {
-                        warn!(
-                            %rate_limit_stats,
-                            backtrace=%std::backtrace::Backtrace::force_capture(),
-                            "all VirtualFile instances are timeline-scoped",
-                        );
-                    });
-
-                    io_size_metrics
-                }
-            }
-            Scope::Timeline { arc_arc } => &arc_arc.storage_io_size,
-            Scope::SecondaryTimeline { io_size_metrics } => io_size_metrics,
-            Scope::SecondaryTenant { io_size_metrics } => io_size_metrics,
-            #[cfg(test)]
-            Scope::UnitTest { io_size_metrics } => io_size_metrics,
-            Scope::DebugTools { io_size_metrics } => io_size_metrics,
-        }
-    }
-
    pub(crate) fn ondemand_download_wait_observe(&self, duration: Duration) {
        if duration == Duration::ZERO {
            return;
        }

        match &self.scope {
-            Scope::Timeline { arc_arc } => arc_arc
-                .wait_ondemand_download_time
-                .observe(self.task_kind, duration),
+            Scope::Timeline { .. } => {}
            _ => {
                use once_cell::sync::Lazy;
                use std::sync::Mutex;
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -27,7 +27,6 @@ use self::list_writer::{DeletionOp, ListWriter, RecoverOp};
 use self::validator::Validator;
 use crate::config::PageServerConf;
 use crate::controller_upcall_client::StorageControllerUpcallApi;
-use crate::metrics;
 use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_timeline_path};
 use crate::tenant::storage_layer::LayerName;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
@@ -163,11 +162,6 @@ struct TenantDeletionList {
    generation: Generation,
 }

-impl TenantDeletionList {
-    pub(crate) fn len(&self) -> usize {
-        self.timelines.values().map(|v| v.len()).sum()
-    }
-}

 /// Files ending with this suffix will be ignored and erased
 /// during recovery as startup.
@@ -467,9 +461,6 @@ impl DeletionQueueClient {
        // they may be historical.
        assert!(!current_generation.is_none());

-        metrics::DELETION_QUEUE
-            .keys_submitted
-            .inc_by(layers.len() as u64);
        self.do_push(
            &self.tx,
            ListWriterQueueMessage::Delete(DeletionOp {
@@ -553,9 +544,6 @@ impl DeletionQueueClient {
        &self,
        objects: Vec<RemotePath>,
    ) -> Result<(), DeletionQueueError> {
-        metrics::DELETION_QUEUE
-            .keys_submitted
-            .inc_by(objects.len() as u64);
        self.executor_tx
            .send(DeleterMessage::Delete(objects))
            .await
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -14,7 +14,6 @@ use tracing::{info, warn};
 use utils::{backoff, pausable_failpoint};

 use super::{DeletionQueueError, FlushOp};
-use crate::metrics;

 const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);

@@ -60,10 +59,6 @@ impl Deleter {
                fail::fail_point!("deletion-queue-before-execute", |_| {
                    info!("Skipping execution, failpoint set");

-                    metrics::DELETION_QUEUE
-                        .remote_errors
-                        .with_label_values(&["failpoint"])
-                        .inc();
                    Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
                });

@@ -90,9 +85,6 @@ impl Deleter {
                Ok(()) => {
                    // Note: we assume that the remote storage layer returns Ok(()) if some
                    // or all of the deleted objects were already gone.
-                    metrics::DELETION_QUEUE
-                        .keys_executed
-                        .inc_by(self.accumulator.len() as u64);
                    info!(
                        "Executed deletion batch {}..{}",
                        self.accumulator
@@ -109,10 +101,6 @@ impl Deleter {
                        return Err(DeletionQueueError::ShuttingDown);
                    }
                    warn!("DeleteObjects request failed: {e:#}, will continue trying");
-                    metrics::DELETION_QUEUE
-                        .remote_errors
-                        .with_label_values(&["execute"])
-                        .inc();
                }
            };
        }
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -25,7 +25,6 @@ use utils::id::TimelineId;
 use super::{DeletionHeader, DeletionList, FlushOp, ValidatorQueueMessage};
 use crate::config::PageServerConf;
 use crate::deletion_queue::TEMP_SUFFIX;
-use crate::metrics;
 use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_layer_path};
 use crate::tenant::storage_layer::LayerName;
 use crate::virtual_file::{MaybeFatalIo, on_fatal_io_error};
@@ -152,7 +151,7 @@ impl ListWriter {
                }
            }
            Err(e) => {
-                metrics::DELETION_QUEUE.unexpected_errors.inc();
+                
                warn!(
                    sequence = self.pending.sequence,
                    "Failed to write deletion list, will retry later ({e:#})"
@@ -180,7 +179,6 @@ impl ListWriter {
                        // This should never happen unless we make a mistake with our serialization.
                        // Ignoring a deletion header is not consequential for correctnes because all deletions
                        // are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up.
-                        metrics::DELETION_QUEUE.unexpected_errors.inc();
                        Ok(None)
                    }
                }
@@ -249,7 +247,6 @@ impl ListWriter {
                    .as_str()
            } else {
                warn!("Unexpected key in deletion queue: {basename}");
-                metrics::DELETION_QUEUE.unexpected_errors.inc();
                continue;
            };

@@ -257,7 +254,6 @@ impl ListWriter {
                Ok(s) => s,
                Err(e) => {
                    warn!("Malformed key '{basename}': {e}");
-                    metrics::DELETION_QUEUE.unexpected_errors.inc();
                    continue;
                }
            };
@@ -286,7 +282,6 @@ impl ListWriter {
                    // Drop the list on the floor: any objects it referenced will be left behind
                    // for scrubbing to clean up.  This should never happen unless we have a serialization bug.
                    warn!(sequence = s, "Failed to deserialize deletion list: {e}");
-                    metrics::DELETION_QUEUE.unexpected_errors.inc();
                    continue;
                }
            };
@@ -329,9 +324,6 @@ impl ListWriter {

            // We will drop out of recovery if this fails: it indicates that we are shutting down
            // or the backend has panicked
-            metrics::DELETION_QUEUE
-                .keys_submitted
-                .inc_by(deletion_list.len() as u64);
            self.tx
                .send(ValidatorQueueMessage::Delete(deletion_list))
                .await?;
@@ -353,7 +345,6 @@ impl ListWriter {
                "Failed to create deletion list directory {}, deletions will not be executed ({e})",
                self.conf.deletion_prefix(),
            );
-            metrics::DELETION_QUEUE.unexpected_errors.inc();
            return;
        }

@@ -422,7 +413,6 @@ impl ListWriter {
                            tracing::error!(
                                "Failed to enqueue deletions, leaking objects.  This is a bug."
                            );
-                            metrics::DELETION_QUEUE.unexpected_errors.inc();
                        }
                    }
                }
@@ -450,7 +440,6 @@ impl ListWriter {
                        tracing::error!(
                            "Deletion queue recovery called more than once.  This is a bug."
                        );
-                        metrics::DELETION_QUEUE.unexpected_errors.inc();
                        // Non-fatal: although this is a bug, since we did recovery at least once we may proceed.
                        continue;
                    }
@@ -462,7 +451,6 @@ impl ListWriter {
                        info!(
                            "Deletion queue recover aborted, deletion queue will not proceed ({e})"
                        );
-                        metrics::DELETION_QUEUE.unexpected_errors.inc();
                        return;
                    } else {
                        self.recovered = true;
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -26,7 +26,6 @@ use super::deleter::DeleterMessage;
 use super::{DeletionHeader, DeletionList, DeletionQueueError, FlushOp, VisibleLsnUpdates};
 use crate::config::PageServerConf;
 use crate::controller_upcall_client::{RetryForeverError, StorageControllerUpcallApi};
-use crate::metrics;
 use crate::virtual_file::MaybeFatalIo;

 // After this length of time, do any validation work that is pending,
@@ -186,7 +185,6 @@ where
                    "Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}",
                    tenant_lsn_state.generation
                );
-                metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
            }
        }

@@ -221,11 +219,8 @@ where

                if !this_list_valid {
                    info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
-                    metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
                    mutated = true;
-                } else {
-                    metrics::DELETION_QUEUE.keys_validated.inc_by(tenant.len() as u64);
-                }
+                } 
                this_list_valid
            });
            list.validated = true;
@@ -237,7 +232,7 @@ where
                    // Highly unexpected.  Could happen if e.g. disk full.
                    // If we didn't save the trimmed list, it is _not_ valid to execute.
                    warn!("Failed to save modified deletion list {list}: {e:#}");
-                    metrics::DELETION_QUEUE.unexpected_errors.inc();
+                    

                    // Rather than have a complex retry process, just drop it and leak the objects,
                    // scrubber will clean up eventually.
@@ -276,7 +271,7 @@ where
                // The save() function logs a warning on error.
                if let Err(e) = header.save(self.conf).await {
                    warn!("Failed to write deletion queue header: {e:#}");
-                    metrics::DELETION_QUEUE.unexpected_errors.inc();
+                    
                }
            }
        }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -56,7 +56,6 @@ use utils::completion;
 use utils::id::TimelineId;

 use crate::config::PageServerConf;
-use crate::metrics::disk_usage_based_eviction::METRICS;
 use crate::task_mgr::{self, BACKGROUND_RUNTIME};
 use crate::tenant::mgr::TenantManager;
 use crate::tenant::remote_timeline_client::LayerFileMetadata;
@@ -388,7 +387,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
        }
    };

-    METRICS.layers_collected.inc_by(candidates.len() as u64);
+

    tracing::info!(
        elapsed_ms = collection_time.as_millis(),
@@ -428,7 +427,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    let (evicted_amount, usage_planned) =
        select_victims(&candidates, usage_pre).into_amount_and_planned();

-    METRICS.layers_selected.inc_by(evicted_amount as u64);
+ 

    // phase2: evict layers

@@ -457,7 +456,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
            if let Some(next) = next {
                match next {
                    Ok(Ok(file_size)) => {
-                        METRICS.layers_evicted.inc();
                        usage_assumed.add_available_bytes(file_size);
                    }
                    Ok(Err((
@@ -788,7 +786,6 @@ async fn collect_eviction_candidates(
    eviction_order: EvictionOrder,
    cancel: &CancellationToken,
 ) -> anyhow::Result<EvictionCandidates> {
-    const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);

    // get a snapshot of the list of tenants
    let tenants = tenant_manager
@@ -822,7 +819,7 @@ async fn collect_eviction_candidates(
            continue;
        }

-        let started_at = std::time::Instant::now();
+        

        // collect layers from all timelines in this tenant
        //
@@ -917,25 +914,11 @@ async fn collect_eviction_candidates(
                    (partition, candidate)
                });

-        METRICS
-            .tenant_layer_count
-            .observe(tenant_candidates.len() as f64);

        candidates.extend(tenant_candidates);

-        let elapsed = started_at.elapsed();
-        METRICS
-            .tenant_collection_time
-            .observe(elapsed.as_secs_f64());
+       

-        if elapsed > LOG_DURATION_THRESHOLD {
-            tracing::info!(
-                tenant_id=%tenant.tenant_shard_id().tenant_id,
-                shard_id=%tenant.tenant_shard_id().shard_slug(),
-                elapsed_ms = elapsed.as_millis(),
-                "collection took longer than threshold"
-            );
-        }
    }

    // Note: the same tenant ID might be hit twice, if it transitions from attached to
@@ -962,7 +945,7 @@ async fn collect_eviction_candidates(
            layer_info.resident_layers.len()
        );

-        let started_at = std::time::Instant::now();
+       

        layer_info
            .resident_layers
@@ -984,28 +967,13 @@ async fn collect_eviction_candidates(
                        candidate,
                    )
                });
-
-        METRICS
-            .tenant_layer_count
-            .observe(tenant_candidates.len() as f64);
        candidates.extend(tenant_candidates);

        tokio::task::yield_now().await;

-        let elapsed = started_at.elapsed();
+    

-        METRICS
-            .tenant_collection_time
-            .observe(elapsed.as_secs_f64());
-
-        if elapsed > LOG_DURATION_THRESHOLD {
-            tracing::info!(
-                tenant_id=%tenant.tenant_shard_id().tenant_id,
-                shard_id=%tenant.tenant_shard_id().shard_slug(),
-                elapsed_ms = elapsed.as_millis(),
-                "collection took longer than threshold"
-            );
-        }
+        
    }

    debug_assert!(
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -3253,7 +3253,7 @@ async fn ingest_aux_files(
        modification
            .put_file(&fname, content.as_bytes(), &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;
    }
    modification
        .commit(&ctx)
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -24,10 +24,9 @@ use wal_decoder::models::InterpretedWalRecord;
 use walkdir::WalkDir;

 use crate::context::RequestContext;
-use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::Timeline;
-use crate::walingest::WalIngest;
+use crate::walingest::{WalIngest, WalIngestErrorKind};

 // Returns checkpoint LSN from controlfile
 pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
@@ -157,9 +156,9 @@ async fn import_rel(
        .put_rel_creation(rel, nblocks as u32, ctx)
        .await
    {
-        match e {
-            RelationError::AlreadyExists => {
-                debug!("Relation {} already exist. We must be extending it.", rel)
+        match e.kind {
+            WalIngestErrorKind::RelationAlreadyExists(rel) => {
+                debug!("Relation {rel} already exists. We must be extending it.")
            }
            _ => return Err(e.into()),
        }
@@ -324,7 +323,6 @@ async fn import_wal(
                walingest
                    .ingest_record(interpreted, &mut modification, ctx)
                    .await?;
-                WAL_INGEST.records_committed.inc();

                modification.commit(ctx).await?;
                last_lsn = lsn;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -77,7 +77,6 @@ use anyhow::Context;
 use once_cell::sync::OnceCell;

 use crate::context::RequestContext;
-use crate::metrics::{PageCacheSizeMetrics, page_cache_eviction_metrics};
 use crate::virtual_file::{IoBufferMut, IoPageSlice};

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -195,7 +194,7 @@ impl SlotInner {
 }

 pub struct PageCache {
-    immutable_page_map: std::sync::RwLock<HashMap<(FileId, u32), usize>>,
+    immutable_page_maps: [std::sync::RwLock<HashMap<(FileId, u32), usize>>; 16],

    /// The actual buffers with their metadata.
    slots: Box<[Slot]>,
@@ -205,8 +204,103 @@ pub struct PageCache {
    /// Index of the next candidate to evict, for the Clock replacement algorithm.
    /// This is interpreted modulo the page cache size.
    next_evict_slot: AtomicUsize,
+}

-    size_metrics: &'static PageCacheSizeMetrics,
+impl PageCache {
+    /// Returns the shard index for a key: the low 4 bits of the block number (`blkno`); `_file_id` is ignored.
+    fn shard_index(_file_id: &FileId, blkno: u32) -> usize {
+        (blkno & 0xF) as usize
+    }
+
+    /// Search for a page in the cache using the given search key.
+    ///
+    /// Returns the slot index, if any.
+    ///
+    /// NOTE: We don't hold any lock on the mapping on return, so the slot might
+    /// get recycled for an unrelated page immediately after this function
+    /// returns. The caller is responsible for re-checking that the slot still
+    /// contains the page with the same key before using it.
+    ///
+    fn search_mapping(&self, cache_key: &CacheKey) -> Option<usize> {
+        match cache_key {
+            CacheKey::ImmutableFilePage { file_id, blkno } => {
+                let shard_idx = Self::shard_index(file_id, *blkno);
+                let map = self.immutable_page_maps[shard_idx].read().unwrap();
+                Some(*map.get(&(*file_id, *blkno))?)
+            }
+        }
+    }
+
+    ///
+    /// Remove mapping for given key.
+    ///
+    fn remove_mapping(&self, old_key: &CacheKey) {
+        match old_key {
+            CacheKey::ImmutableFilePage { file_id, blkno } => {
+                let shard_idx = Self::shard_index(file_id, *blkno);
+                let mut map = self.immutable_page_maps[shard_idx].write().unwrap();
+                map.remove(&(*file_id, *blkno))
+                    .expect("could not find old key in mapping");
+            }
+        }
+    }
+
+    ///
+    /// Insert mapping for given key.
+    ///
+    /// If a mapping already existed for the given key, returns the slot index
+    /// of the existing mapping and leaves it untouched.
+    fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
+        match new_key {
+            CacheKey::ImmutableFilePage { file_id, blkno } => {
+                let shard_idx = Self::shard_index(file_id, *blkno);
+                let mut map = self.immutable_page_maps[shard_idx].write().unwrap();
+                match map.entry((*file_id, *blkno)) {
+                    Entry::Occupied(entry) => Some(*entry.get()),
+                    Entry::Vacant(entry) => {
+                        entry.insert(slot_idx);
+                        None
+                    }
+                }
+            }
+        }
+    }
+
+    /// Initialize a new page cache
+    ///
+    /// This should be called only once at page server startup.
+    fn new(num_pages: usize) -> Self {
+        assert!(num_pages > 0, "page cache size must be > 0");
+
+        // We could use Vec::leak here, but that potentially also leaks
+        // uninitialized reserved capacity. With into_boxed_slice and Box::leak
+        // this is avoided.
+        let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak();
+
+        let slots = page_buffer
+            .chunks_exact_mut(PAGE_SZ)
+            .map(|chunk| {
+                // SAFETY: Each chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned.
+                let buf = unsafe { IoPageSlice::new_unchecked(chunk.try_into().unwrap()) };
+
+                Slot {
+                    inner: tokio::sync::RwLock::new(SlotInner {
+                        key: None,
+                        buf,
+                        permit: std::sync::Mutex::new(Weak::new()),
+                    }),
+                    usage_count: AtomicU8::new(0),
+                }
+            })
+            .collect();
+
+        Self {
+            immutable_page_maps: Default::default(),
+            slots,
+            next_evict_slot: AtomicUsize::new(0),
+            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
+        }
+    }
 }

 struct PinnedSlotsPermit {
@@ -414,32 +508,17 @@ impl PageCache {
    async fn lock_for_read(
        &self,
        cache_key: &CacheKey,
-        ctx: &RequestContext,
+        _ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
        let mut permit = Some(self.try_get_pinned_slot_permit().await?);

-        let (read_access, hit) = match cache_key {
-            CacheKey::ImmutableFilePage { .. } => (
-                &crate::metrics::PAGE_CACHE
-                    .for_ctx(ctx)
-                    .read_accesses_immutable,
-                &crate::metrics::PAGE_CACHE.for_ctx(ctx).read_hits_immutable,
-            ),
-        };
-        read_access.inc();
-
-        let mut is_first_iteration = true;
        loop {
            // First check if the key already exists in the cache.
            if let Some(read_guard) = self.try_lock_for_read(cache_key, &mut permit).await {
                debug_assert!(permit.is_none());
-                if is_first_iteration {
-                    hit.inc();
-                }
                return Ok(ReadBufResult::Found(read_guard));
            }
            debug_assert!(permit.is_some());
-            is_first_iteration = false;

            // Not found. Find a victim buffer
            let (slot_idx, mut inner) = self
@@ -484,63 +563,6 @@ impl PageCache {
        }
    }

-    //
-    // Section 3: Mapping functions
-    //
-
-    /// Search for a page in the cache using the given search key.
-    ///
-    /// Returns the slot index, if any.
-    ///
-    /// NOTE: We don't hold any lock on the mapping on return, so the slot might
-    /// get recycled for an unrelated page immediately after this function
-    /// returns.  The caller is responsible for re-checking that the slot still
-    /// contains the page with the same key before using it.
-    ///
-    fn search_mapping(&self, cache_key: &CacheKey) -> Option<usize> {
-        match cache_key {
-            CacheKey::ImmutableFilePage { file_id, blkno } => {
-                let map = self.immutable_page_map.read().unwrap();
-                Some(*map.get(&(*file_id, *blkno))?)
-            }
-        }
-    }
-
-    ///
-    /// Remove mapping for given key.
-    ///
-    fn remove_mapping(&self, old_key: &CacheKey) {
-        match old_key {
-            CacheKey::ImmutableFilePage { file_id, blkno } => {
-                let mut map = self.immutable_page_map.write().unwrap();
-                map.remove(&(*file_id, *blkno))
-                    .expect("could not find old key in mapping");
-                self.size_metrics.current_bytes_immutable.sub_page_sz(1);
-            }
-        }
-    }
-
-    ///
-    /// Insert mapping for given key.
-    ///
-    /// If a mapping already existed for the given key, returns the slot index
-    /// of the existing mapping and leaves it untouched.
-    fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
-        match new_key {
-            CacheKey::ImmutableFilePage { file_id, blkno } => {
-                let mut map = self.immutable_page_map.write().unwrap();
-                match map.entry((*file_id, *blkno)) {
-                    Entry::Occupied(entry) => Some(*entry.get()),
-                    Entry::Vacant(entry) => {
-                        entry.insert(slot_idx);
-                        self.size_metrics.current_bytes_immutable.add_page_sz(1);
-                        None
-                    }
-                }
-            }
-        }
-    }
-
    //
    // Section 4: Misc internal helpers
    //
@@ -595,11 +617,7 @@ impl PageCache {
                            // Note that just yielding to tokio during iteration without such
                            // priority boosting is likely counter-productive. We'd just give more opportunities
                            // for B to bump usage count, further starving A.
-                            page_cache_eviction_metrics::observe(
-                                page_cache_eviction_metrics::Outcome::ItersExceeded {
-                                    iters: iters.try_into().unwrap(),
-                                },
-                            );
+                            
                            anyhow::bail!("exceeded evict iter limit");
                        }
                        continue;
@@ -609,84 +627,12 @@ impl PageCache {
                    // remove mapping for old buffer
                    self.remove_mapping(old_key);
                    inner.key = None;
-                    page_cache_eviction_metrics::observe(
-                        page_cache_eviction_metrics::Outcome::FoundSlotEvicted {
-                            iters: iters.try_into().unwrap(),
-                        },
-                    );
-                } else {
-                    page_cache_eviction_metrics::observe(
-                        page_cache_eviction_metrics::Outcome::FoundSlotUnused {
-                            iters: iters.try_into().unwrap(),
-                        },
-                    );
-                }
+                    
+                } 
                return Ok((slot_idx, inner));
            }
        }
    }

-    /// Initialize a new page cache
-    ///
-    /// This should be called only once at page server startup.
-    fn new(num_pages: usize) -> Self {
-        assert!(num_pages > 0, "page cache size must be > 0");
-
-        // We could use Vec::leak here, but that potentially also leaks
-        // uninitialized reserved capacity. With into_boxed_slice and Box::leak
-        // this is avoided.
-        let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak();
-
-        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
-        size_metrics.max_bytes.set_page_sz(num_pages);
-        size_metrics.current_bytes_immutable.set_page_sz(0);
-
-        let slots = page_buffer
-            .chunks_exact_mut(PAGE_SZ)
-            .map(|chunk| {
-                // SAFETY: Each chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned.
-                let buf = unsafe { IoPageSlice::new_unchecked(chunk.try_into().unwrap()) };
-
-                Slot {
-                    inner: tokio::sync::RwLock::new(SlotInner {
-                        key: None,
-                        buf,
-                        permit: std::sync::Mutex::new(Weak::new()),
-                    }),
-                    usage_count: AtomicU8::new(0),
-                }
-            })
-            .collect();
-
-        Self {
-            immutable_page_map: Default::default(),
-            slots,
-            next_evict_slot: AtomicUsize::new(0),
-            size_metrics,
-            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
-        }
-    }
 }

-trait PageSzBytesMetric {
-    fn set_page_sz(&self, count: usize);
-    fn add_page_sz(&self, count: usize);
-    fn sub_page_sz(&self, count: usize);
-}
-
-#[inline(always)]
-fn count_times_page_sz(count: usize) -> u64 {
-    u64::try_from(count).unwrap() * u64::try_from(PAGE_SZ).unwrap()
-}
-
-impl PageSzBytesMetric for metrics::UIntGauge {
-    fn set_page_sz(&self, count: usize) {
-        self.set(count_times_page_sz(count));
-    }
-    fn add_page_sz(&self, count: usize) {
-        self.add(count_times_page_sz(count));
-    }
-    fn sub_page_sz(&self, count: usize) {
-        self.sub(count_times_page_sz(count));
-    }
-}
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -15,10 +15,11 @@ use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
 use futures::FutureExt;
 use itertools::Itertools;
+use jsonwebtoken::TokenData;
 use once_cell::sync::OnceCell;
 use pageserver_api::config::{
    PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
-    PageServiceProtocolPipelinedExecutionStrategy,
+    PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
 };
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::models::{
@@ -58,8 +59,7 @@ use crate::context::{
    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
 };
 use crate::metrics::{
-    self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer,
-    TimelineMetrics,
+    self, GetPageBatchBreakReason, SmgrOpTimer, TimelineMetrics,
 };
 use crate::pgdatadir_mapping::Version;
 use crate::span::{
@@ -105,6 +105,7 @@ pub fn spawn(
    pg_auth: Option<Arc<SwappableJwtAuth>>,
    perf_trace_dispatch: Option<Dispatch>,
    tcp_listener: tokio::net::TcpListener,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
 ) -> Listener {
    let cancel = CancellationToken::new();
    let libpq_ctx = RequestContext::todo_child(
@@ -124,6 +125,7 @@ pub fn spawn(
            perf_trace_dispatch,
            tcp_listener,
            conf.pg_auth_type,
+            tls_config,
            conf.page_service_pipelining.clone(),
            libpq_ctx,
            cancel.clone(),
@@ -181,6 +183,7 @@ pub async fn libpq_listener_main(
    perf_trace_dispatch: Option<Dispatch>,
    listener: tokio::net::TcpListener,
    auth_type: AuthType,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
    pipelining_config: PageServicePipeliningConfig,
    listener_ctx: RequestContext,
    listener_cancel: CancellationToken,
@@ -223,6 +226,7 @@ pub async fn libpq_listener_main(
                    local_auth,
                    socket,
                    auth_type,
+                    tls_config.clone(),
                    pipelining_config.clone(),
                    connection_ctx,
                    connections_cancel.child_token(),
@@ -264,14 +268,12 @@ async fn page_service_conn_main(
    auth: Option<Arc<SwappableJwtAuth>>,
    socket: tokio::net::TcpStream,
    auth_type: AuthType,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
    pipelining_config: PageServicePipeliningConfig,
    connection_ctx: RequestContext,
    cancel: CancellationToken,
    gate_guard: GateGuard,
 ) -> ConnectionHandlerResult {
-    let _guard = LIVE_CONNECTIONS
-        .with_label_values(&["page_service"])
-        .guard();

    socket
        .set_nodelay(true)
@@ -334,7 +336,8 @@ async fn page_service_conn_main(
        cancel.clone(),
        gate_guard,
    );
-    let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?;
+    let pgbackend =
+        PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, tls_config)?;

    match pgbackend.run(&mut conn_handler, &cancel).await {
        Ok(()) => {
@@ -634,14 +637,13 @@ impl std::fmt::Display for BatchedPageStreamError {

 struct BatchedGetPageRequest {
    req: PagestreamGetPageRequest,
-    timer: SmgrOpTimer,
+    effective_request_lsn: Lsn,
    ctx: RequestContext,
 }

 #[cfg(feature = "testing")]
 struct BatchedTestRequest {
    req: models::PagestreamTestRequest,
-    timer: SmgrOpTimer,
 }

 /// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum,
@@ -651,31 +653,31 @@ struct BatchedTestRequest {
 enum BatchedFeMessage {
    Exists {
        span: Span,
-        timer: SmgrOpTimer,
+       
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamExistsRequest,
    },
    Nblocks {
        span: Span,
-        timer: SmgrOpTimer,
+     
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamNblocksRequest,
    },
    GetPage {
        span: Span,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
-        effective_request_lsn: Lsn,
        pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
+        batch_break_reason: GetPageBatchBreakReason,
    },
    DbSize {
        span: Span,
-        timer: SmgrOpTimer,
+  
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamDbSizeRequest,
    },
    GetSlruSegment {
        span: Span,
-        timer: SmgrOpTimer,
+   
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamGetSlruSegmentRequest,
    },
@@ -696,26 +698,119 @@ impl BatchedFeMessage {
        self.into()
    }

-    fn observe_execution_start(&mut self, at: Instant) {
-        match self {
-            BatchedFeMessage::Exists { timer, .. }
-            | BatchedFeMessage::Nblocks { timer, .. }
-            | BatchedFeMessage::DbSize { timer, .. }
-            | BatchedFeMessage::GetSlruSegment { timer, .. } => {
-                timer.observe_execution_start(at);
-            }
-            BatchedFeMessage::GetPage { pages, .. } => {
-                for page in pages {
-                    page.timer.observe_execution_start(at);
+    fn observe_execution_start(&mut self, _at: Instant) {
+    }
+
+    fn should_break_batch(
+        &self,
+        other: &BatchedFeMessage,
+        max_batch_size: NonZeroUsize,
+        batching_strategy: PageServiceProtocolPipelinedBatchingStrategy,
+    ) -> Option<GetPageBatchBreakReason> {
+        match (self, other) {
+            (
+                BatchedFeMessage::GetPage {
+                    shard: accum_shard,
+                    pages: accum_pages,
+                    ..
+                },
+                BatchedFeMessage::GetPage {
+                    shard: this_shard,
+                    pages: this_pages,
+                    ..
+                },
+            ) => {
+                assert_eq!(this_pages.len(), 1);
+                if accum_pages.len() >= max_batch_size.get() {
+                    trace!(%max_batch_size, "stopping batching because of batch size");
+                    assert_eq!(accum_pages.len(), max_batch_size.get());
+
+                    return Some(GetPageBatchBreakReason::BatchFull);
                }
+                if !accum_shard.is_same_handle_as(this_shard) {
+                    trace!("stopping batching because timeline object mismatch");
+                    // TODO: we _could_ batch & execute each shard separately (and in parallel).
+                    // But the current logic for keeping responses in order does not support that.
+
+                    return Some(GetPageBatchBreakReason::NonUniformTimeline);
+                }
+
+                match batching_strategy {
+                    PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => {
+                        if let Some(last_in_batch) = accum_pages.last() {
+                            if last_in_batch.effective_request_lsn
+                                != this_pages[0].effective_request_lsn
+                            {
+                                trace!(
+                                    accum_lsn = %last_in_batch.effective_request_lsn,
+                                    this_lsn = %this_pages[0].effective_request_lsn,
+                                    "stopping batching because LSN changed"
+                                );
+
+                                return Some(GetPageBatchBreakReason::NonUniformLsn);
+                            }
+                        }
+                    }
+                    PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => {
+                        // The read path doesn't currently support serving the same page at different LSNs.
+                        // While technically possible, it's uncertain if the complexity is worth it.
+                        // Break the batch if such a case is encountered.
+                        let same_page_different_lsn = accum_pages.iter().any(|batched| {
+                            batched.req.rel == this_pages[0].req.rel
+                                && batched.req.blkno == this_pages[0].req.blkno
+                                && batched.effective_request_lsn
+                                    != this_pages[0].effective_request_lsn
+                        });
+
+                        if same_page_different_lsn {
+                            trace!(
+                                rel=%this_pages[0].req.rel,
+                                blkno=%this_pages[0].req.blkno,
+                                lsn=%this_pages[0].effective_request_lsn,
+                                "stopping batching because same page was requested at different LSNs"
+                            );
+
+                            return Some(GetPageBatchBreakReason::SamePageAtDifferentLsn);
+                        }
+                    }
+                }
+
+                None
            }
            #[cfg(feature = "testing")]
-            BatchedFeMessage::Test { requests, .. } => {
-                for req in requests {
-                    req.timer.observe_execution_start(at);
+            (
+                BatchedFeMessage::Test {
+                    shard: accum_shard,
+                    requests: accum_requests,
+                    ..
+                },
+                BatchedFeMessage::Test {
+                    shard: this_shard,
+                    requests: this_requests,
+                    ..
+                },
+            ) => {
+                assert!(this_requests.len() == 1);
+                if accum_requests.len() >= max_batch_size.get() {
+                    trace!(%max_batch_size, "stopping batching because of batch size");
+                    assert_eq!(accum_requests.len(), max_batch_size.get());
+                    return Some(GetPageBatchBreakReason::BatchFull);
                }
+                if !accum_shard.is_same_handle_as(this_shard) {
+                    trace!("stopping batching because timeline object mismatch");
+                    // TODO: we _could_ batch & execute each shard separately (and in parallel).
+                    // But the current logic for keeping responses in order does not support that.
+                    return Some(GetPageBatchBreakReason::NonUniformTimeline);
+                }
+                let this_batch_key = this_requests[0].req.batch_key;
+                let accum_batch_key = accum_requests[0].req.batch_key;
+                if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
+                    trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
+                    return Some(GetPageBatchBreakReason::NonUniformKey);
+                }
+                None
            }
-            BatchedFeMessage::RespondError { .. } => {}
+            (_, _) => Some(GetPageBatchBreakReason::NonBatchableRequest),
        }
    }
 }
@@ -843,7 +938,7 @@ impl PageServerHandler {
                    .await?;
                debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
                let span = tracing::info_span!(parent: &parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
-                let timer = record_op_start_and_throttle(
+                record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetRelExists,
                    received_at,
@@ -851,7 +946,7 @@ impl PageServerHandler {
                .await?;
                BatchedFeMessage::Exists {
                    span,
-                    timer,
+                
                    shard: shard.downgrade(),
                    req,
                }
@@ -861,7 +956,7 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .await?;
                let span = tracing::info_span!(parent: &parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
-                let timer = record_op_start_and_throttle(
+                record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetRelSize,
                    received_at,
@@ -869,7 +964,7 @@ impl PageServerHandler {
                .await?;
                BatchedFeMessage::Nblocks {
                    span,
-                    timer,
+                   
                    shard: shard.downgrade(),
                    req,
                }
@@ -879,7 +974,7 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .await?;
                let span = tracing::info_span!(parent: &parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
-                let timer = record_op_start_and_throttle(
+                record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetDbSize,
                    received_at,
@@ -887,7 +982,7 @@ impl PageServerHandler {
                .await?;
                BatchedFeMessage::DbSize {
                    span,
-                    timer,
+              
                    shard: shard.downgrade(),
                    req,
                }
@@ -897,7 +992,7 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .await?;
                let span = tracing::info_span!(parent: &parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
-                let timer = record_op_start_and_throttle(
+                record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetSlruSegment,
                    received_at,
@@ -905,7 +1000,7 @@ impl PageServerHandler {
                .await?;
                BatchedFeMessage::GetSlruSegment {
                    span,
-                    timer,
+               
                    shard: shard.downgrade(),
                    req,
                }
@@ -1004,7 +1099,7 @@ impl PageServerHandler {
                // request handler log messages contain the request-specific fields.
                let span = mkspan!(shard.tenant_shard_id.shard_slug());

-                let timer = record_op_start_and_throttle(
+                record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetPageAtLsn,
                    received_at,
@@ -1019,34 +1114,31 @@ impl PageServerHandler {
                .await?;

                // We're holding the Handle
-                // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
-                let res = Self::wait_or_get_last_lsn(
+                let effective_request_lsn = match Self::effective_request_lsn(
                    &shard,
+                    shard.get_last_record_lsn(),
                    req.hdr.request_lsn,
                    req.hdr.not_modified_since,
                    &shard.get_applied_gc_cutoff_lsn(),
-                    &ctx,
-                )
-                .maybe_perf_instrument(&ctx, |current_perf_span| {
-                    info_span!(
-                        target: PERF_TRACE_TARGET,
-                        parent: current_perf_span,
-                        "WAIT_LSN",
-                    )
-                })
-                .await;
-
-                let effective_request_lsn = match res {
+                ) {
                    Ok(lsn) => lsn,
                    Err(e) => {
                        return respond_error!(span, e);
                    }
                };
+
                BatchedFeMessage::GetPage {
                    span,
                    shard: shard.downgrade(),
-                    effective_request_lsn,
-                    pages: smallvec::smallvec![BatchedGetPageRequest { req, timer, ctx }],
+                    pages: smallvec::smallvec![BatchedGetPageRequest {
+                        req,
+                        effective_request_lsn,
+                        ctx,
+                    }],
+                    // The executor grabs the batch when it becomes idle.
+                    // Hence, [`GetPageBatchBreakReason::ExecutorSteal`] is the
+                    // default reason for breaking the batch.
+                    batch_break_reason: GetPageBatchBreakReason::ExecutorSteal,
                }
            }
            #[cfg(feature = "testing")]
@@ -1055,13 +1147,12 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .await?;
                let span = tracing::info_span!(parent: &parent_span, "handle_test_request", shard_id = %shard.tenant_shard_id.shard_slug());
-                let timer =
-                    record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
+                record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
                        .await?;
                BatchedFeMessage::Test {
                    span,
                    shard: shard.downgrade(),
-                    requests: vec![BatchedTestRequest { req, timer }],
+                    requests: vec![BatchedTestRequest { req,  }],
                }
            }
        };
@@ -1072,6 +1163,7 @@ impl PageServerHandler {
    #[instrument(skip_all, level = tracing::Level::TRACE)]
    #[allow(clippy::boxed_local)]
    fn pagestream_do_batch(
+        batching_strategy: PageServiceProtocolPipelinedBatchingStrategy,
        max_batch_size: NonZeroUsize,
        batch: &mut Result<BatchedFeMessage, QueryError>,
        this_msg: Result<BatchedFeMessage, QueryError>,
@@ -1083,90 +1175,59 @@ impl PageServerHandler {
            Err(e) => return Err(Err(e)),
        };

-        match (&mut *batch, this_msg) {
-            // something batched already, let's see if we can add this message to the batch
-            (
-                Ok(BatchedFeMessage::GetPage {
-                    span: _,
-                    shard: accum_shard,
-                    pages: accum_pages,
-                    effective_request_lsn: accum_lsn,
-                }),
-                BatchedFeMessage::GetPage {
-                    span: _,
-                    shard: this_shard,
-                    pages: this_pages,
-                    effective_request_lsn: this_lsn,
-                },
-            ) if (|| {
-                assert_eq!(this_pages.len(), 1);
-                if accum_pages.len() >= max_batch_size.get() {
-                    trace!(%accum_lsn, %this_lsn, %max_batch_size, "stopping batching because of batch size");
-                    assert_eq!(accum_pages.len(), max_batch_size.get());
-                    return false;
-                }
-                if !accum_shard.is_same_handle_as(&this_shard) {
-                    trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch");
-                    // TODO: we _could_ batch & execute each shard seperately (and in parallel).
-                    // But the current logic for keeping responses in order does not support that.
-                    return false;
-                }
-                // the vectored get currently only supports a single LSN, so, bounce as soon
-                // as the effective request_lsn changes
-                if *accum_lsn != this_lsn {
-                    trace!(%accum_lsn, %this_lsn, "stopping batching because LSN changed");
-                    return false;
-                }
-                true
-            })() =>
-            {
-                // ok to batch
-                accum_pages.extend(this_pages);
-                Ok(())
+        let eligible_batch = match batch {
+            Ok(b) => b,
+            Err(_) => {
+                return Err(Ok(this_msg));
            }
-            #[cfg(feature = "testing")]
-            (
-                Ok(BatchedFeMessage::Test {
-                    shard: accum_shard,
-                    requests: accum_requests,
-                    ..
-                }),
-                BatchedFeMessage::Test {
-                    shard: this_shard,
-                    requests: this_requests,
-                    ..
-                },
-            ) if (|| {
-                assert!(this_requests.len() == 1);
-                if accum_requests.len() >= max_batch_size.get() {
-                    trace!(%max_batch_size, "stopping batching because of batch size");
-                    assert_eq!(accum_requests.len(), max_batch_size.get());
-                    return false;
+        };
+
+        let batch_break =
+            eligible_batch.should_break_batch(&this_msg, max_batch_size, batching_strategy);
+
+        match batch_break {
+            Some(reason) => {
+                if let BatchedFeMessage::GetPage {
+                    batch_break_reason, ..
+                } = eligible_batch
+                {
+                    *batch_break_reason = reason;
                }
-                if !accum_shard.is_same_handle_as(&this_shard) {
-                    trace!("stopping batching because timeline object mismatch");
-                    // TODO: we _could_ batch & execute each shard seperately (and in parallel).
-                    // But the current logic for keeping responses in order does not support that.
-                    return false;
-                }
-                let this_batch_key = this_requests[0].req.batch_key;
-                let accum_batch_key = accum_requests[0].req.batch_key;
-                if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
-                    trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
-                    return false;
-                }
-                true
-            })() =>
-            {
-                // ok to batch
-                accum_requests.extend(this_requests);
-                Ok(())
-            }
-            // something batched already but this message is unbatchable
-            (_, this_msg) => {
-                // by default, don't continue batching
+
                Err(Ok(this_msg))
            }
+            None => {
+                // ok to batch
+                match (eligible_batch, this_msg) {
+                    (
+                        BatchedFeMessage::GetPage {
+                            pages: accum_pages, ..
+                        },
+                        BatchedFeMessage::GetPage {
+                            pages: this_pages, ..
+                        },
+                    ) => {
+                        accum_pages.extend(this_pages);
+                        Ok(())
+                    }
+                    #[cfg(feature = "testing")]
+                    (
+                        BatchedFeMessage::Test {
+                            requests: accum_requests,
+                            ..
+                        },
+                        BatchedFeMessage::Test {
+                            requests: this_requests,
+                            ..
+                        },
+                    ) => {
+                        accum_requests.extend(this_requests);
+                        Ok(())
+                    }
+                    // Shape guaranteed by [`BatchedFeMessage::should_break_batch`]
+                    _ => unreachable!(),
+                }
+            }
        }
    }

@@ -1192,7 +1253,7 @@ impl PageServerHandler {

        // Dispatch the batch to the appropriate request handler.
        let log_slow_name = batch.as_static_str();
-        let (mut handler_results, span) = {
+        let (handler_results, span) = {
            // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and
            // won't fit on the stack.
            let mut boxpinned =
@@ -1222,31 +1283,31 @@ impl PageServerHandler {
        // call, which (all unmeasured) adds syscall overhead but reduces time to first byte
        // and avoids building up a "giant" contiguous userspace buffer to hold the entire response.
        // TODO: vectored socket IO would be great, but pgb_writer doesn't support that.
-        let flush_timers = {
-            let flushing_start_time = Instant::now();
-            let mut flush_timers = Vec::with_capacity(handler_results.len());
-            for handler_result in &mut handler_results {
-                let flush_timer = match handler_result {
-                    Ok((_, timer)) => Some(
-                        timer
-                            .observe_execution_end(flushing_start_time)
-                            .expect("we are the first caller"),
-                    ),
-                    Err(_) => {
-                        // TODO: measure errors
-                        None
-                    }
-                };
-                flush_timers.push(flush_timer);
-            }
-            assert_eq!(flush_timers.len(), handler_results.len());
-            flush_timers
-        };
+        // let flush_timers = {
+        //     let flushing_start_time = Instant::now();
+        //     let mut flush_timers = Vec::with_capacity(handler_results.len());
+        //     for handler_result in &mut handler_results {
+        //         let flush_timer = match handler_result {
+        //             Ok((_, timer)) => Some(
+        //                 timer
+        //                     .observe_execution_end(flushing_start_time)
+        //                     .expect("we are the first caller"),
+        //             ),
+        //             Err(_) => {
+        //                 // TODO: measure errors
+        //                 None
+        //             }
+        //         };
+        //         flush_timers.push(flush_timer);
+        //     }
+        //     assert_eq!(flush_timers.len(), handler_results.len());
+        //     flush_timers
+        // };

        // Map handler result to protocol behavior.
        // Some handler errors cause exit from pagestream protocol.
        // Other handler errors are sent back as an error message and we stay in pagestream protocol.
-        for (handler_result, flushing_timer) in handler_results.into_iter().zip(flush_timers) {
+        for handler_result in handler_results.into_iter() {
            let response_msg = match handler_result {
                Err(e) => match &e.err {
                    PageStreamError::Shutdown => {
@@ -1278,7 +1339,7 @@ impl PageServerHandler {
                        })
                    }
                },
-                Ok((response_msg, _op_timer_already_observed)) => response_msg,
+                Ok((response_msg, )) => response_msg,
            };

            //
@@ -1292,17 +1353,17 @@ impl PageServerHandler {
            failpoint_support::sleep_millis_async!("before-pagestream-msg-flush", cancel);

            // what we want to do
-            let socket_fd = pgb_writer.socket_fd;
+           
            let flush_fut = pgb_writer.flush();
            // metric for how long flushing takes
-            let flush_fut = match flushing_timer {
-                Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure(
-                    Instant::now(),
-                    flush_fut,
-                    socket_fd,
-                )),
-                None => futures::future::Either::Right(flush_fut),
-            };
+            // let flush_fut = match flushing_timer {
+            //     Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure(
+            //         Instant::now(),
+            //         flush_fut,
+            //         socket_fd,
+            //     )),
+            //     None => futures::future::Either::Right(flush_fut),
+            // };
            // do it while respecting cancellation
            let _: () = async move {
                tokio::select! {
@@ -1332,7 +1393,7 @@ impl PageServerHandler {
        ctx: &RequestContext,
    ) -> Result<
        (
-            Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>>,
+            Vec<Result<(PagestreamBeMessage, ), BatchedPageStreamError>>,
            Span,
        ),
        QueryError,
@@ -1348,7 +1409,7 @@ impl PageServerHandler {
        Ok(match batch {
            BatchedFeMessage::Exists {
                span,
-                timer,
+               
                shard,
                req,
            } => {
@@ -1359,7 +1420,7 @@ impl PageServerHandler {
                        self.handle_get_rel_exists_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
-                            .map(|msg| (msg, timer))
+                            .map(|msg| (msg, ))
                            .map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
                    ],
                    span,
@@ -1367,7 +1428,7 @@ impl PageServerHandler {
            }
            BatchedFeMessage::Nblocks {
                span,
-                timer,
+           
                shard,
                req,
            } => {
@@ -1378,7 +1439,7 @@ impl PageServerHandler {
                        self.handle_get_nblocks_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
-                            .map(|msg| (msg, timer))
+                            .map(|msg| (msg, ))
                            .map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
                    ],
                    span,
@@ -1387,8 +1448,8 @@ impl PageServerHandler {
            BatchedFeMessage::GetPage {
                span,
                shard,
-                effective_request_lsn,
                pages,
+                batch_break_reason,
            } => {
                fail::fail_point!("ps::handle-pagerequest-message::getpage");
                let (shard, ctx) = upgrade_handle_and_set_context!(shard);
@@ -1399,9 +1460,9 @@ impl PageServerHandler {
                        let res = self
                            .handle_get_page_at_lsn_request_batched(
                                &shard,
-                                effective_request_lsn,
                                pages,
                                io_concurrency,
+                                batch_break_reason,
                                &ctx,
                            )
                            .instrument(span.clone())
@@ -1414,7 +1475,6 @@ impl PageServerHandler {
            }
            BatchedFeMessage::DbSize {
                span,
-                timer,
                shard,
                req,
            } => {
@@ -1425,7 +1485,7 @@ impl PageServerHandler {
                        self.handle_db_size_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
-                            .map(|msg| (msg, timer))
+                            .map(|msg| (msg, ))
                            .map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
                    ],
                    span,
@@ -1433,7 +1493,6 @@ impl PageServerHandler {
            }
            BatchedFeMessage::GetSlruSegment {
                span,
-                timer,
                shard,
                req,
            } => {
@@ -1444,7 +1503,7 @@ impl PageServerHandler {
                        self.handle_get_slru_segment_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
-                            .map(|msg| (msg, timer))
+                            .map(|msg| (msg, ))
                            .map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
                    ],
                    span,
@@ -1718,6 +1777,7 @@ impl PageServerHandler {
        let PageServicePipeliningConfigPipelined {
            max_batch_size,
            execution,
+            batching: batching_strategy,
        } = pipelining_config;

        // Macro to _define_ a pipeline stage.
@@ -1769,7 +1829,7 @@ impl PageServerHandler {
                    exit |= read_res.is_err();
                    let could_send = batch_tx
                        .send(read_res, |batch, res| {
-                            Self::pagestream_do_batch(max_batch_size, batch, res)
+                            Self::pagestream_do_batch(batching_strategy, max_batch_size, batch, res)
                        })
                        .await;
                    exit |= could_send.is_err();
@@ -1865,7 +1925,39 @@ impl PageServerHandler {
        ctx: &RequestContext,
    ) -> Result<Lsn, PageStreamError> {
        let last_record_lsn = timeline.get_last_record_lsn();
+        let effective_request_lsn = Self::effective_request_lsn(
+            timeline,
+            last_record_lsn,
+            request_lsn,
+            not_modified_since,
+            latest_gc_cutoff_lsn,
+        )?;

+        if effective_request_lsn > last_record_lsn {
+            timeline
+                .wait_lsn(
+                    not_modified_since,
+                    crate::tenant::timeline::WaitLsnWaiter::PageService,
+                    timeline::WaitLsnTimeout::Default,
+                    ctx,
+                )
+                .await?;
+
+            // Since we waited for 'effective_request_lsn' to arrive, that is now the last
+            // record LSN. (Or close enough for our purposes; the last-record LSN can
+            // advance immediately after we return anyway)
+        }
+
+        Ok(effective_request_lsn)
+    }
+
+    fn effective_request_lsn(
+        timeline: &Timeline,
+        last_record_lsn: Lsn,
+        request_lsn: Lsn,
+        not_modified_since: Lsn,
+        latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
+    ) -> Result<Lsn, PageStreamError> {
        // Sanity check the request
        if request_lsn < not_modified_since {
            return Err(PageStreamError::BadRequest(
@@ -1900,19 +1992,7 @@ impl PageServerHandler {
            }
        }

-        // Wait for WAL up to 'not_modified_since' to arrive, if necessary
        if not_modified_since > last_record_lsn {
-            timeline
-                .wait_lsn(
-                    not_modified_since,
-                    crate::tenant::timeline::WaitLsnWaiter::PageService,
-                    timeline::WaitLsnTimeout::Default,
-                    ctx,
-                )
-                .await?;
-            // Since we waited for 'not_modified_since' to arrive, that is now the last
-            // record LSN. (Or close enough for our purposes; the last-record LSN can
-            // advance immediately after we return anyway)
            Ok(not_modified_since)
        } else {
            // It might be better to use max(not_modified_since, latest_gc_cutoff_lsn)
@@ -2067,17 +2147,13 @@ impl PageServerHandler {
    async fn handle_get_page_at_lsn_request_batched(
        &mut self,
        timeline: &Timeline,
-        effective_lsn: Lsn,
        requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
        io_concurrency: IoConcurrency,
+        _batch_break_reason: GetPageBatchBreakReason,
        ctx: &RequestContext,
-    ) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
+    ) -> Vec<Result<(PagestreamBeMessage, ), BatchedPageStreamError>> {
        debug_assert_current_span_has_tenant_and_timeline_id();

-        timeline
-            .query_metrics
-            .observe_getpage_batch_start(requests.len());
-
        // If a page trace is running, submit an event for this request.
        if let Some(page_trace) = timeline.page_trace.load().as_ref() {
            let time = SystemTime::now();
@@ -2086,20 +2162,81 @@ impl PageServerHandler {
                // Ignore error (trace buffer may be full or tracer may have disconnected).
                _ = page_trace.try_send(PageTraceEvent {
                    key,
-                    effective_lsn,
+                    effective_lsn: batch.effective_request_lsn,
                    time,
                });
            }
        }

+        // If any request in the batch needs to wait for LSN, then do so now.
+        let mut perf_instrument = false;
+        let max_effective_lsn = requests
+            .iter()
+            .map(|req| {
+                if req.ctx.has_perf_span() {
+                    perf_instrument = true;
+                }
+
+                req.effective_request_lsn
+            })
+            .max()
+            .expect("batch is never empty");
+
+        let ctx = match perf_instrument {
+            true => RequestContextBuilder::from(ctx)
+                .root_perf_span(|| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        "GET_VECTORED",
+                        tenant_id = %timeline.tenant_shard_id.tenant_id,
+                        timeline_id = %timeline.timeline_id,
+                        shard = %timeline.tenant_shard_id.shard_slug(),
+                        %max_effective_lsn
+                    )
+                })
+                .attached_child(),
+            false => ctx.attached_child(),
+        };
+
+        let last_record_lsn = timeline.get_last_record_lsn();
+        if max_effective_lsn > last_record_lsn {
+            if let Err(e) = timeline
+                .wait_lsn(
+                    max_effective_lsn,
+                    crate::tenant::timeline::WaitLsnWaiter::PageService,
+                    timeline::WaitLsnTimeout::Default,
+                    &ctx,
+                )
+                .maybe_perf_instrument(&ctx, |current_perf_span| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        parent: current_perf_span,
+                        "WAIT_LSN",
+                    )
+                })
+                .await
+            {
+                return Vec::from_iter(requests.into_iter().map(|req| {
+                    Err(BatchedPageStreamError {
+                        err: PageStreamError::from(e.clone()),
+                        req: req.req.hdr,
+                    })
+                }));
+            }
+        }
+
        let results = timeline
            .get_rel_page_at_lsn_batched(
-                requests
-                    .iter()
-                    .map(|p| (&p.req.rel, &p.req.blkno, p.ctx.attached_child())),
-                effective_lsn,
+                requests.iter().map(|p| {
+                    (
+                        &p.req.rel,
+                        &p.req.blkno,
+                        p.effective_request_lsn,
+                        p.ctx.attached_child(),
+                    )
+                }),
                io_concurrency,
-                ctx,
+                &ctx,
            )
            .await;
        assert_eq!(results.len(), requests.len());
@@ -2116,7 +2253,7 @@ impl PageServerHandler {
                                req: req.req,
                                page,
                            }),
-                            req.timer,
+                            
                        )
                    })
                    .map_err(|e| BatchedPageStreamError {
@@ -2161,7 +2298,7 @@ impl PageServerHandler {
        timeline: &Timeline,
        requests: Vec<BatchedTestRequest>,
        _ctx: &RequestContext,
-    ) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
+    ) -> Vec<Result<(PagestreamBeMessage,), BatchedPageStreamError>> {
        // real requests would do something with the timeline
        let mut results = Vec::with_capacity(requests.len());
        for _req in requests.iter() {
@@ -2187,7 +2324,6 @@ impl PageServerHandler {
                            PagestreamBeMessage::Test(models::PagestreamTestResponse {
                                req: req.req.clone(),
                            }),
-                            req.timer,
                        )
                    })
                    .map_err(|e| BatchedPageStreamError {
@@ -2667,7 +2803,7 @@ where
    ) -> Result<(), QueryError> {
        // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
        // which requires auth to be present
-        let data = self
+        let data: TokenData<Claims> = self
            .auth
            .as_ref()
            .unwrap()
@@ -2742,12 +2878,7 @@ where
                    .record("timeline_id", field::display(timeline_id));

                self.check_permission(Some(tenant_id))?;
-                let command_kind = match protocol_version {
-                    PagestreamProtocolVersion::V2 => ComputeCommandKind::PageStreamV2,
-                    PagestreamProtocolVersion::V3 => ComputeCommandKind::PageStreamV3,
-                };
-                COMPUTE_COMMANDS_COUNTERS.for_command(command_kind).inc();
-
+                
                self.handle_pagerequests(pgb, tenant_id, timeline_id, protocol_version, ctx)
                    .await?;
            }
@@ -2764,10 +2895,7 @@ where

                self.check_permission(Some(tenant_id))?;

-                COMPUTE_COMMANDS_COUNTERS
-                    .for_command(ComputeCommandKind::Basebackup)
-                    .inc();
-                let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording();
+                
                let res = async {
                    self.handle_basebackup_request(
                        pgb,
@@ -2785,7 +2913,7 @@ where
                    Result::<(), QueryError>::Ok(())
                }
                .await;
-                metric_recording.observe(&res);
+
                res?;
            }
            // same as basebackup, but result includes relational data as well
@@ -2801,9 +2929,7 @@ where

                self.check_permission(Some(tenant_id))?;

-                COMPUTE_COMMANDS_COUNTERS
-                    .for_command(ComputeCommandKind::Fullbackup)
-                    .inc();
+                

                // Check that the timeline exists
                self.handle_basebackup_request(
@@ -2837,9 +2963,7 @@ where

                self.check_permission(Some(tenant_shard_id.tenant_id))?;

-                COMPUTE_COMMANDS_COUNTERS
-                    .for_command(ComputeCommandKind::LeaseLsn)
-                    .inc();
+                

                match self
                    .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -6,14 +6,14 @@
 //! walingest.rs handles a few things like implicit relation creation and extension.
 //! Clarify that)
 //!
-use std::collections::{BTreeMap, HashMap, HashSet, hash_map};
+use std::collections::{HashMap, HashSet, hash_map};
 use std::ops::{ControlFlow, Range};

-use crate::PERF_TRACE_TARGET;
-use anyhow::{Context, ensure};
+use crate::walingest::{WalIngestError, WalIngestErrorKind};
+use crate::{PERF_TRACE_TARGET, ensure_walingest};
+use anyhow::Context;
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
-use itertools::Itertools;
 use pageserver_api::key::{
    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, CompactKey, DBDIR_KEY, Key, RelDirExists,
    TWOPHASEDIR_KEY, dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range,
@@ -21,7 +21,7 @@ use pageserver_api::key::{
    repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
 };
-use pageserver_api::keyspace::SparseKeySpace;
+use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace};
 use pageserver_api::models::RelSizeMigration;
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
@@ -36,21 +36,19 @@ use tracing::{debug, info, info_span, trace, warn};
 use utils::bin_ser::{BeSer, DeserializeError};
 use utils::lsn::Lsn;
 use utils::pausable_failpoint;
-use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
+use wal_decoder::serialized_batch::SerializedValueBatch ;

 use super::tenant::{PageReconstructError, Timeline};
 use crate::aux_file;
-use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
+use crate::context::{PerfInstrumentFutureExt, RequestContext};
 use crate::keyspace::{KeySpace, KeySpaceAccum};
-use crate::metrics::{
-    RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
-};
+
 use crate::span::{
    debug_assert_current_span_has_tenant_and_timeline_id,
    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
 };
 use crate::tenant::storage_layer::IoConcurrency;
-use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery};

 /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
 pub const MAX_AUX_FILE_DELTAS: usize = 1024;
@@ -136,12 +134,8 @@ impl From<PageReconstructError> for CalculateLogicalSizeError {

 #[derive(Debug, thiserror::Error)]
 pub enum RelationError {
-    #[error("Relation Already Exists")]
-    AlreadyExists,
    #[error("invalid relnode")]
    InvalidRelnode,
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
 }

 ///
@@ -210,10 +204,9 @@ impl Timeline {
                let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
                let res = self
                    .get_rel_page_at_lsn_batched(
-                        pages
-                            .iter()
-                            .map(|(tag, blknum)| (tag, blknum, ctx.attached_child())),
-                        effective_lsn,
+                        pages.iter().map(|(tag, blknum)| {
+                            (tag, blknum, effective_lsn, ctx.attached_child())
+                        }),
                        io_concurrency.clone(),
                        ctx,
                    )
@@ -251,8 +244,7 @@ impl Timeline {
    /// The ordering of the returned vec corresponds to the ordering of `pages`.
    pub(crate) async fn get_rel_page_at_lsn_batched(
        &self,
-        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, RequestContext)>,
-        effective_lsn: Lsn,
+        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, Lsn, RequestContext)>,
        io_concurrency: IoConcurrency,
        ctx: &RequestContext,
    ) -> Vec<Result<Bytes, PageReconstructError>> {
@@ -265,11 +257,13 @@ impl Timeline {
        let mut result = Vec::with_capacity(pages.len());
        let result_slots = result.spare_capacity_mut();

-        let mut keys_slots: BTreeMap<Key, smallvec::SmallVec<[(usize, RequestContext); 1]>> =
-            BTreeMap::default();
+        let mut keys_slots: HashMap<Key, smallvec::SmallVec<[(usize, RequestContext); 1]>> =
+            HashMap::with_capacity(pages.len());

-        let mut perf_instrument = false;
-        for (response_slot_idx, (tag, blknum, ctx)) in pages.enumerate() {
+        let mut req_keyspaces: HashMap<Lsn, KeySpaceRandomAccum> =
+            HashMap::with_capacity(pages.len());
+
+        for (response_slot_idx, (tag, blknum, lsn, ctx)) in pages.enumerate() {
            if tag.relnode == 0 {
                result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
                    RelationError::InvalidRelnode.into(),
@@ -280,14 +274,14 @@ impl Timeline {
            }

            let nblocks = match self
-                .get_rel_size(*tag, Version::Lsn(effective_lsn), &ctx)
+                .get_rel_size(*tag, Version::Lsn(lsn), &ctx)
                .maybe_perf_instrument(&ctx, |crnt_perf_span| {
                    info_span!(
                        target: PERF_TRACE_TARGET,
                        parent: crnt_perf_span,
                        "GET_REL_SIZE",
                        reltag=%tag,
-                        lsn=%effective_lsn,
+                        lsn=%lsn,
                    )
                })
                .await
@@ -303,7 +297,7 @@ impl Timeline {
            if *blknum >= nblocks {
                debug!(
                    "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
-                    tag, blknum, effective_lsn, nblocks
+                    tag, blknum, lsn, nblocks
                );
                result_slots[response_slot_idx].write(Ok(ZERO_PAGE.clone()));
                slots_filled += 1;
@@ -312,46 +306,29 @@ impl Timeline {

            let key = rel_block_to_key(*tag, *blknum);

-            if ctx.has_perf_span() {
-                perf_instrument = true;
-            }
-
            let key_slots = keys_slots.entry(key).or_default();
            key_slots.push((response_slot_idx, ctx));
+
+            let acc = req_keyspaces.entry(lsn).or_default();
+            acc.add_key(key);
        }

-        let keyspace = {
-            // add_key requires monotonicity
-            let mut acc = KeySpaceAccum::new();
-            for key in keys_slots
-                .keys()
-                // in fact it requires strong monotonicity
-                .dedup()
-            {
-                acc.add_key(*key);
-            }
-            acc.to_keyspace()
-        };
-
-        let ctx = match perf_instrument {
-            true => RequestContextBuilder::from(ctx)
-                .root_perf_span(|| {
-                    info_span!(
-                        target: PERF_TRACE_TARGET,
-                        "GET_VECTORED",
-                        tenant_id = %self.tenant_shard_id.tenant_id,
-                        timeline_id = %self.timeline_id,
-                        lsn = %effective_lsn,
-                        shard = %self.tenant_shard_id.shard_slug(),
-                    )
-                })
-                .attached_child(),
-            false => ctx.attached_child(),
-        };
+        let query: Vec<(Lsn, KeySpace)> = req_keyspaces
+            .into_iter()
+            .map(|(lsn, acc)| (lsn, acc.to_keyspace()))
+            .collect();

+        let query = VersionedKeySpaceQuery::scattered(query);
        let res = self
-            .get_vectored(keyspace, effective_lsn, io_concurrency, &ctx)
-            .maybe_perf_instrument(&ctx, |current_perf_span| current_perf_span.clone())
+            .get_vectored(query, io_concurrency, ctx)
+            .maybe_perf_instrument(ctx, |current_perf_span| {
+                info_span!(
+                    target: PERF_TRACE_TARGET,
+                    parent: current_perf_span,
+                    "GET_BATCH",
+                    batch_size = %page_count,
+                )
+            })
            .await;

        match res {
@@ -381,12 +358,12 @@ impl Timeline {
                        // There is no standardized way to express that the batched span followed from N request spans.
                        // So, abuse the system and mark the request contexts as follows_from the batch span, so we get
                        // some linkage in our trace viewer. It allows us to answer: which GET_VECTORED did this GET_PAGE wait for.
-                        req_ctx.perf_follows_from(&ctx);
+                        req_ctx.perf_follows_from(ctx);
                        slots_filled += 1;
                    }

                    result_slots[first_slot].write(res);
-                    first_req_ctx.perf_follows_from(&ctx);
+                    first_req_ctx.perf_follows_from(ctx);
                    slots_filled += 1;
                }
            }
@@ -425,7 +402,7 @@ impl Timeline {
                        }
                    };

-                    req_ctx.perf_follows_from(&ctx);
+                    req_ctx.perf_follows_from(ctx);
                    result_slots[*slot].write(err);
                }

@@ -664,8 +641,9 @@ impl Timeline {

        let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
        for batch in batches.parts {
+            let query = VersionedKeySpaceQuery::uniform(batch, lsn);
            let blocks = self
-                .get_vectored(batch, lsn, io_concurrency.clone(), ctx)
+                .get_vectored(query, io_concurrency.clone(), ctx)
                .await?;

            for (_key, block) in blocks {
@@ -902,8 +880,9 @@ impl Timeline {
            );

            for batch in batches.parts.into_iter().rev() {
+                let query = VersionedKeySpaceQuery::uniform(batch, probe_lsn);
                let blocks = self
-                    .get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx)
+                    .get_vectored(query, io_concurrency.clone(), ctx)
                    .await?;

                for (_key, clog_page) in blocks.into_iter().rev() {
@@ -1051,19 +1030,16 @@ impl Timeline {
            )
            .await?;
        let mut result = HashMap::new();
-        let mut sz = 0;
+
        for (_, v) in kv {
            let v = v?;
            let v = aux_file::decode_file_value_bytes(&v)
                .context("value decode")
                .map_err(PageReconstructError::Other)?;
            for (fname, content) in v {
-                sz += fname.len();
-                sz += content.len();
                result.insert(fname, content);
            }
        }
-        self.aux_file_size_estimator.on_initial(sz);
        Ok(result)
    }

@@ -1334,12 +1310,12 @@ impl Timeline {
        let rel_size_cache = self.rel_size_cache.read().unwrap();
        if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
            if lsn >= *cached_lsn {
-                RELSIZE_CACHE_HITS.inc();
+                
                return Some(*nblocks);
            }
-            RELSIZE_CACHE_MISSES_OLD.inc();
+            
        }
-        RELSIZE_CACHE_MISSES.inc();
+       
        None
    }

@@ -1364,25 +1340,21 @@ impl Timeline {
            }
            hash_map::Entry::Vacant(entry) => {
                entry.insert((lsn, nblocks));
-                RELSIZE_CACHE_ENTRIES.inc();
+               
            }
        }
    }

    /// Store cached relation size
-    pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
-        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        if rel_size_cache.map.insert(tag, (lsn, nblocks)).is_none() {
-            RELSIZE_CACHE_ENTRIES.inc();
-        }
+    pub fn set_cached_rel_size(&self, _tag: RelTag, _lsn: Lsn, _nblocks: BlockNumber) {
+       
+        
    }

    /// Remove cached relation size
-    pub fn remove_cached_rel_size(&self, tag: &RelTag) {
-        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        if rel_size_cache.map.remove(tag).is_some() {
-            RELSIZE_CACHE_ENTRIES.dec();
-        }
+    pub fn remove_cached_rel_size(&self, _tag: &RelTag) {
+        
+        
    }
 }

@@ -1457,29 +1429,11 @@ impl DatadirModification<'_> {
            .is_some_and(|b| b.has_data())
    }

-    /// Returns statistics about the currently pending modifications.
-    pub(crate) fn stats(&self) -> DatadirModificationStats {
-        let mut stats = DatadirModificationStats::default();
-        for (_, _, value) in self.pending_metadata_pages.values().flatten() {
-            match value {
-                Value::Image(_) => stats.metadata_images += 1,
-                Value::WalRecord(r) if r.will_init() => stats.metadata_images += 1,
-                Value::WalRecord(_) => stats.metadata_deltas += 1,
-            }
-        }
-        for valuemeta in self.pending_data_batch.iter().flat_map(|b| &b.metadata) {
-            match valuemeta {
-                ValueMeta::Serialized(s) if s.will_init => stats.data_images += 1,
-                ValueMeta::Serialized(_) => stats.data_deltas += 1,
-                ValueMeta::Observed(_) => {}
-            }
-        }
-        stats
-    }
+    

    /// Set the current lsn
-    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
-        ensure!(
+    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> {
+        ensure_walingest!(
            lsn >= self.lsn,
            "setting an older lsn {} than {} is not allowed",
            lsn,
@@ -1578,7 +1532,7 @@ impl DatadirModification<'_> {
        &mut self,
        rel: RelTag,
        ctx: &RequestContext,
-    ) -> Result<u32, PageReconstructError> {
+    ) -> Result<u32, WalIngestError> {
        // Get current size and put rel creation if rel doesn't exist
        //
        // NOTE: we check the cache first even though get_rel_exists and get_rel_size would
@@ -1593,14 +1547,13 @@ impl DatadirModification<'_> {
            .await?
        {
            // create it with 0 size initially, the logic below will extend it
-            self.put_rel_creation(rel, 0, ctx)
-                .await
-                .context("Relation Error")?;
+            self.put_rel_creation(rel, 0, ctx).await?;
            Ok(0)
        } else {
-            self.tline
+            Ok(self
+                .tline
                .get_rel_size(rel, Version::Modified(self), ctx)
-                .await
+                .await?)
        }
    }

@@ -1637,11 +1590,14 @@ impl DatadirModification<'_> {
        // TODO(vlad): remove this argument and replace the shard check with is_key_local
        shard: &ShardIdentity,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let mut gaps_at_lsns = Vec::default();

        for meta in batch.metadata.iter() {
-            let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?;
+            let key = Key::from_compact(meta.key());
+            let (rel, blkno) = key
+                .to_rel_block()
+                .map_err(|_| WalIngestErrorKind::InvalidKey(key, meta.lsn()))?;
            let new_nblocks = blkno + 1;

            let old_nblocks = self.create_relation_if_required(rel, ctx).await?;
@@ -1683,8 +1639,8 @@ impl DatadirModification<'_> {
        rel: RelTag,
        blknum: BlockNumber,
        rec: NeonWalRecord,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
        self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
        Ok(())
    }
@@ -1696,7 +1652,7 @@ impl DatadirModification<'_> {
        segno: u32,
        blknum: BlockNumber,
        rec: NeonWalRecord,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        if !self.tline.tenant_shard_id.is_shard_zero() {
            return Ok(());
        }
@@ -1714,14 +1670,11 @@ impl DatadirModification<'_> {
        rel: RelTag,
        blknum: BlockNumber,
        img: Bytes,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
        let key = rel_block_to_key(rel, blknum);
        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver at {}",
-                key
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
        }
        self.put(rel_block_to_key(rel, blknum), Value::Image(img));
        Ok(())
@@ -1733,15 +1686,12 @@ impl DatadirModification<'_> {
        segno: u32,
        blknum: BlockNumber,
        img: Bytes,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        assert!(self.tline.tenant_shard_id.is_shard_zero());

        let key = slru_block_to_key(kind, segno, blknum);
        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver at {}",
-                key
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
        }
        self.put(key, Value::Image(img));
        Ok(())
@@ -1751,15 +1701,11 @@ impl DatadirModification<'_> {
        &mut self,
        rel: RelTag,
        blknum: BlockNumber,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
        let key = rel_block_to_key(rel, blknum);
        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver: {} @ {}",
-                key,
-                self.lsn
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
        }

        let batch = self
@@ -1776,15 +1722,11 @@ impl DatadirModification<'_> {
        kind: SlruKind,
        segno: u32,
        blknum: BlockNumber,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        assert!(self.tline.tenant_shard_id.is_shard_zero());
        let key = slru_block_to_key(kind, segno, blknum);
        if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver: {} @ {}",
-                key,
-                self.lsn
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
        }

        let batch = self
@@ -1832,8 +1774,10 @@ impl DatadirModification<'_> {
        dbnode: Oid,
        img: Bytes,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+    ) -> Result<(), WalIngestError> {
+        let v2_enabled = self
+            .maybe_enable_rel_size_v2()
+            .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;

        // Add it to the directory (if it doesn't exist already)
        let buf = self.get(DBDIR_KEY, ctx).await?;
@@ -1874,13 +1818,13 @@ impl DatadirModification<'_> {
        xid: u64,
        img: Bytes,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        // Add it to the directory entry
        let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
        let newdirbuf = if self.tline.pg_version >= 17 {
            let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
            if !dir.xids.insert(xid) {
-                anyhow::bail!("twophase file for xid {} already exists", xid);
+                Err(WalIngestErrorKind::FileAlreadyExists(xid))?;
            }
            self.pending_directory_entries.push((
                DirectoryKind::TwoPhase,
@@ -1891,7 +1835,7 @@ impl DatadirModification<'_> {
            let xid = xid as u32;
            let mut dir = TwoPhaseDirectory::des(&dirbuf)?;
            if !dir.xids.insert(xid) {
-                anyhow::bail!("twophase file for xid {} already exists", xid);
+                Err(WalIngestErrorKind::FileAlreadyExists(xid.into()))?;
            }
            self.pending_directory_entries.push((
                DirectoryKind::TwoPhase,
@@ -1909,22 +1853,22 @@ impl DatadirModification<'_> {
        &mut self,
        origin_id: RepOriginId,
        origin_lsn: Lsn,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let key = repl_origin_key(origin_id);
        self.put(key, Value::Image(origin_lsn.ser().unwrap().into()));
        Ok(())
    }

-    pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> {
+    pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> Result<(), WalIngestError> {
        self.set_replorigin(origin_id, Lsn::INVALID).await
    }

-    pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
+    pub fn put_control_file(&mut self, img: Bytes) -> Result<(), WalIngestError> {
        self.put(CONTROLFILE_KEY, Value::Image(img));
        Ok(())
    }

-    pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> {
+    pub fn put_checkpoint(&mut self, img: Bytes) -> Result<(), WalIngestError> {
        self.put(CHECKPOINT_KEY, Value::Image(img));
        Ok(())
    }
@@ -1934,7 +1878,7 @@ impl DatadirModification<'_> {
        spcnode: Oid,
        dbnode: Oid,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let total_blocks = self
            .tline
            .get_db_size(spcnode, dbnode, Version::Modified(self), ctx)
@@ -1973,20 +1917,21 @@ impl DatadirModification<'_> {
        rel: RelTag,
        nblocks: BlockNumber,
        ctx: &RequestContext,
-    ) -> Result<(), RelationError> {
+    ) -> Result<(), WalIngestError> {
        if rel.relnode == 0 {
-            return Err(RelationError::InvalidRelnode);
+            Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
+                "invalid relnode"
+            )))?;
        }
        // It's possible that this is the first rel for this db in this
        // tablespace.  Create the reldir entry for it if so.
-        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
-            .context("deserialize db")?;
+        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;

        let dbdir_exists =
            if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
                // Didn't exist. Update dbdir
                e.insert(false);
-                let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
+                let buf = DbDirectory::ser(&dbdir)?;
                self.pending_directory_entries.push((
                    DirectoryKind::Db,
                    MetricsUpdate::Set(dbdir.dbdirs.len() as u64),
@@ -2003,27 +1948,25 @@ impl DatadirModification<'_> {
            RelDirectory::default()
        } else {
            // reldir already exists, fetch it
-            RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
-                .context("deserialize db")?
+            RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
        };

-        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+        let v2_enabled = self
+            .maybe_enable_rel_size_v2()
+            .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;

        if v2_enabled {
            if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
-                return Err(RelationError::AlreadyExists);
+                Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
            }
            let sparse_rel_dir_key =
                rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
            // check if the rel_dir_key exists in v2
-            let val = self
-                .sparse_get(sparse_rel_dir_key, ctx)
-                .await
-                .map_err(|e| RelationError::Other(e.into()))?;
+            let val = self.sparse_get(sparse_rel_dir_key, ctx).await?;
            let val = RelDirExists::decode_option(val)
-                .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
+                .map_err(|_| WalIngestErrorKind::InvalidRelDirKey(sparse_rel_dir_key))?;
            if val == RelDirExists::Exists {
-                return Err(RelationError::AlreadyExists);
+                Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
            }
            self.put(
                sparse_rel_dir_key,
@@ -2039,9 +1982,7 @@ impl DatadirModification<'_> {
                // will be key not found errors if we don't create an empty one for rel_size_v2.
                self.put(
                    rel_dir_key,
-                    Value::Image(Bytes::from(
-                        RelDirectory::ser(&RelDirectory::default()).context("serialize")?,
-                    )),
+                    Value::Image(Bytes::from(RelDirectory::ser(&RelDirectory::default())?)),
                );
            }
            self.pending_directory_entries
@@ -2049,7 +1990,7 @@ impl DatadirModification<'_> {
        } else {
            // Add the new relation to the rel directory entry, and write it back
            if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
-                return Err(RelationError::AlreadyExists);
+                Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
            }
            if !dbdir_exists {
                self.pending_directory_entries
@@ -2059,9 +2000,7 @@ impl DatadirModification<'_> {
                .push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
            self.put(
                rel_dir_key,
-                Value::Image(Bytes::from(
-                    RelDirectory::ser(&rel_dir).context("serialize")?,
-                )),
+                Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
            );
        }

@@ -2086,8 +2025,8 @@ impl DatadirModification<'_> {
        rel: RelTag,
        nblocks: BlockNumber,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
        if self
            .tline
            .get_rel_exists(rel, Version::Modified(self), ctx)
@@ -2117,8 +2056,8 @@ impl DatadirModification<'_> {
        rel: RelTag,
        nblocks: BlockNumber,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);

        // Put size
        let size_key = rel_size_to_key(rel);
@@ -2142,8 +2081,10 @@ impl DatadirModification<'_> {
        &mut self,
        drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+    ) -> Result<(), WalIngestError> {
+        let v2_enabled = self
+            .maybe_enable_rel_size_v2()
+            .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
        for ((spc_node, db_node), rel_tags) in drop_relations {
            let dir_key = rel_dir_to_key(spc_node, db_node);
            let buf = self.get(dir_key, ctx).await?;
@@ -2163,7 +2104,7 @@ impl DatadirModification<'_> {
                    let key =
                        rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
                    let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
-                        .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
+                        .map_err(|_| WalIngestErrorKind::InvalidKey(key, self.lsn))?;
                    if val == RelDirExists::Exists {
                        self.pending_directory_entries
                            .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
@@ -2206,7 +2147,7 @@ impl DatadirModification<'_> {
        segno: u32,
        nblocks: BlockNumber,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        assert!(self.tline.tenant_shard_id.is_shard_zero());

        // Add it to the directory entry
@@ -2215,7 +2156,7 @@ impl DatadirModification<'_> {
        let mut dir = SlruSegmentDirectory::des(&buf)?;

        if !dir.segments.insert(segno) {
-            anyhow::bail!("slru segment {kind:?}/{segno} already exists");
+            Err(WalIngestErrorKind::SlruAlreadyExists(kind, segno))?;
        }
        self.pending_directory_entries.push((
            DirectoryKind::SlruSegment(kind),
@@ -2242,7 +2183,7 @@ impl DatadirModification<'_> {
        kind: SlruKind,
        segno: u32,
        nblocks: BlockNumber,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        assert!(self.tline.tenant_shard_id.is_shard_zero());

        // Put size
@@ -2258,7 +2199,7 @@ impl DatadirModification<'_> {
        kind: SlruKind,
        segno: u32,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        // Remove it from the directory entry
        let dir_key = slru_dir_to_key(kind);
        let buf = self.get(dir_key, ctx).await?;
@@ -2283,7 +2224,7 @@ impl DatadirModification<'_> {
    }

    /// Drop a relmapper file (pg_filenode.map)
-    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> {
+    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<(), WalIngestError> {
        // TODO
        Ok(())
    }
@@ -2293,7 +2234,7 @@ impl DatadirModification<'_> {
        &mut self,
        xid: u64,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        // Remove it from the directory entry
        let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
        let newdirbuf = if self.tline.pg_version >= 17 {
@@ -2308,7 +2249,8 @@ impl DatadirModification<'_> {
            ));
            Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
        } else {
-            let xid: u32 = u32::try_from(xid)?;
+            let xid: u32 = u32::try_from(xid)
+                .map_err(|e| WalIngestErrorKind::LogicalError(anyhow::Error::from(e)))?;
            let mut dir = TwoPhaseDirectory::des(&buf)?;

            if !dir.xids.remove(&xid) {
@@ -2333,7 +2275,7 @@ impl DatadirModification<'_> {
        path: &str,
        content: &[u8],
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
        let key = aux_file::encode_aux_file_key(path);
        // retrieve the key from the engine
        let old_val = match self.get(key, ctx).await {
@@ -2342,7 +2284,7 @@ impl DatadirModification<'_> {
            Err(e) => return Err(e.into()),
        };
        let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
-            aux_file::decode_file_value(old_val)?
+            aux_file::decode_file_value(old_val).map_err(WalIngestErrorKind::EncodeAuxFileError)?
        } else {
            Vec::new()
        };
@@ -2362,20 +2304,15 @@ impl DatadirModification<'_> {
        }
        let mut new_files = other_files;
        match (modifying_file, content.is_empty()) {
-            (Some(old_content), false) => {
-                self.tline
-                    .aux_file_size_estimator
-                    .on_update(old_content.len(), content.len());
+            (Some(_old_content), false) => {
+                
                new_files.push((path, content));
            }
-            (Some(old_content), true) => {
-                self.tline
-                    .aux_file_size_estimator
-                    .on_remove(old_content.len());
+            (Some(_old_content), true) => {
+               
                // not adding the file key to the final `new_files` vec.
            }
            (None, false) => {
-                self.tline.aux_file_size_estimator.on_add(content.len());
                new_files.push((path, content));
            }
            // Compute may request delete of old version of pgstat AUX file if new one exceeds size limit.
@@ -2387,7 +2324,8 @@ impl DatadirModification<'_> {
            }
            (None, true) => warn!("removing non-existing aux file: {}", path),
        }
-        let new_val = aux_file::encode_file_value(&new_files)?;
+        let new_val = aux_file::encode_file_value(&new_files)
+            .map_err(WalIngestErrorKind::EncodeAuxFileError)?;
        self.put(key, Value::Image(new_val.into()));

        Ok(())
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -22,6 +22,7 @@ use bytes::{BufMut, BytesMut};
 use pageserver_api::models::ImageCompressionAlgorithm;
 use tokio::io::AsyncWriteExt;
 use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
+use tokio_util::sync::CancellationToken;
 use tracing::warn;

 use crate::context::RequestContext;
@@ -36,6 +37,63 @@ pub struct CompressionInfo {
    pub compressed_size: Option<usize>,
 }

+/// A blob header, with header+data length and compression info.
+///
+/// TODO: use this more widely, and add an encode() method too.
+/// TODO: document the header format.
+#[derive(Clone, Copy, Default)]
+pub struct Header {
+    /// Size of the header itself in bytes; `decode` yields 1 or 4.
+    pub header_len: usize,
+    /// Length of the blob data in bytes, as encoded in the header.
+    pub data_len: usize,
+    /// Compression bits of the first byte; `BYTE_UNCOMPRESSED` for 1-byte headers.
+    pub compression_bits: u8,
+}
+
+impl Header {
+    /// Decodes a header from the start of `bytes`; bytes past the header are not inspected.
+    /// Returns an `InvalidData` error if `bytes` is empty or ends before the header does.
+    pub fn decode(bytes: &[u8]) -> Result<Self, std::io::Error> {
+        use std::io::{Error, ErrorKind};
+        let Some(&first_header_byte) = bytes.first() else {
+            return Err(Error::new(ErrorKind::InvalidData, "zero-length blob header"));
+        };
+
+        // If the first bit is 0, this is just a 1-byte length prefix of up to 127 bytes.
+        if first_header_byte < 0x80 {
+            return Ok(Self {
+                header_len: 1, // by definition
+                data_len: first_header_byte as usize,
+                compression_bits: BYTE_UNCOMPRESSED,
+            });
+        }
+
+        // Otherwise, this is a 4-byte header containing compression information and length.
+        // Read it with a checked accessor: slicing `bytes[0..4]` panics on a truncated header.
+        const HEADER_LEN: usize = 4;
+        let mut header_buf = *bytes.first_chunk::<HEADER_LEN>().ok_or_else(|| {
+            Error::new(ErrorKind::InvalidData, format!("blob header too short: {bytes:?}"))
+        })?;
+
+        // TODO: verify the compression bits and convert to an enum.
+        let compression_bits = header_buf[0] & LEN_COMPRESSION_BIT_MASK;
+        header_buf[0] &= !LEN_COMPRESSION_BIT_MASK;
+        let data_len = u32::from_be_bytes(header_buf) as usize;
+
+        Ok(Self {
+            header_len: HEADER_LEN,
+            data_len,
+            compression_bits,
+        })
+    }
+
+    /// Returns the total header+data length.
+    pub fn total_len(&self) -> usize {
+        self.header_len + self.data_len
+    }
+}
+
 impl BlockCursor<'_> {
    /// Read a blob into a new buffer.
    pub async fn read_blob(
@@ -169,7 +227,13 @@ pub struct BlobWriter<const BUFFERED: bool> {
 }

 impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
-    pub fn new(inner: VirtualFile, start_offset: u64) -> Self {
+    pub fn new(
+        inner: VirtualFile,
+        start_offset: u64,
+        _gate: &utils::sync::gate::Gate,
+        _cancel: CancellationToken,
+        _ctx: &RequestContext,
+    ) -> Self {
        Self {
            inner,
            offset: start_offset,
@@ -432,12 +496,14 @@ pub(crate) mod tests {
    ) -> Result<(Utf8TempDir, Utf8PathBuf, Vec<u64>), Error> {
        let temp_dir = camino_tempfile::tempdir()?;
        let pathbuf = temp_dir.path().join("file");
+        let gate = utils::sync::gate::Gate::default();
+        let cancel = CancellationToken::new();

        // Write part (in block to drop the file)
        let mut offsets = Vec::new();
        {
            let file = VirtualFile::create(pathbuf.as_path(), ctx).await?;
-            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
+            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0, &gate, cancel.clone(), ctx);
            for blob in blobs.iter() {
                let (_, res) = if compression {
                    let res = wtr
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -714,7 +714,7 @@ impl LayerMap {
        true
    }

-    pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
+    pub fn iter_historic_layers(&self) -> impl ExactSizeIterator<Item = Arc<PersistentLayerDesc>> {
        self.historic.iter()
    }

--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -504,7 +504,7 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
    }

    /// Iterate all the layers
-    pub fn iter(&self) -> impl '_ + Iterator<Item = Value> {
+    pub fn iter(&self) -> impl ExactSizeIterator<Item = Value> {
        // NOTE we can actually perform this without rebuilding,
        //      but it's not necessary for now.
        if !self.buffer.is_empty() {
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -44,7 +44,7 @@ use crate::controller_upcall_client::{
 };
 use crate::deletion_queue::DeletionQueueClient;
 use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
-use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
+use crate::metrics::TENANT_MANAGER as METRICS;
 use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind};
 use crate::tenant::config::{
    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
@@ -519,7 +519,7 @@ pub async fn init_tenant_mgr(
        tenant_configs.len(),
        conf.concurrent_tenant_warmup.initial_permits()
    );
-    TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);
+

    // Accumulate futures for writing tenant configs, so that we can execute in parallel
    let mut config_write_futs = Vec::new();
@@ -2177,9 +2177,7 @@ impl TenantManager {
                        // we would use if not doing any eviction.
                        progress.bytes_total
                    } else {
-                        // In the absence of heatmap info, assume that the secondary location simply
-                        // needs as much space as it is currently using.
-                        secondary.resident_size_metric.get()
+                        42
                    }
                }
            }
@@ -2530,7 +2528,7 @@ impl SlotGuard {
                Ok(())
            }
            None => {
-                METRICS.unexpected_errors.inc();
+              
                error!(
                    tenant_shard_id = %self.tenant_shard_id,
                    "Missing InProgress marker during tenant upsert, this is a bug."
@@ -2540,7 +2538,7 @@ impl SlotGuard {
                ))
            }
            Some(slot) => {
-                METRICS.unexpected_errors.inc();
+               
                error!(tenant_shard_id=%self.tenant_shard_id, "Unexpected contents of TenantSlot during upsert, this is a bug.  Contents: {:?}", slot);
                Err(TenantSlotUpsertError::InternalError(
                    "Unexpected contents of TenantSlot".into(),
@@ -2621,7 +2619,7 @@ impl Drop for SlotGuard {
        match m.entry(self.tenant_shard_id) {
            Entry::Occupied(mut entry) => {
                if !matches!(entry.get(), TenantSlot::InProgress(_)) {
-                    METRICS.unexpected_errors.inc();
+                    
                    error!(tenant_shard_id=%self.tenant_shard_id, "Unexpected contents of TenantSlot during drop, this is a bug.  Contents: {:?}", entry.get());
                }

@@ -2636,7 +2634,7 @@ impl Drop for SlotGuard {
                }
            }
            Entry::Vacant(_) => {
-                METRICS.unexpected_errors.inc();
+                
                error!(
                    tenant_shard_id = %self.tenant_shard_id,
                    "Missing InProgress marker during SlotGuard drop, this is a bug."
@@ -2696,7 +2694,7 @@ fn tenant_map_acquire_slot_impl(
    mode: TenantSlotAcquireMode,
 ) -> Result<SlotGuard, TenantSlotError> {
    use TenantSlotAcquireMode::*;
-    METRICS.tenant_slot_writes.inc();
+  

    let mut locked = tenants.write().unwrap();
    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug());
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -223,9 +223,8 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
 use crate::metrics::{
-    MeasureRemoteOp, REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
-    RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
-    RemoteTimelineClientMetricsCallTrackSize,
+    MeasureRemoteOp, 
+    RemoteOpFileKind, RemoteOpKind, 
 };
 use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind, shutdown_token};
 use crate::tenant::metadata::TimelineMetadata;
@@ -357,8 +356,6 @@ pub(crate) struct RemoteTimelineClient {

    upload_queue: Mutex<UploadQueue>,

-    pub(crate) metrics: Arc<RemoteTimelineClientMetrics>,
-
    storage_impl: GenericRemoteStorage,

    deletion_queue_client: DeletionQueueClient,
@@ -405,10 +402,6 @@ impl RemoteTimelineClient {
            storage_impl: remote_storage,
            deletion_queue_client,
            upload_queue: Mutex::new(UploadQueue::Uninitialized),
-            metrics: Arc::new(RemoteTimelineClientMetrics::new(
-                &tenant_shard_id,
-                &timeline_id,
-            )),
            config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(location_conf)),
            cancel: CancellationToken::new(),
        }
@@ -597,21 +590,13 @@ impl RemoteTimelineClient {
            .map_err(|_| UploadQueueNotReadyError)
    }

-    fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
-        let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
-            current_remote_index_part
-                .layer_metadata
-                .values()
-                .map(|ilmd| ilmd.file_size)
-                .sum()
-        } else {
-            0
-        };
-        self.metrics.remote_physical_size_gauge.set(size);
+    fn update_remote_physical_size_gauge(&self, _current_remote_index_part: Option<&IndexPart>) {
+
+        // No-op: the gauge this method used to set was removed along with `self.metrics`.
    }

    pub fn get_remote_physical_size(&self) -> u64 {
-        self.metrics.remote_physical_size_gauge.get()
+  0 // NOTE(review): always 0 now that the size gauge is gone; confirm callers tolerate this.
    }

    //
@@ -626,13 +611,6 @@ impl RemoteTimelineClient {
        &self,
        cancel: &CancellationToken,
    ) -> Result<MaybeDeletedIndexPart, DownloadError> {
-        let _unfinished_gauge_guard = self.metrics.call_begin(
-            &RemoteOpFileKind::Index,
-            &RemoteOpKind::Download,
-            crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
-                reason: "no need for a downloads gauge",
-            },
-        );

        let (index_part, index_generation, index_last_modified) = download::download_index_part(
            &self.storage_impl,
@@ -645,7 +623,7 @@ impl RemoteTimelineClient {
            Option::<TaskKind>::None,
            RemoteOpFileKind::Index,
            RemoteOpKind::Download,
-            Arc::clone(&self.metrics),
+        
        )
        .await?;

@@ -720,13 +698,7 @@ impl RemoteTimelineClient {
        ctx: &RequestContext,
    ) -> Result<u64, DownloadError> {
        let downloaded_size = {
-            let _unfinished_gauge_guard = self.metrics.call_begin(
-                &RemoteOpFileKind::Layer,
-                &RemoteOpKind::Download,
-                crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
-                    reason: "no need for a downloads gauge",
-                },
-            );
+        
            download::download_layer_file(
                self.conf,
                &self.storage_impl,
@@ -743,13 +715,11 @@ impl RemoteTimelineClient {
                Some(ctx.task_kind()),
                RemoteOpFileKind::Layer,
                RemoteOpKind::Download,
-                Arc::clone(&self.metrics),
+          
            )
            .await?
        };

-        REMOTE_ONDEMAND_DOWNLOADED_LAYERS.inc();
-        REMOTE_ONDEMAND_DOWNLOADED_BYTES.inc_by(downloaded_size);

        Ok(downloaded_size)
    }
@@ -1027,7 +997,6 @@ impl RemoteTimelineClient {
        let op = UploadOp::UploadMetadata {
            uploaded: Box::new(index_part.clone()),
        };
-        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
        upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;

@@ -1265,7 +1234,6 @@ impl RemoteTimelineClient {
        );

        let op = UploadOp::UploadLayer(layer, metadata, None);
-        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
    }

@@ -1442,7 +1410,6 @@ impl RemoteTimelineClient {
        let op = UploadOp::Delete(Delete {
            layers: with_metadata,
        });
-        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
    }

@@ -2180,7 +2147,7 @@ impl RemoteTimelineClient {
                        Some(TaskKind::RemoteUploadTask),
                        RemoteOpFileKind::Layer,
                        RemoteOpKind::Upload,
-                        Arc::clone(&self.metrics),
+                      
                    )
                    .await
                }
@@ -2197,7 +2164,7 @@ impl RemoteTimelineClient {
                        Some(TaskKind::RemoteUploadTask),
                        RemoteOpFileKind::Index,
                        RemoteOpKind::Upload,
-                        Arc::clone(&self.metrics),
+                    
                    )
                    .await;
                    if res.is_ok() {
@@ -2343,10 +2310,7 @@ impl RemoteTimelineClient {
                    upload_queue.clean.1 = Some(task.task_id);

                    let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
-                    self.metrics
-                        .projected_remote_consistent_lsn_gauge
-                        .set(lsn.0);
-
+                   
                    if self.generation.is_none() {
                        // Legacy mode: skip validating generation
                        upload_queue.visible_remote_consistent_lsn.store(lsn);
@@ -2387,64 +2351,6 @@ impl RemoteTimelineClient {
                .await;
        }

-        self.metric_end(&task.op);
-        for coalesced_op in &task.coalesced_ops {
-            self.metric_end(coalesced_op);
-        }
-    }
-
-    fn metric_impl(
-        &self,
-        op: &UploadOp,
-    ) -> Option<(
-        RemoteOpFileKind,
-        RemoteOpKind,
-        RemoteTimelineClientMetricsCallTrackSize,
-    )> {
-        use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize;
-        let res = match op {
-            UploadOp::UploadLayer(_, m, _) => (
-                RemoteOpFileKind::Layer,
-                RemoteOpKind::Upload,
-                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
-            ),
-            UploadOp::UploadMetadata { .. } => (
-                RemoteOpFileKind::Index,
-                RemoteOpKind::Upload,
-                DontTrackSize {
-                    reason: "metadata uploads are tiny",
-                },
-            ),
-            UploadOp::Delete(_delete) => (
-                RemoteOpFileKind::Layer,
-                RemoteOpKind::Delete,
-                DontTrackSize {
-                    reason: "should we track deletes? positive or negative sign?",
-                },
-            ),
-            UploadOp::Barrier(..) | UploadOp::Shutdown => {
-                // we do not account these
-                return None;
-            }
-        };
-        Some(res)
-    }
-
-    fn metric_begin(&self, op: &UploadOp) {
-        let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
-            Some(x) => x,
-            None => return,
-        };
-        let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes);
-        guard.will_decrement_manually(); // in metric_end(), see right below
-    }
-
-    fn metric_end(&self, op: &UploadOp) {
-        let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
-            Some(x) => x,
-            None => return,
-        };
-        self.metrics.call_end(&file_kind, &op_kind, track_bytes);
    }

    /// Close the upload queue for new operations and cancel queued operations.
@@ -2524,7 +2430,6 @@ impl RemoteTimelineClient {

                // Tear down queued ops
                for op in qi.queued_operations.into_iter() {
-                    self.metric_end(&op);
                    // Dropping UploadOp::Barrier() here will make wait_completion() return with an Err()
                    // which is exactly what we want to happen.
                    drop(op);
@@ -2834,10 +2739,6 @@ mod tests {
                storage_impl: self.harness.remote_storage.clone(),
                deletion_queue_client: self.harness.deletion_queue.new_client(),
                upload_queue: Mutex::new(UploadQueue::Uninitialized),
-                metrics: Arc::new(RemoteTimelineClientMetrics::new(
-                    &self.harness.tenant_shard_id,
-                    &TIMELINE_ID,
-                )),
                config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(&location_conf)),
                cancel: CancellationToken::new(),
            })
@@ -3064,99 +2965,7 @@ mod tests {
        );
    }

-    #[tokio::test]
-    async fn bytes_unfinished_gauge_for_layer_file_uploads() {
-        // Setup
-
-        let TestSetup {
-            harness,
-            tenant: _tenant,
-            timeline,
-            ..
-        } = TestSetup::new("metrics").await.unwrap();
-        let client = &timeline.remote_client;
-
-        let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
-        let local_path = local_layer_path(
-            harness.conf,
-            &timeline.tenant_shard_id,
-            &timeline.timeline_id,
-            &layer_file_name_1,
-            &harness.generation,
-        );
-        let content_1 = dummy_contents("foo");
-        std::fs::write(&local_path, &content_1).unwrap();
-
-        let layer_file_1 = Layer::for_resident(
-            harness.conf,
-            &timeline,
-            local_path,
-            layer_file_name_1.clone(),
-            LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard),
-        );
-
-        #[derive(Debug, PartialEq, Clone, Copy)]
-        struct BytesStartedFinished {
-            started: Option<usize>,
-            finished: Option<usize>,
-        }
-        impl std::ops::Add for BytesStartedFinished {
-            type Output = Self;
-            fn add(self, rhs: Self) -> Self::Output {
-                Self {
-                    started: self.started.map(|v| v + rhs.started.unwrap_or(0)),
-                    finished: self.finished.map(|v| v + rhs.finished.unwrap_or(0)),
-                }
-            }
-        }
-        let get_bytes_started_stopped = || {
-            let started = client
-                .metrics
-                .get_bytes_started_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
-                .map(|v| v.try_into().unwrap());
-            let stopped = client
-                .metrics
-                .get_bytes_finished_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
-                .map(|v| v.try_into().unwrap());
-            BytesStartedFinished {
-                started,
-                finished: stopped,
-            }
-        };
-
-        // Test
-        tracing::info!("now doing actual test");
-
-        let actual_a = get_bytes_started_stopped();
-
-        client
-            .schedule_layer_file_upload(layer_file_1.clone())
-            .unwrap();
-
-        let actual_b = get_bytes_started_stopped();
-
-        client.wait_completion().await.unwrap();
-
-        let actual_c = get_bytes_started_stopped();
-
-        // Validate
-
-        let expected_b = actual_a
-            + BytesStartedFinished {
-                started: Some(content_1.len()),
-                // assert that the _finished metric is created eagerly so that subtractions work on first sample
-                finished: Some(0),
-            };
-        assert_eq!(actual_b, expected_b);
-
-        let expected_c = actual_a
-            + BytesStartedFinished {
-                started: Some(content_1.len()),
-                finished: Some(content_1.len()),
-            };
-        assert_eq!(actual_c, expected_c);
-    }
-
+    
    async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart {
        // An empty IndexPart, just sufficient to ensure deserialization will succeed
        let example_index_part = IndexPart::example();
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -6,7 +6,6 @@ mod scheduler;
 use std::sync::Arc;
 use std::time::SystemTime;

-use metrics::UIntGauge;
 use pageserver_api::models;
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use remote_storage::GenericRemoteStorage;
@@ -26,7 +25,6 @@ use super::span::debug_assert_current_span_has_tenant_id;
 use super::storage_layer::LayerName;
 use crate::context::RequestContext;
 use crate::disk_usage_eviction_task::DiskUsageEvictionInfo;
-use crate::metrics::{SECONDARY_HEATMAP_TOTAL_SIZE, SECONDARY_RESIDENT_PHYSICAL_SIZE};
 use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind};

 enum DownloadCommand {
@@ -109,12 +107,7 @@ pub(crate) struct SecondaryTenant {

    // Public state indicating overall progress of downloads relative to the last heatmap seen
    pub(crate) progress: std::sync::Mutex<models::SecondaryProgress>,
-
-    // Sum of layer sizes on local disk
-    pub(super) resident_size_metric: UIntGauge,
-
-    // Sum of layer sizes in the most recently downloaded heatmap
-    pub(super) heatmap_total_size_metric: UIntGauge,
+   
 }

 impl SecondaryTenant {
@@ -124,16 +117,8 @@ impl SecondaryTenant {
        tenant_conf: pageserver_api::models::TenantConfig,
        config: &SecondaryLocationConfig,
    ) -> Arc<Self> {
-        let tenant_id = tenant_shard_id.tenant_id.to_string();
-        let shard_id = format!("{}", tenant_shard_id.shard_slug());
-        let resident_size_metric = SECONDARY_RESIDENT_PHYSICAL_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &shard_id])
-            .unwrap();
-
-        let heatmap_total_size_metric = SECONDARY_HEATMAP_TOTAL_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &shard_id])
-            .unwrap();
-
+    
+    
        Arc::new(Self {
            tenant_shard_id,
            // todo: shall we make this a descendent of the
@@ -150,14 +135,10 @@ impl SecondaryTenant {

            progress: std::sync::Mutex::default(),

-            resident_size_metric,
-            heatmap_total_size_metric,
        })
    }

-    pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
-        self.tenant_shard_id
-    }
+    

    pub(crate) async fn shutdown(&self) {
        self.cancel.cancel();
@@ -169,15 +150,10 @@ impl SecondaryTenant {

        // Metrics are subtracted from and/or removed eagerly.
        // Deletions are done in the background via [`BackgroundPurges::spawn`].
-        let tenant_id = self.tenant_shard_id.tenant_id.to_string();
-        let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
-        let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
-        let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
-
        self.detail
            .lock()
            .unwrap()
-            .drain_timelines(&self.tenant_shard_id, &self.resident_size_metric);
+            .drain_timelines(&self.tenant_shard_id);
    }

    pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) {
@@ -255,7 +231,7 @@ impl SecondaryTenant {
            // of the cache.
            let mut detail = this.detail.lock().unwrap();
            if let Some(removed) =
-                detail.evict_layer(name, &timeline_id, now, &this.resident_size_metric)
+                detail.evict_layer(name, &timeline_id, now)
            {
                // We might race with removal of the same layer during downloads, so finding the layer we
                // were trying to remove is optional.  Only issue the disk I/O to remove it if we found it.
@@ -269,10 +245,9 @@ impl SecondaryTenant {
    /// Exhaustive check that incrementally updated metrics match the actual state.
    #[cfg(feature = "testing")]
    fn validate_metrics(&self) {
-        let detail = self.detail.lock().unwrap();
-        let resident_size = detail.total_resident_size();
+        

-        assert_eq!(resident_size, self.resident_size_metric.get());
+        
    }

    #[cfg(not(feature = "testing"))]
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -4,11 +4,9 @@ use std::str::FromStr;
 use std::sync::Arc;
 use std::time::{Duration, Instant, SystemTime};

-use crate::metrics::{STORAGE_IO_SIZE, StorageIoSizeOperation};
 use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
 use futures::Future;
-use metrics::UIntGauge;
 use pageserver_api::models::SecondaryProgress;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::{DownloadError, DownloadKind, DownloadOpts, Etag, GenericRemoteStorage};
@@ -33,7 +31,6 @@ use crate::context::RequestContext;
 use crate::disk_usage_eviction_task::{
    DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, finite_f32,
 };
-use crate::metrics::SECONDARY_MODE;
 use crate::tenant::config::SecondaryLocationConfig;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::ephemeral_file::is_ephemeral_file;
@@ -120,9 +117,6 @@ impl OnDiskState {
            .fatal_err("Deleting secondary layer")
    }

-    pub(crate) fn file_size(&self) -> u64 {
-        self.metadata.file_size
-    }
 }

 pub(super) struct SecondaryDetailTimeline {
@@ -175,13 +169,9 @@ impl SecondaryDetailTimeline {
    pub(super) fn remove_layer(
        &mut self,
        name: &LayerName,
-        resident_metric: &UIntGauge,
    ) -> Option<OnDiskState> {
-        let removed = self.on_disk_layers.remove(name);
-        if let Some(removed) = &removed {
-            resident_metric.sub(removed.file_size());
-        }
-        removed
+        self.on_disk_layers.remove(name)
+        
    }

    /// `local_path`
@@ -191,7 +181,6 @@ impl SecondaryDetailTimeline {
        tenant_shard_id: &TenantShardId,
        timeline_id: &TimelineId,
        touched: &HeatMapLayer,
-        resident_metric: &UIntGauge,
        local_path: F,
    ) where
        F: FnOnce() -> Utf8PathBuf,
@@ -211,7 +200,6 @@ impl SecondaryDetailTimeline {
                    touched.access_time,
                    local_path(),
                ));
-                resident_metric.add(touched.metadata.file_size);
            }
        }
    }
@@ -267,28 +255,16 @@ impl SecondaryDetail {
        }
    }

-    #[cfg(feature = "testing")]
-    pub(crate) fn total_resident_size(&self) -> u64 {
-        self.timelines
-            .values()
-            .map(|tl| {
-                tl.on_disk_layers
-                    .values()
-                    .map(|v| v.metadata.file_size)
-                    .sum::<u64>()
-            })
-            .sum::<u64>()
-    }

    pub(super) fn evict_layer(
        &mut self,
        name: LayerName,
        timeline_id: &TimelineId,
        now: SystemTime,
-        resident_metric: &UIntGauge,
+
    ) -> Option<OnDiskState> {
        let timeline = self.timelines.get_mut(timeline_id)?;
-        let removed = timeline.remove_layer(&name, resident_metric);
+        let removed = timeline.remove_layer(&name);
        if removed.is_some() {
            timeline.evicted_at.insert(name, now);
        }
@@ -297,52 +273,21 @@ impl SecondaryDetail {

    pub(super) fn remove_timeline(
        &mut self,
-        tenant_shard_id: &TenantShardId,
+        _tenant_shard_id: &TenantShardId,
        timeline_id: &TimelineId,
-        resident_metric: &UIntGauge,
    ) {
-        let removed = self.timelines.remove(timeline_id);
-        if let Some(removed) = removed {
-            Self::clear_timeline_metrics(tenant_shard_id, timeline_id, removed, resident_metric);
-        }
+        self.timelines.remove(timeline_id);
+        
    }

    pub(super) fn drain_timelines(
        &mut self,
-        tenant_shard_id: &TenantShardId,
-        resident_metric: &UIntGauge,
+        _tenant_shard_id: &TenantShardId,
    ) {
-        for (timeline_id, removed) in self.timelines.drain() {
-            Self::clear_timeline_metrics(tenant_shard_id, &timeline_id, removed, resident_metric);
-        }
+        // The metric bookkeeping is gone, but the map itself must still be emptied as before.
+        self.timelines.clear();
    }

-    fn clear_timeline_metrics(
-        tenant_shard_id: &TenantShardId,
-        timeline_id: &TimelineId,
-        detail: SecondaryDetailTimeline,
-        resident_metric: &UIntGauge,
-    ) {
-        resident_metric.sub(
-            detail
-                .on_disk_layers
-                .values()
-                .map(|l| l.metadata.file_size)
-                .sum(),
-        );
-
-        let shard_id = format!("{}", tenant_shard_id.shard_slug());
-        let tenant_id = tenant_shard_id.tenant_id.to_string();
-        let timeline_id = timeline_id.to_string();
-        for op in StorageIoSizeOperation::VARIANTS {
-            let _ = STORAGE_IO_SIZE.remove_label_values(&[
-                op,
-                tenant_id.as_str(),
-                shard_id.as_str(),
-                timeline_id.as_str(),
-            ]);
-        }
-    }

    /// Additionally returns the total number of layers, used for more stable relative access time
    /// based eviction.
@@ -797,7 +742,6 @@ impl<'a> TenantDownloader<'a> {
                        tenant_shard_id,
                        last_heatmap,
                        timeline,
-                        &self.secondary_state.resident_size_metric,
                        ctx,
                    )
                    .await;
@@ -920,11 +864,7 @@ impl<'a> TenantDownloader<'a> {
            bytes_downloaded: 0,
        };

-        // Also expose heatmap bytes_total as a metric
-        self.secondary_state
-            .heatmap_total_size_metric
-            .set(heatmap_stats.bytes);
-
+       
        // Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
        let mut delete_layers = Vec::new();
        let mut delete_timelines = Vec::new();
@@ -991,7 +931,6 @@ impl<'a> TenantDownloader<'a> {
                detail.remove_timeline(
                    self.secondary_state.get_tenant_shard_id(),
                    delete_timeline,
-                    &self.secondary_state.resident_size_metric,
                );
            }
        }
@@ -1010,7 +949,7 @@ impl<'a> TenantDownloader<'a> {
            let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else {
                continue;
            };
-            timeline_state.remove_layer(&layer_name, &self.secondary_state.resident_size_metric);
+            timeline_state.remove_layer(&layer_name);
        }

        for timeline_id in delete_timelines {
@@ -1077,7 +1016,7 @@ impl<'a> TenantDownloader<'a> {
        .await
        .ok_or_else(|| UpdateError::Cancelled)
        .and_then(|x| x)
-        .inspect(|_| SECONDARY_MODE.download_heatmap.inc())
+        .inspect(|_|{} )
    }

    /// Download heatmap layers that are not present on local disk, or update their
@@ -1252,7 +1191,6 @@ impl<'a> TenantDownloader<'a> {
                    tenant_shard_id,
                    &timeline_id,
                    &t,
-                    &self.secondary_state.resident_size_metric,
                    || {
                        local_layer_path(
                            self.conf,
@@ -1364,7 +1302,6 @@ impl<'a> TenantDownloader<'a> {
            progress.layers_downloaded += 1;
        }

-        SECONDARY_MODE.download_layer.inc();

        Ok(Some(layer))
    }
@@ -1376,7 +1313,6 @@ async fn init_timeline_state(
    tenant_shard_id: &TenantShardId,
    last_heatmap: Option<&HeatMapTimeline>,
    heatmap: &HeatMapTimeline,
-    resident_metric: &UIntGauge,
    ctx: &RequestContext,
 ) -> SecondaryDetailTimeline {
    let ctx = ctx.with_scope_secondary_timeline(tenant_shard_id, &heatmap.timeline_id);
@@ -1480,7 +1416,6 @@ async fn init_timeline_state(
                                tenant_shard_id,
                                &heatmap.timeline_id,
                                remote_meta,
-                                resident_metric,
                                || file_path,
                            );
                        }
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -20,7 +20,6 @@ use super::scheduler::{
 };
 use super::{CommandRequest, SecondaryTenantError, UploadCommand};
 use crate::TEMP_FILE_SUFFIX;
-use crate::metrics::SECONDARY_MODE;
 use crate::tenant::Tenant;
 use crate::tenant::config::AttachmentMode;
 use crate::tenant::mgr::{GetTenantError, TenantManager};
@@ -221,14 +220,10 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
            // Guard for the barrier in [`WriteInProgress`]
            let _completion = completion;

-            let started_at = Instant::now();
+            
            let uploaded = match upload_tenant_heatmap(remote_storage, &tenant, last_upload.clone()).await {
                Ok(UploadHeatmapOutcome::Uploaded(uploaded)) => {
-                    let duration = Instant::now().duration_since(started_at);
-                    SECONDARY_MODE
-                        .upload_heatmap_duration
-                        .observe(duration.as_secs_f64());
-                    SECONDARY_MODE.upload_heatmap.inc();
+                    
                    Some(uploaded)
                }
                Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_upload,
@@ -237,11 +232,8 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
                        "Failed to upload heatmap for tenant {}: {e:#}",
                        tenant.get_tenant_shard_id(),
                    );
-                    let duration = Instant::now().duration_since(started_at);
-                    SECONDARY_MODE
-                        .upload_heatmap_duration
-                        .observe(duration.as_secs_f64());
-                    SECONDARY_MODE.upload_heatmap_errors.inc();
+                   
+
                    last_upload
                }
                Err(UploadHeatmapError::Cancelled) => {
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -715,13 +715,34 @@ pub(crate) enum LayerId {
 }

 /// Uniquely identify a layer visit by the layer
-/// and LSN floor (or start LSN) of the reads.
-/// The layer itself is not enough since we may
-/// have different LSN lower bounds for delta layer reads.
+/// and LSN range of the reads. Note that the end of the range is exclusive.
+///
+/// The layer itself is not enough since we may have different LSN lower
+/// bounds for delta layer reads. Scenarios where this can happen are:
+///
+/// 1. Layer overlaps: imagine an image layer inside an in-memory layer
+///    and a query that only partially hits the image layer. Part of the query
+///    needs to read the whole in-memory layer and the other part needs to read
+///    only up to the image layer. Hence, they'll have different LSN floor values
+///    for the read.
+///
+/// 2. Scattered reads: the read path supports starting at different LSNs. Imagine
+///    the start LSN for one range is inside a layer and the start LSN for another range
+///    is above the layer (includes all of it). Both ranges need to read the layer all the
+///    way to the end but starting at different points. Hence, they'll have different LSN
+///    ceil values.
+///
+/// The implication is that we might visit the same layer multiple times
+/// in order to read different LSN ranges from it. In practice, this isn't very concerning
+/// because:
+/// 1. Layer overlaps are rare and generally not intended
+/// 2. Scattered reads will stabilise after the first few layers provided their starting LSNs
+///    are grouped tightly enough (likely the case).
 #[derive(Debug, PartialEq, Eq, Clone, Hash)]
 struct LayerToVisitId {
    layer_id: LayerId,
    lsn_floor: Lsn,
+    lsn_ceil: Lsn,
 }

 #[derive(Debug, PartialEq, Eq, Hash)]
@@ -805,6 +826,7 @@ impl LayerFringe {
        let layer_to_visit_id = LayerToVisitId {
            layer_id: layer.id(),
            lsn_floor: lsn_range.start,
+            lsn_ceil: lsn_range.end,
        };

        let entry = self.visit_reads.entry(layer_to_visit_id.clone());
--- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs
@@ -5,6 +5,7 @@ use std::sync::Arc;
 use bytes::Bytes;
 use pageserver_api::key::{KEY_SIZE, Key};
 use pageserver_api::value::Value;
+use tokio_util::sync::CancellationToken;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
 use utils::shard::TenantShardId;
@@ -179,7 +180,7 @@ impl BatchLayerWriter {

 /// An image writer that takes images and produces multiple image layers.
 #[must_use]
-pub struct SplitImageLayerWriter {
+pub struct SplitImageLayerWriter<'a> {
    inner: ImageLayerWriter,
    target_layer_size: u64,
    lsn: Lsn,
@@ -188,9 +189,12 @@ pub struct SplitImageLayerWriter {
    tenant_shard_id: TenantShardId,
    batches: BatchLayerWriter,
    start_key: Key,
+    gate: &'a utils::sync::gate::Gate,
+    cancel: CancellationToken,
 }

-impl SplitImageLayerWriter {
+impl<'a> SplitImageLayerWriter<'a> {
+    #[allow(clippy::too_many_arguments)]
    pub async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
@@ -198,6 +202,8 @@ impl SplitImageLayerWriter {
        start_key: Key,
        lsn: Lsn,
        target_layer_size: u64,
+        gate: &'a utils::sync::gate::Gate,
+        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
        Ok(Self {
@@ -208,6 +214,8 @@ impl SplitImageLayerWriter {
                tenant_shard_id,
                &(start_key..Key::MAX),
                lsn,
+                gate,
+                cancel.clone(),
                ctx,
            )
            .await?,
@@ -217,6 +225,8 @@ impl SplitImageLayerWriter {
            batches: BatchLayerWriter::new(conf).await?,
            lsn,
            start_key,
+            gate,
+            cancel,
        })
    }

@@ -239,6 +249,8 @@ impl SplitImageLayerWriter {
                self.tenant_shard_id,
                &(key..Key::MAX),
                self.lsn,
+                self.gate,
+                self.cancel.clone(),
                ctx,
            )
            .await?;
@@ -291,7 +303,7 @@ impl SplitImageLayerWriter {
 /// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
 /// will split them into multiple files based on size.
 #[must_use]
-pub struct SplitDeltaLayerWriter {
+pub struct SplitDeltaLayerWriter<'a> {
    inner: Option<(Key, DeltaLayerWriter)>,
    target_layer_size: u64,
    conf: &'static PageServerConf,
@@ -300,15 +312,19 @@ pub struct SplitDeltaLayerWriter {
    lsn_range: Range<Lsn>,
    last_key_written: Key,
    batches: BatchLayerWriter,
+    gate: &'a utils::sync::gate::Gate,
+    cancel: CancellationToken,
 }

-impl SplitDeltaLayerWriter {
+impl<'a> SplitDeltaLayerWriter<'a> {
    pub async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        lsn_range: Range<Lsn>,
        target_layer_size: u64,
+        gate: &'a utils::sync::gate::Gate,
+        cancel: CancellationToken,
    ) -> anyhow::Result<Self> {
        Ok(Self {
            target_layer_size,
@@ -319,6 +335,8 @@ impl SplitDeltaLayerWriter {
            lsn_range,
            last_key_written: Key::MIN,
            batches: BatchLayerWriter::new(conf).await?,
+            gate,
+            cancel,
        })
    }

@@ -344,6 +362,8 @@ impl SplitDeltaLayerWriter {
                    self.tenant_shard_id,
                    key,
                    self.lsn_range.clone(),
+                    self.gate,
+                    self.cancel.clone(),
                    ctx,
                )
                .await?,
@@ -362,6 +382,8 @@ impl SplitDeltaLayerWriter {
                    self.tenant_shard_id,
                    key,
                    self.lsn_range.clone(),
+                    self.gate,
+                    self.cancel.clone(),
                    ctx,
                )
                .await?;
@@ -469,6 +491,8 @@ mod tests {
            get_key(0),
            Lsn(0x18),
            4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
            &ctx,
        )
        .await
@@ -480,6 +504,8 @@ mod tests {
            tenant.tenant_shard_id,
            Lsn(0x18)..Lsn(0x20),
            4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
        )
        .await
        .unwrap();
@@ -546,6 +572,8 @@ mod tests {
            get_key(0),
            Lsn(0x18),
            4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
            &ctx,
        )
        .await
@@ -556,6 +584,8 @@ mod tests {
            tenant.tenant_shard_id,
            Lsn(0x18)..Lsn(0x20),
            4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
        )
        .await
        .unwrap();
@@ -643,6 +673,8 @@ mod tests {
            get_key(0),
            Lsn(0x18),
            4 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
            &ctx,
        )
        .await
@@ -654,6 +686,8 @@ mod tests {
            tenant.tenant_shard_id,
            Lsn(0x18)..Lsn(0x20),
            4 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
        )
        .await
        .unwrap();
@@ -730,6 +764,8 @@ mod tests {
            tenant.tenant_shard_id,
            Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
            4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
        )
        .await
        .unwrap();
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -50,6 +50,7 @@ use rand::distributions::Alphanumeric;
 use serde::{Deserialize, Serialize};
 use tokio::sync::OnceCell;
 use tokio_epoll_uring::IoBuf;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::bin_ser::BeSer;
 use utils::id::{TenantId, TimelineId};
@@ -400,12 +401,15 @@ impl DeltaLayerWriterInner {
    ///
    /// Start building a new delta layer.
    ///
+    #[allow(clippy::too_many_arguments)]
    async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        key_start: Key,
        lsn_range: Range<Lsn>,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
        // Create the file initially with a temporary filename. We don't know
@@ -420,7 +424,7 @@ impl DeltaLayerWriterInner {
        let mut file = VirtualFile::create(&path, ctx).await?;
        // make room for the header block
        file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
-        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
+        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, cancel, ctx);

        // Initialize the b-tree index builder
        let block_buf = BlockBuf::new();
@@ -628,12 +632,15 @@ impl DeltaLayerWriter {
    ///
    /// Start building a new delta layer.
    ///
+    #[allow(clippy::too_many_arguments)]
    pub async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        key_start: Key,
        lsn_range: Range<Lsn>,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
        Ok(Self {
@@ -644,6 +651,8 @@ impl DeltaLayerWriter {
                    tenant_shard_id,
                    key_start,
                    lsn_range,
+                    gate,
+                    cancel,
                    ctx,
                )
                .await?,
@@ -1885,6 +1894,8 @@ pub(crate) mod test {
            harness.tenant_shard_id,
            entries_meta.key_range.start,
            entries_meta.lsn_range.clone(),
+            &timeline.gate,
+            timeline.cancel.clone(),
            &ctx,
        )
        .await?;
@@ -2079,6 +2090,8 @@ pub(crate) mod test {
                tenant.tenant_shard_id,
                Key::MIN,
                Lsn(0x11)..truncate_at,
+                &branch.gate,
+                branch.cancel.clone(),
                ctx,
            )
            .await
@@ -2213,6 +2226,8 @@ pub(crate) mod test {
            tenant.tenant_shard_id,
            *key_start,
            (*lsn_min)..lsn_end,
+            &tline.gate,
+            tline.cancel.clone(),
            ctx,
        )
        .await?;
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -48,6 +48,7 @@ use rand::distributions::Alphanumeric;
 use serde::{Deserialize, Serialize};
 use tokio::sync::OnceCell;
 use tokio_stream::StreamExt;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::bin_ser::BeSer;
 use utils::id::{TenantId, TimelineId};
@@ -748,12 +749,15 @@ impl ImageLayerWriterInner {
    ///
    /// Start building a new image layer.
    ///
+    #[allow(clippy::too_many_arguments)]
    async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        key_range: &Range<Key>,
        lsn: Lsn,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
        // Create the file initially with a temporary filename.
@@ -780,7 +784,7 @@ impl ImageLayerWriterInner {
        };
        // make room for the header block
        file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
-        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
+        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, cancel, ctx);

        // Initialize the b-tree index builder
        let block_buf = BlockBuf::new();
@@ -878,14 +882,6 @@ impl ImageLayerWriterInner {
    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
        let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32;

-        // Calculate compression ratio
-        let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
-        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
-        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED
-            .inc_by(self.uncompressed_bytes_eligible);
-        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen);
-        crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);
-
        let mut file = self.blob_writer.into_inner();

        // Write out the index
@@ -988,18 +984,30 @@ impl ImageLayerWriter {
    ///
    /// Start building a new image layer.
    ///
+    #[allow(clippy::too_many_arguments)]
    pub async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        key_range: &Range<Key>,
        lsn: Lsn,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<ImageLayerWriter> {
        Ok(Self {
            inner: Some(
-                ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn, ctx)
-                    .await?,
+                ImageLayerWriterInner::new(
+                    conf,
+                    timeline_id,
+                    tenant_shard_id,
+                    key_range,
+                    lsn,
+                    gate,
+                    cancel,
+                    ctx,
+                )
+                .await?,
            ),
        })
    }
@@ -1192,7 +1200,7 @@ mod test {

        // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
+        let input_end = Key::from_hex("000000067f00000001000000ae0000002000").unwrap();
        let range = input_start..input_end;

        // Build an image layer to filter
@@ -1203,6 +1211,8 @@ mod test {
                harness.tenant_shard_id,
                &range,
                lsn,
+                &timeline.gate,
+                timeline.cancel.clone(),
                &ctx,
            )
            .await
@@ -1235,7 +1245,7 @@ mod test {
            let shard_identity = ShardIdentity::new(
                ShardNumber(shard_number),
                shard_count,
-                ShardStripeSize(0x8000),
+                ShardStripeSize(0x800),
            )
            .unwrap();
            let harness = TenantHarness::create_custom(
@@ -1268,6 +1278,8 @@ mod test {
                harness.tenant_shard_id,
                &range,
                lsn,
+                &timeline.gate,
+                timeline.cancel.clone(),
                &ctx,
            )
            .await
@@ -1287,12 +1299,12 @@ mod test {

            // This exact size and those below will need updating as/when the layer encoding changes, but
            // should be deterministic for a given version of the format, as we used no randomness generating the input.
-            assert_eq!(original_size, 1597440);
+            assert_eq!(original_size, 122880);

            match shard_number {
                0 => {
                    // We should have written out just one stripe for our shard identity
-                    assert_eq!(wrote_keys, 0x8000);
+                    assert_eq!(wrote_keys, 0x800);
                    let replacement = replacement.unwrap();

                    // We should have dropped some of the data
@@ -1300,7 +1312,7 @@ mod test {
                    assert!(replacement.metadata().file_size > 0);

                    // Assert that we dropped ~3/4 of the data.
-                    assert_eq!(replacement.metadata().file_size, 417792);
+                    assert_eq!(replacement.metadata().file_size, 49152);
                }
                1 => {
                    // Shard 1 has no keys in our input range
@@ -1309,19 +1321,19 @@ mod test {
                }
                2 => {
                    // Shard 2 has one stripes in the input range
-                    assert_eq!(wrote_keys, 0x8000);
+                    assert_eq!(wrote_keys, 0x800);
                    let replacement = replacement.unwrap();
                    assert!(replacement.metadata().file_size < original_size);
                    assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 417792);
+                    assert_eq!(replacement.metadata().file_size, 49152);
                }
                3 => {
                    // Shard 3 has two stripes in the input range
-                    assert_eq!(wrote_keys, 0x10000);
+                    assert_eq!(wrote_keys, 0x1000);
                    let replacement = replacement.unwrap();
                    assert!(replacement.metadata().file_size < original_size);
                    assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 811008);
+                    assert_eq!(replacement.metadata().file_size, 73728);
                }
                _ => unreachable!(),
            }
@@ -1346,6 +1358,8 @@ mod test {
            tenant.tenant_shard_id,
            &key_range,
            lsn,
+            &tline.gate,
+            tline.cancel.clone(),
            ctx,
        )
        .await?;
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -32,7 +32,6 @@ use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 // avoid binding to Write (conflicts with std::io::Write)
 // while being able to use std::fmt::Write's methods
-use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo};
 use crate::tenant::timeline::GetVectoredError;
@@ -307,11 +306,7 @@ impl GlobalResourceUnits {
            }
        };

-        // This is a sloppy update: concurrent updates to the counter will race, and the exact
-        // value of the metric might not be the exact latest value of GLOBAL_RESOURCES::dirty_bytes.
-        // That's okay: as long as the metric contains some recent value, it doesn't have to always
-        // be literally the last update.
-        TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);
+       

        self.dirty_bytes = size;

@@ -719,6 +714,8 @@ impl InMemoryLayer {
        ctx: &RequestContext,
        key_range: Option<Range<Key>>,
        l0_flush_global_state: &l0_flush::Inner,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
    ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
@@ -759,6 +756,8 @@ impl InMemoryLayer {
            self.tenant_shard_id,
            Key::MIN,
            self.start_lsn..end_lsn,
+            gate,
+            cancel,
            ctx,
        )
        .await?;
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -231,9 +231,7 @@ impl Layer {

        debug_assert!(owner.0.needs_download_blocking().unwrap().is_none());

-        timeline
-            .metrics
-            .resident_physical_size_add(metadata.file_size);
+        

        ResidentLayer { downloaded, owner }
    }
@@ -526,12 +524,6 @@ impl Layer {
                }
            }

-            // Update the timeline's visible bytes count
-            if let Some(tl) = self.0.timeline.upgrade() {
-                tl.metrics
-                    .visible_physical_size_gauge
-                    .add(self.0.desc.file_size)
-            }
        }
    }

@@ -540,23 +532,10 @@ impl Layer {
        use LayerVisibilityHint::*;
        match (old_visibility, visibility) {
            (Visible, Covered) => {
-                // Subtract this layer's contribution to the visible size metric
-                if let Some(tl) = self.0.timeline.upgrade() {
-                    debug_assert!(
-                        tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
-                    );
-                    tl.metrics
-                        .visible_physical_size_gauge
-                        .sub(self.0.desc.file_size)
-                }
+                
            }
            (Covered, Visible) => {
-                // Add this layer's contribution to the visible size metric
-                if let Some(tl) = self.0.timeline.upgrade() {
-                    tl.metrics
-                        .visible_physical_size_gauge
-                        .add(self.0.desc.file_size)
-                }
+                
            }
            (Covered, Covered) | (Visible, Visible) => {
                // no change
@@ -609,7 +588,6 @@ impl ResidentOrWantedEvicted {
            ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
            ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
                Some(strong) => {
-                    LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();

                    *self = ResidentOrWantedEvicted::Resident(strong.clone());

@@ -741,17 +719,8 @@ enum Status {

 impl Drop for LayerInner {
    fn drop(&mut self) {
-        // if there was a pending eviction, mark it cancelled here to balance metrics
-        if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit()
-        {
-            // eviction has already been started
-            LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
-
-            // eviction request is intentionally not honored as no one is present to wait for it
-            // and we could be delaying shutdown for nothing.
-        }
-
-        let timeline = self.timeline.upgrade();
+        
+        let timeline: Option<Arc<Timeline>> = self.timeline.upgrade();

        if let Some(timeline) = timeline.as_ref() {
            // Only need to decrement metrics if the timeline still exists: otherwise
@@ -759,13 +728,6 @@ impl Drop for LayerInner {
            timeline.metrics.dec_layer(&self.desc);

            if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
-                debug_assert!(
-                    timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
-                );
-                timeline
-                    .metrics
-                    .visible_physical_size_gauge
-                    .sub(self.desc.file_size);
            }
        }

@@ -777,7 +739,6 @@ impl Drop for LayerInner {

        let path = std::mem::take(&mut self.path);
        let file_name = self.layer_desc().layer_name();
-        let file_size = self.layer_desc().file_size;
        let meta = self.metadata();
        let status = self.status.take();

@@ -786,20 +747,13 @@ impl Drop for LayerInner {

            // carry this until we are finished for [`Layer::wait_drop`] support
            let _status = status;
-
            let Some(timeline) = timeline else {
                // no need to nag that timeline is gone: under normal situation on
                // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
-                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
                return;
            };

-            let Ok(_guard) = timeline.gate.enter() else {
-                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
-                return;
-            };
-
-            let removed = match std::fs::remove_file(path) {
+           match std::fs::remove_file(path) {
                Ok(()) => true,
                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                    // until we no longer do detaches by removing all local files before removing the
@@ -810,34 +764,16 @@ impl Drop for LayerInner {
                    // layers.
                    false
                }
-                Err(e) => {
-                    tracing::error!("failed to remove wanted deleted layer: {e}");
-                    LAYER_IMPL_METRICS.inc_delete_removes_failed();
+                Err(_e) => {
                    false
                }
            };

-            if removed {
-                timeline.metrics.resident_physical_size_sub(file_size);
-            }
-            let res = timeline
+            
+            let _a=timeline
                .remote_client
                .schedule_deletion_of_unlinked(vec![(file_name, meta)]);

-            if let Err(e) = res {
-                // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
-                // demonstrating this deadlock (without spawn_blocking): stop will drop
-                // queued items, which will have ResidentLayer's, and those drops would try
-                // to re-entrantly lock the RemoteTimelineClient inner state.
-                if !timeline.is_active() {
-                    tracing::info!("scheduling deletion on drop failed: {e:#}");
-                } else {
-                    tracing::warn!("scheduling deletion on drop failed: {e:#}");
-                }
-                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
-            } else {
-                LAYER_IMPL_METRICS.inc_completed_deletes();
-            }
        });
    }
 }
@@ -868,12 +804,6 @@ impl LayerInner {
        // This object acts as a RAII guard on these metrics: increment on construction
        timeline.metrics.inc_layer(&desc);

-        // New layers are visible by default. This metric is later updated on drop or in set_visibility
-        timeline
-            .metrics
-            .visible_physical_size_gauge
-            .add(desc.file_size);
-
        LayerInner {
            conf,
            path: local_path,
@@ -894,13 +824,9 @@ impl LayerInner {
    }

    fn delete_on_drop(&self) {
-        let res =
-            self.wanted_deleted
+                    let _a=self.wanted_deleted
                .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);

-        if res.is_ok() {
-            LAYER_IMPL_METRICS.inc_started_deletes();
-        }
    }

    /// Cancellation safe, however dropping the future and calling this method again might result
@@ -938,12 +864,6 @@ impl LayerInner {
            // drop the DownloadedLayer outside of the holding the guard
            drop(strong);

-            // idea here is that only one evicter should ever get to witness a strong reference,
-            // which means whenever get_or_maybe_download upgrades a weak, it must mark up a
-            // cancelled eviction and signal us, like it currently does.
-            //
-            // a second concurrent evict_and_wait will not see a strong reference.
-            LAYER_IMPL_METRICS.inc_started_evictions();
        }

        let changed = rx.changed();
@@ -983,15 +903,13 @@ impl LayerInner {
            // get_or_init_detached can:
            // - be fast (mutex lock) OR uncontested semaphore permit acquire
            // - be slow (wait for semaphore permit or closing)
-            let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
-
            let locked = self
                .inner
                .get_or_init_detached_measured(Some(&mut wait_for_download_recorder))
                .await
                .map(|mut guard| guard.get_and_upgrade().ok_or(guard));

-            scopeguard::ScopeGuard::into_inner(init_cancelled);
+

            match locked {
                // this path could had been a RwLock::read
@@ -1004,8 +922,7 @@ impl LayerInner {
                    // note that we also have dropped the Guard; this is fine, because we just made
                    // a state change and are holding a strong reference to be returned.
                    self.status.as_ref().unwrap().send_replace(Status::Resident);
-                    LAYER_IMPL_METRICS
-                        .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
+        

                    return Ok(strong);
                }
@@ -1032,8 +949,7 @@ impl LayerInner {
            .upgrade()
            .ok_or(DownloadError::TimelineShutdown)?;

-        // count cancellations, which currently remain largely unexpected
-        let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
+        

        // check if we really need to be downloaded: this can happen if a read access won the
        // semaphore before eviction.
@@ -1045,7 +961,6 @@ impl LayerInner {
            .await
            .map_err(DownloadError::PreStatFailed);

-        scopeguard::ScopeGuard::into_inner(init_cancelled);

        let needs_download = needs_download?;

@@ -1056,7 +971,7 @@ impl LayerInner {
            self.failpoint(failpoints::FailpointKind::AfterDeterminingLayerNeedsNoDownload)
                .await?;

-            LAYER_IMPL_METRICS.inc_init_needed_no_download();
+            

            return Ok(self.initialize_after_layer_is_on_disk(permit));
        };
@@ -1097,13 +1012,13 @@ impl LayerInner {
        async move {
            tracing::info!(%reason, "downloading on-demand");

-            let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
+            
            let res = self
                .download_init_and_wait(timeline, permit, ctx.attached_child())
                .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
                .await?;

-            scopeguard::ScopeGuard::into_inner(init_cancelled);
+      
            Ok(res)
        }
        .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
@@ -1121,7 +1036,6 @@ impl LayerInner {
                    "unexpectedly on-demand downloading for task kind {:?}",
                    ctx.task_kind()
                );
-                crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();

                let really_error =
                    matches!(b, Error) && !self.conf.ondemand_download_behavior_treat_error_as_warn;
@@ -1173,20 +1087,7 @@ impl LayerInner {

                let res = this.download_and_init(timeline, permit, &ctx).await;

-                if let Err(res) = tx.send(res) {
-                    match res {
-                        Ok(_res) => {
-                            tracing::debug!("layer initialized, but caller has been cancelled");
-                            LAYER_IMPL_METRICS.inc_init_completed_without_requester();
-                        }
-                        Err(e) => {
-                            tracing::info!(
-                                "layer file download failed, and caller has been cancelled: {e:?}"
-                            );
-                            LAYER_IMPL_METRICS.inc_download_failed_without_requester();
-                        }
-                    }
-                }
+                let _a =tx.send(res);
            }
            .in_current_span(),
        );
@@ -1238,21 +1139,9 @@ impl LayerInner {
                    }
                };
                tracing::info!(size=%self.desc.file_size, %latency_millis, "on-demand download successful");
-                timeline
-                    .metrics
-                    .resident_physical_size_add(self.desc.file_size);
+    
                self.consecutive_failures.store(0, Ordering::Relaxed);

-                let since_last_eviction = self
-                    .last_evicted_at
-                    .lock()
-                    .unwrap()
-                    .take()
-                    .map(|ts| ts.elapsed());
-                if let Some(since_last_eviction) = since_last_eviction {
-                    LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
-                }
-
                self.access_stats.record_residence_event();

                Ok(self.initialize_after_layer_is_on_disk(permit))
@@ -1407,12 +1296,8 @@ impl LayerInner {

            tracing::debug!("eviction started");

-            let res = self.wait_for_turn_and_evict(only_version).await;
-            // metrics: ignore the Ok branch, it is not done yet
-            if let Err(e) = res {
-                tracing::debug!(res=?Err::<(), _>(&e), "eviction completed");
-                LAYER_IMPL_METRICS.inc_eviction_cancelled(e);
-            }
+           let _a = self.wait_for_turn_and_evict(only_version).await;
+            
        };

        Self::spawn(start_evicting.instrument(span));
@@ -1532,21 +1417,13 @@ impl LayerInner {
        Self::spawn_blocking(move || {
            let _span = span.entered();

-            let res = self.evict_blocking(&timeline, &gate, &permit);
+            let res = self.evict_blocking( &gate, &permit);

            let waiters = self.inner.initializer_count();

-            if waiters > 0 {
-                LAYER_IMPL_METRICS.inc_evicted_with_waiters();
-            }
-
+            
            let completed_in = spawned_at.elapsed();
-            LAYER_IMPL_METRICS.record_time_to_evict(completed_in);
-
-            match res {
-                Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
-                Err(e) => LAYER_IMPL_METRICS.inc_eviction_cancelled(e),
-            }
+           

            tracing::debug!(?res, elapsed_ms=%completed_in.as_millis(), %waiters, "eviction completed");
        });
@@ -1557,7 +1434,6 @@ impl LayerInner {
    /// This is blocking only to do just one spawn_blocking hop compared to multiple via tokio::fs.
    fn evict_blocking(
        &self,
-        timeline: &Timeline,
        _gate: &gate::GateGuard,
        _permit: &heavier_once_cell::InitPermit,
    ) -> Result<(), EvictionCancelled> {
@@ -1570,17 +1446,7 @@ impl LayerInner {
                    Ok(elapsed) => {
                        let accessed_and_visible = self.access_stats.accessed()
                            && self.access_stats.visibility() == LayerVisibilityHint::Visible;
-                        if accessed_and_visible {
-                            // Only layers used for reads contribute to our "low residence" metric that is used
-                            // to detect thrashing.  Layers promoted for other reasons (e.g. compaction) are allowed
-                            // to be rapidly evicted without contributing to this metric.
-                            timeline
-                                .metrics
-                                .evictions_with_low_residence_duration
-                                .read()
-                                .unwrap()
-                                .observe(elapsed);
-                        }
+                        

                        tracing::info!(
                            residence_millis = elapsed.as_millis(),
@@ -1592,10 +1458,6 @@ impl LayerInner {
                        tracing::info!("evicted layer after unknown residence period");
                    }
                }
-                timeline.metrics.evictions.inc();
-                timeline
-                    .metrics
-                    .resident_physical_size_sub(self.desc.file_size);
            }
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                tracing::error!(
@@ -1812,13 +1674,7 @@ impl DownloadedLayer {
            match res {
                Ok(layer) => Ok(layer),
                Err(err) => {
-                    LAYER_IMPL_METRICS.inc_permanent_loading_failures();
-                    // We log this message once over the lifetime of `Self`
-                    // => Ok and good to log backtrace and path here.
-                    tracing::error!(
-                        "layer load failed, assuming permanent failure: {}: {err:?}",
-                        owner.path
-                    );
+                    
                    Err(err)
                }
            }
@@ -2026,218 +1882,6 @@ impl From<ResidentLayer> for Layer {
    }
 }

-use metrics::IntCounter;
-
-pub(crate) struct LayerImplMetrics {
-    started_evictions: IntCounter,
-    completed_evictions: IntCounter,
-    cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,
-
-    started_deletes: IntCounter,
-    completed_deletes: IntCounter,
-    failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,
-
-    rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
-    inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
-    redownload_after: metrics::Histogram,
-    time_to_evict: metrics::Histogram,
-}
-
-impl Default for LayerImplMetrics {
-    fn default() -> Self {
-        use enum_map::Enum;
-
-        // reminder: these will be pageserver_layer_* with "_total" suffix
-
-        let started_evictions = metrics::register_int_counter!(
-            "pageserver_layer_started_evictions",
-            "Evictions started in the Layer implementation"
-        )
-        .unwrap();
-        let completed_evictions = metrics::register_int_counter!(
-            "pageserver_layer_completed_evictions",
-            "Evictions completed in the Layer implementation"
-        )
-        .unwrap();
-
-        let cancelled_evictions = metrics::register_int_counter_vec!(
-            "pageserver_layer_cancelled_evictions_count",
-            "Different reasons for evictions to have been cancelled or failed",
-            &["reason"]
-        )
-        .unwrap();
-
-        let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-            let reason = EvictionCancelled::from_usize(i);
-            let s = reason.as_str();
-            cancelled_evictions.with_label_values(&[s])
-        }));
-
-        let started_deletes = metrics::register_int_counter!(
-            "pageserver_layer_started_deletes",
-            "Deletions on drop pending in the Layer implementation"
-        )
-        .unwrap();
-        let completed_deletes = metrics::register_int_counter!(
-            "pageserver_layer_completed_deletes",
-            "Deletions on drop completed in the Layer implementation"
-        )
-        .unwrap();
-
-        let failed_deletes = metrics::register_int_counter_vec!(
-            "pageserver_layer_failed_deletes_count",
-            "Different reasons for deletions on drop to have failed",
-            &["reason"]
-        )
-        .unwrap();
-
-        let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-            let reason = DeleteFailed::from_usize(i);
-            let s = reason.as_str();
-            failed_deletes.with_label_values(&[s])
-        }));
-
-        let rare_counters = metrics::register_int_counter_vec!(
-            "pageserver_layer_assumed_rare_count",
-            "Times unexpected or assumed rare event happened",
-            &["event"]
-        )
-        .unwrap();
-
-        let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-            let event = RareEvent::from_usize(i);
-            let s = event.as_str();
-            rare_counters.with_label_values(&[s])
-        }));
-
-        let inits_cancelled = metrics::register_int_counter!(
-            "pageserver_layer_inits_cancelled_count",
-            "Times Layer initialization was cancelled",
-        )
-        .unwrap();
-
-        let redownload_after = {
-            let minute = 60.0;
-            let hour = 60.0 * minute;
-            metrics::register_histogram!(
-                "pageserver_layer_redownloaded_after",
-                "Time between evicting and re-downloading.",
-                vec![
-                    10.0,
-                    30.0,
-                    minute,
-                    5.0 * minute,
-                    15.0 * minute,
-                    30.0 * minute,
-                    hour,
-                    12.0 * hour,
-                ]
-            )
-            .unwrap()
-        };
-
-        let time_to_evict = metrics::register_histogram!(
-            "pageserver_layer_eviction_held_permit_seconds",
-            "Time eviction held the permit.",
-            vec![0.001, 0.010, 0.100, 0.500, 1.000, 5.000]
-        )
-        .unwrap();
-
-        Self {
-            started_evictions,
-            completed_evictions,
-            cancelled_evictions,
-
-            started_deletes,
-            completed_deletes,
-            failed_deletes,
-
-            rare_counters,
-            inits_cancelled,
-            redownload_after,
-            time_to_evict,
-        }
-    }
-}
-
-impl LayerImplMetrics {
-    fn inc_started_evictions(&self) {
-        self.started_evictions.inc();
-    }
-    fn inc_completed_evictions(&self) {
-        self.completed_evictions.inc();
-    }
-    fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
-        self.cancelled_evictions[reason].inc()
-    }
-
-    fn inc_started_deletes(&self) {
-        self.started_deletes.inc();
-    }
-    fn inc_completed_deletes(&self) {
-        self.completed_deletes.inc();
-    }
-    fn inc_deletes_failed(&self, reason: DeleteFailed) {
-        self.failed_deletes[reason].inc();
-    }
-
-    /// Counted separatedly from failed layer deletes because we will complete the layer deletion
-    /// attempt regardless of failure to delete local file.
-    fn inc_delete_removes_failed(&self) {
-        self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
-    }
-
-    /// Expected rare just as cancellations are rare, but we could have cancellations separate from
-    /// the single caller which can start the download, so use this counter to separte them.
-    fn inc_init_completed_without_requester(&self) {
-        self.rare_counters[RareEvent::InitCompletedWithoutRequester].inc();
-    }
-
-    /// Expected rare because cancellations are unexpected, and failures are unexpected
-    fn inc_download_failed_without_requester(&self) {
-        self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
-    }
-
-    /// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
-    ///
-    /// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
-    /// Option.
-    fn inc_raced_wanted_evicted_accesses(&self) {
-        self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
-    }
-
-    /// These are only expected for [`Self::inc_init_cancelled`] amount when
-    /// running with remote storage.
-    fn inc_init_needed_no_download(&self) {
-        self.rare_counters[RareEvent::InitWithoutDownload].inc();
-    }
-
-    /// Expected rare because all layer files should be readable and good
-    fn inc_permanent_loading_failures(&self) {
-        self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
-    }
-
-    fn inc_init_cancelled(&self) {
-        self.inits_cancelled.inc()
-    }
-
-    fn record_redownloaded_after(&self, duration: std::time::Duration) {
-        self.redownload_after.observe(duration.as_secs_f64())
-    }
-
-    /// This would be bad if it ever happened, or mean extreme disk pressure. We should probably
-    /// instead cancel eviction if we would have read waiters. We cannot however separate reads
-    /// from other evictions, so this could have noise as well.
-    fn inc_evicted_with_waiters(&self) {
-        self.rare_counters[RareEvent::EvictedWithWaiters].inc();
-    }
-
-    /// Recorded at least initially as the permit is now acquired in async context before
-    /// spawn_blocking action.
-    fn record_time_to_evict(&self, duration: std::time::Duration) {
-        self.time_to_evict.observe(duration.as_secs_f64())
-    }
-}

 #[derive(Debug, Clone, Copy, enum_map::Enum)]
 enum EvictionCancelled {
@@ -2254,21 +1898,6 @@ enum EvictionCancelled {
    UnexpectedEvictedState,
 }

-impl EvictionCancelled {
-    fn as_str(&self) -> &'static str {
-        match self {
-            EvictionCancelled::LayerGone => "layer_gone",
-            EvictionCancelled::TimelineGone => "timeline_gone",
-            EvictionCancelled::VersionCheckFailed => "version_check_fail",
-            EvictionCancelled::FileNotFound => "file_not_found",
-            EvictionCancelled::RemoveFailed => "remove_failed",
-            EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
-            EvictionCancelled::LostToDownload => "lost_to_download",
-            EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
-            EvictionCancelled::UnexpectedEvictedState => "unexpected_evicted_state",
-        }
-    }
-}

 #[derive(enum_map::Enum)]
 enum DeleteFailed {
@@ -2276,15 +1905,6 @@ enum DeleteFailed {
    DeleteSchedulingFailed,
 }

-impl DeleteFailed {
-    fn as_str(&self) -> &'static str {
-        match self {
-            DeleteFailed::TimelineGone => "timeline_gone",
-            DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
-        }
-    }
-}
-
 #[derive(enum_map::Enum)]
 enum RareEvent {
    RemoveOnDropFailed,
@@ -2296,21 +1916,3 @@ enum RareEvent {
    EvictedWithWaiters,
 }

-impl RareEvent {
-    fn as_str(&self) -> &'static str {
-        use RareEvent::*;
-
-        match self {
-            RemoveOnDropFailed => "remove_on_drop_failed",
-            InitCompletedWithoutRequester => "init_completed_without",
-            DownloadFailedWithoutRequester => "download_failed_without",
-            UpgradedWantedEvicted => "raced_wanted_evicted",
-            InitWithoutDownload => "init_needed_no_download",
-            PermanentLoadingFailure => "permanent_loading_failure",
-            EvictedWithWaiters => "evicted_with_waiters",
-        }
-    }
-}
-
-pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
-    once_cell::sync::Lazy::new(LayerImplMetrics::default);
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -238,7 +238,7 @@ async fn smoke_test() {
        rtc.get_remote_physical_size(),
        dummy_layer.metadata().file_size
    );
-    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
+   
 }

 /// This test demonstrates a previous hang when a eviction and deletion were requested at the same
@@ -311,11 +311,6 @@ async fn evict_and_wait_on_wanted_deleted() {

    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;

-    assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get());
-    assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get());
-    assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
-    assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
-    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
 }

 /// This test ensures we are able to read the layer while the layer eviction has been
@@ -366,7 +361,7 @@ fn read_wins_pending_eviction() {
        tokio::time::timeout(ADVANCE, &mut evict_and_wait)
            .await
            .expect_err("should had been a timeout since we are holding the layer resident");
-        assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
+       

        let (completion, barrier) = utils::completion::channel();
        let (arrival, arrived_at_barrier) = utils::completion::channel();
@@ -398,18 +393,7 @@ fn read_wins_pending_eviction() {

        // works as intended: evictions lose to "downloads"
        assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
-        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
-
-        // this is not wrong: the eviction is technically still "on the way" as it's still queued
-        // because of a failpoint
-        assert_eq!(
-            0,
-            LAYER_IMPL_METRICS
-                .cancelled_evictions
-                .values()
-                .map(|ctr| ctr.get())
-                .sum::<u64>()
-        );
+        

        drop(completion);

@@ -417,26 +401,9 @@ fn read_wins_pending_eviction() {
        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1)
            .await;

-        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
+        

-        // now we finally can observe the original eviction failing
-        // it would had been possible to observe it earlier, but here it is guaranteed to have
-        // happened.
-        assert_eq!(
-            1,
-            LAYER_IMPL_METRICS
-                .cancelled_evictions
-                .values()
-                .map(|ctr| ctr.get())
-                .sum::<u64>()
-        );
-
-        assert_eq!(
-            1,
-            LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::AlreadyReinitialized].get()
-        );
-
-        assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
+        
    });
 }

@@ -499,7 +466,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
        tokio::time::timeout(ADVANCE, &mut evict_and_wait)
            .await
            .expect_err("should had been a timeout since we are holding the layer resident");
-        assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
+        

        let (completion1, barrier) = utils::completion::channel();
        let mut completion1 = Some(completion1);
@@ -534,20 +501,9 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {

        // works as intended: evictions lose to "downloads"
        assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
-        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
+       

-        // this is not wrong: the eviction is technically still "on the way" as it's still queued
-        // because of a failpoint
-        assert_eq!(
-            0,
-            LAYER_IMPL_METRICS
-                .cancelled_evictions
-                .values()
-                .map(|ctr| ctr.get())
-                .sum::<u64>()
-        );
-
-        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
+       

        // configure another failpoint for the second eviction -- evictions are per initialization,
        // so now that we've reinitialized the inner, we get to run two of them at the same time.
@@ -567,13 +523,10 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {

        arrived_at_barrier.wait().await;

-        assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());
+       

-        let mut release_earlier_eviction = |expected_reason| {
-            assert_eq!(
-                0,
-                LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
-            );
+        let mut release_earlier_eviction = |_expected_reason| {
+            

            drop(completion1.take().unwrap());

@@ -586,10 +539,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
                )
                .await;

-                assert_eq!(
-                    1,
-                    LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
-                );
+                
            }
        };

@@ -612,19 +562,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
            .expect("eviction goes through now that spawn_blocking is unclogged")
            .expect("eviction should succeed, because version matches");

-        assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
-
-        // ensure the cancelled are unchanged
-        assert_eq!(
-            1,
-            LAYER_IMPL_METRICS
-                .cancelled_evictions
-                .values()
-                .map(|ctr| ctr.get())
-                .sum::<u64>()
-        );
-
-        assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
+       
    });
 }

@@ -714,8 +652,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
        .unwrap_err();
    assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}");

-    // failpoint is not counted as cancellation either
-    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
+    
 }

 #[tokio::test(start_paused = true)]
@@ -892,8 +829,7 @@ async fn eviction_cancellation_on_drop() {
                .expect_err("should had been a timeout since we are holding the layer resident");
        }

-        // 1 == we only evict one of the layers
-        assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
+        

        drop(resident);

@@ -902,10 +838,7 @@ async fn eviction_cancellation_on_drop() {

        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;

-        assert_eq!(
-            1,
-            LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::LayerGone].get()
-        );
+        
    }
 }

--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -10,7 +10,7 @@ use std::time::{Duration, Instant};
 use once_cell::sync::Lazy;
 use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD;
 use rand::Rng;
-use scopeguard::defer;
+
 use tokio::sync::{Semaphore, SemaphorePermit};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -19,7 +19,6 @@ use utils::completion::Barrier;
 use utils::pausable_failpoint;

 use crate::context::{DownloadBehavior, RequestContext};
-use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
 use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind};
 use crate::tenant::throttle::Stats;
 use crate::tenant::timeline::CompactionError;
@@ -85,17 +84,15 @@ pub(crate) enum BackgroundLoopKind {
    SecondaryDownload,
 }

-pub struct BackgroundLoopSemaphorePermit<'a> {
+pub struct BackgroundLoopSemaphorePermit {
    _permit: SemaphorePermit<'static>,
-    _recorder: BackgroundLoopSemaphoreMetricsRecorder<'a>,
 }

 /// Acquires a semaphore permit, to limit concurrent background jobs.
 pub(crate) async fn acquire_concurrency_permit(
    loop_kind: BackgroundLoopKind,
    _ctx: &RequestContext,
-) -> BackgroundLoopSemaphorePermit<'static> {
-    let mut recorder = metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind);
+) -> BackgroundLoopSemaphorePermit {

    if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation {
        pausable_failpoint!("initial-size-calculation-permit-pause");
@@ -108,11 +105,9 @@ pub(crate) async fn acquire_concurrency_permit(
    };
    let permit = semaphore.acquire().await.expect("should never close");

-    recorder.acquired();

    BackgroundLoopSemaphorePermit {
        _permit: permit,
-        _recorder: recorder,
    }
 }

@@ -135,8 +130,7 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
                    _ = cancel.cancelled() => return Ok(()),
                    _ = Barrier::maybe_wait(can_start) => {}
                };
-                TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
-                defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
+
                compaction_loop(tenant, cancel)
                    // If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py
                    .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
@@ -161,8 +155,6 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
                    _ = cancel.cancelled() => return Ok(()),
                    _ = Barrier::maybe_wait(can_start) => {}
                };
-                TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
-                defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
                gc_loop(tenant, cancel)
                    .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                    .await;
@@ -186,8 +178,7 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
                    _ = cancel.cancelled() => return Ok(()),
                    _ = Barrier::maybe_wait(can_start) => {}
                };
-                TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
-                defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
+
                tenant_housekeeping_loop(tenant, cancel)
                    .instrument(info_span!("tenant_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                    .await;
@@ -592,8 +583,5 @@ pub(crate) fn warn_when_period_overrun(
            ?task,
            "task iteration took longer than the configured period"
        );
-        metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
-            .with_label_values(&[task.into(), &format!("{}", period.as_secs())])
-            .inc();
    }
 }
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -45,8 +45,8 @@ pub struct Stats {
 }

 pub enum ThrottleResult {
-    NotThrottled { end: Instant },
-    Throttled { end: Instant },
+    NotThrottled {  },
+    Throttled {  },
 }

 impl Throttle {
@@ -114,7 +114,7 @@ impl Throttle {
        let inner = self.inner.load_full(); // clones the `Inner` Arc

        if !inner.enabled {
-            return ThrottleResult::NotThrottled { end: start };
+            return ThrottleResult::NotThrottled { };
        }

        self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
@@ -127,9 +127,9 @@ impl Throttle {
            let wait_time = end - start;
            self.sum_throttled_usecs
                .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
-            ThrottleResult::Throttled { end }
+            ThrottleResult::Throttled {  }
        } else {
-            ThrottleResult::NotThrottled { end: start }
+            ThrottleResult::NotThrottled { }
        }
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -56,7 +56,8 @@ use crate::tenant::storage_layer::batch_split_writer::{
 use crate::tenant::storage_layer::filter_iterator::FilterIterator;
 use crate::tenant::storage_layer::merge_iterator::MergeIterator;
 use crate::tenant::storage_layer::{
-    AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
+    AsLayerDesc, LayerVisibilityHint, PersistentLayerDesc, PersistentLayerKey,
+    ValueReconstructState,
 };
 use crate::tenant::tasks::log_compaction_error;
 use crate::tenant::timeline::{
@@ -69,6 +70,13 @@ use crate::virtual_file::{MaybeFatalIo, VirtualFile};
 /// Maximum number of deltas before generating an image layer in bottom-most compaction.
 const COMPACTION_DELTA_THRESHOLD: usize = 5;

+/// Ratio of shard-local pages below which we trigger shard ancestor layer rewrites. 0.3 means that
+/// <= 30% of layer pages must belong to the descendant shard to rewrite the layer.
+///
+/// We choose a value < 0.5 to avoid rewriting all visible layers every time we do a power-of-two
+/// shard split, which gets expensive for large tenants.
+const ANCESTOR_COMPACTION_REWRITE_THRESHOLD: f64 = 0.3;
+
 #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
 pub struct GcCompactionJobId(pub usize);

@@ -80,6 +88,7 @@ impl std::fmt::Display for GcCompactionJobId {

 pub struct GcCompactionCombinedSettings {
    pub gc_compaction_enabled: bool,
+    pub gc_compaction_verification: bool,
    pub gc_compaction_initial_threshold_kb: u64,
    pub gc_compaction_ratio_percent: u64,
 }
@@ -225,6 +234,7 @@ impl GcCompactionQueue {
            gc_compaction_enabled,
            gc_compaction_initial_threshold_kb,
            gc_compaction_ratio_percent,
+            ..
        } = timeline.get_gc_compaction_settings();
        if !gc_compaction_enabled {
            return Ok(());
@@ -747,8 +757,8 @@ impl KeyHistoryRetention {
    async fn pipe_to(
        self,
        key: Key,
-        delta_writer: &mut SplitDeltaLayerWriter,
-        mut image_writer: Option<&mut SplitImageLayerWriter>,
+        delta_writer: &mut SplitDeltaLayerWriter<'_>,
+        mut image_writer: Option<&mut SplitImageLayerWriter<'_>>,
        stat: &mut CompactionStatistics,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
@@ -788,6 +798,123 @@ impl KeyHistoryRetention {
        }
        Ok(())
    }
+
+    /// Verify that `key` can be reconstructed from this retention by replaying its logs.
+    async fn verify(
+        &self,
+        key: Key,
+        base_img_from_ancestor: &Option<(Key, Lsn, Bytes)>,
+        full_history: &[(Key, Lsn, Value)],
+        tline: &Arc<Timeline>,
+    ) -> anyhow::Result<()> {
+        // Usually the min_lsn should be the first record but we do a full iteration to be safe.
+        let Some(min_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).min() else {
+            // This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`.
+            return Ok(());
+        };
+        let Some(max_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).max() else {
+            // Unreachable: `min()` above returned `Some`, so `full_history` is non-empty.
+            return Ok(());
+        };
+        let mut base_img = base_img_from_ancestor
+            .as_ref()
+            .map(|(_, lsn, img)| (*lsn, img));
+        let mut history = Vec::new();
+
+        async fn collect_and_verify(
+            key: Key,
+            lsn: Lsn,
+            base_img: &Option<(Lsn, &Bytes)>,
+            history: &[(Lsn, &NeonWalRecord)],
+            tline: &Arc<Timeline>,
+            skip_empty: bool,
+        ) -> anyhow::Result<()> {
+            if base_img.is_none() && history.is_empty() {
+                if skip_empty {
+                    return Ok(());
+                }
+                anyhow::bail!("verification failed: key {} has no history at {}", key, lsn);
+            };
+
+            let mut records = history
+                .iter()
+                .map(|(lsn, val)| (*lsn, (*val).clone()))
+                .collect::<Vec<_>>();
+
+            // WAL redo requires records in the reverse LSN order
+            records.reverse();
+            let data = ValueReconstructState {
+                img: base_img.as_ref().map(|(lsn, img)| (*lsn, (*img).clone())),
+                records,
+            };
+
+            tline
+                .reconstruct_value(key, lsn, data, RedoAttemptType::GcCompaction)
+                .await
+                .with_context(|| format!("verification failed for key {} at lsn {}", key, lsn))?;
+
+            Ok(())
+        }
+
+        for (retain_lsn, KeyLogAtLsn(logs)) in &self.below_horizon {
+            for (lsn, val) in logs {
+                match val {
+                    Value::Image(img) => {
+                        base_img = Some((*lsn, img));
+                        history.clear();
+                    }
+                    Value::WalRecord(rec) if val.will_init() => {
+                        base_img = None;
+                        history.clear();
+                        history.push((*lsn, rec));
+                    }
+                    Value::WalRecord(rec) => {
+                        history.push((*lsn, rec));
+                    }
+                }
+            }
+            if *retain_lsn >= min_lsn {
+                // Only verify after the key appears in the full history for the first time.
+
+                // We don't modify history: in theory, we could replace the history with a single
+                // image as in `generate_key_retention` to make redos at later LSNs faster. But we
+                // want to verify everything as if they are read from the real layer map.
+                collect_and_verify(key, *retain_lsn, &base_img, &history, tline, false)
+                    .await
+                    .context("below horizon retain_lsn")?;
+            }
+        }
+
+        for (lsn, val) in &self.above_horizon.0 {
+            match val {
+                Value::Image(img) => {
+                    // Above the horizon: verify the state so far before this image resets it.
+                    collect_and_verify(key, *lsn, &base_img, &history, tline, true)
+                        .await
+                        .context("above horizon full image")?;
+                    base_img = Some((*lsn, img));
+                    history.clear();
+                }
+                Value::WalRecord(rec) if val.will_init() => {
+                    // Likewise for init records: verify the state so far before it is reset.
+                    collect_and_verify(key, *lsn, &base_img, &history, tline, true)
+                        .await
+                        .context("above horizon init record")?;
+                    base_img = None;
+                    history.clear();
+                    history.push((*lsn, rec));
+                }
+                Value::WalRecord(rec) => {
+                    history.push((*lsn, rec));
+                }
+            }
+        }
+        // Ensure the latest record is readable.
+        collect_and_verify(key, max_lsn, &base_img, &history, tline, false)
+            .await
+            .context("latest record")?;
+        Ok(())
+    }
 }

 #[derive(Debug, Serialize, Default)]
@@ -1006,16 +1133,15 @@ impl Timeline {

        // 1. L0 Compact
        let l0_outcome = {
-            let timer = self.metrics.compact_time_histo.start_timer();
-            let l0_outcome = self
+            
+            self
                .compact_level0(
                    target_file_size,
                    options.flags.contains(CompactFlags::ForceL0Compaction),
                    ctx,
                )
-                .await?;
-            timer.stop_and_record();
-            l0_outcome
+                .await?
+        
        };

        if options.flags.contains(CompactFlags::OnlyL0Compaction) {
@@ -1112,14 +1238,23 @@ impl Timeline {
        let partition_count = self.partitioning.read().0.0.parts.len();

        // 4. Shard ancestor compaction
-
-        if self.shard_identity.count >= ShardCount::new(2) {
+        if self.get_compaction_shard_ancestor() && self.shard_identity.count >= ShardCount::new(2) {
            // Limit the number of layer rewrites to the number of partitions: this means its
            // runtime should be comparable to a full round of image layer creations, rather than
            // being potentially much longer.
            let rewrite_max = partition_count;

-            self.compact_shard_ancestors(rewrite_max, ctx).await?;
+            let outcome = self
+                .compact_shard_ancestors(
+                    rewrite_max,
+                    options.flags.contains(CompactFlags::YieldForL0),
+                    ctx,
+                )
+                .await?;
+            match outcome {
+                CompactionOutcome::Pending | CompactionOutcome::YieldForL0 => return Ok(outcome),
+                CompactionOutcome::Done | CompactionOutcome::Skipped => {}
+            }
        }

        Ok(CompactionOutcome::Done)
@@ -1136,8 +1271,10 @@ impl Timeline {
    async fn compact_shard_ancestors(
        self: &Arc<Self>,
        rewrite_max: usize,
+        yield_for_l0: bool,
        ctx: &RequestContext,
-    ) -> Result<(), CompactionError> {
+    ) -> Result<CompactionOutcome, CompactionError> {
+        let mut outcome = CompactionOutcome::Done;
        let mut drop_layers = Vec::new();
        let mut layers_to_rewrite: Vec<Layer> = Vec::new();

@@ -1148,15 +1285,13 @@ impl Timeline {
        // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
        // are rewriting layers.
        let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();
-
-        tracing::info!(
-            "starting shard ancestor compaction, latest_gc_cutoff: {}, pitr cutoff {}",
-            *latest_gc_cutoff,
-            self.gc_info.read().unwrap().cutoffs.time
-        );
+        let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time;

        let layers = self.layers.read().await;
-        for layer_desc in layers.layer_map()?.iter_historic_layers() {
+        let layers_iter = layers.layer_map()?.iter_historic_layers();
+        let (layers_total, mut layers_checked) = (layers_iter.len(), 0);
+        for layer_desc in layers_iter {
+            layers_checked += 1;
            let layer = layers.get_from_desc(&layer_desc);
            if layer.metadata().shard.shard_count == self.shard_identity.count {
                // This layer does not belong to a historic ancestor, no need to re-image it.
@@ -1171,8 +1306,8 @@ impl Timeline {
                // This ancestral layer only covers keys that belong to other shards.
                // We include the full metadata in the log: if we had some critical bug that caused
                // us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers.
-                info!(%layer, old_metadata=?layer.metadata(),
-                    "dropping layer after shard split, contains no keys for this shard.",
+                debug!(%layer, old_metadata=?layer.metadata(),
+                    "dropping layer after shard split, contains no keys for this shard",
                );

                if cfg!(debug_assertions) {
@@ -1200,14 +1335,17 @@ impl Timeline {
                continue;
            }

-            // Don't bother re-writing a layer unless it will at least halve its size
+            // Only rewrite a layer if we can reclaim significant space: skip it when its share of
+            // shard-local pages exceeds the threshold. A count of `u32::MAX` bypasses this check.
            if layer_local_page_count != u32::MAX
-                && layer_local_page_count > layer_raw_page_count / 2
+                && layer_local_page_count as f64 / layer_raw_page_count as f64
+                    > ANCESTOR_COMPACTION_REWRITE_THRESHOLD
            {
                debug!(%layer,
-                    "layer is already mostly local ({}/{}), not rewriting",
-                    layer_local_page_count,
-                    layer_raw_page_count
+                    "layer has a large share of local pages \
+                        ({layer_local_page_count}/{layer_raw_page_count} > \
+                        {ANCESTOR_COMPACTION_REWRITE_THRESHOLD}), not rewriting",
                );
+                continue;
            }

@@ -1219,12 +1355,19 @@ impl Timeline {
                continue;
            }

+            // We do not yet implement rewrite of delta layers.
            if layer_desc.is_delta() {
-                // We do not yet implement rewrite of delta layers
                debug!(%layer, "Skipping rewrite of delta layer");
                continue;
            }

+            // We don't bother rewriting layers that aren't visible, since these won't be needed by
+            // reads and will likely be garbage collected soon.
+            if layer.visibility() != LayerVisibilityHint::Visible {
+                debug!(%layer, "Skipping rewrite of invisible layer");
+                continue;
+            }
+
            // Only rewrite layers if their generations differ.  This guarantees:
            //  - that local rewrite is safe, as local layer paths will differ between existing layer and rewritten one
            //  - that the layer is persistent in remote storage, as we only see old-generation'd layer via loading from remote storage
@@ -1234,19 +1377,36 @@ impl Timeline {
            }

            if layers_to_rewrite.len() >= rewrite_max {
-                tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
+                debug!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
                    layers_to_rewrite.len()
                );
-                continue;
+                outcome = CompactionOutcome::Pending;
+                break;
            }

            // Fall through: all our conditions for doing a rewrite passed.
            layers_to_rewrite.push(layer);
        }

-        // Drop read lock on layer map before we start doing time-consuming I/O
+        // Drop read lock on layer map before we start doing time-consuming I/O.
        drop(layers);

+        // Drop out early if there's nothing to do.
+        if layers_to_rewrite.is_empty() && drop_layers.is_empty() {
+            return Ok(CompactionOutcome::Done);
+        }
+
+        info!(
+            "starting shard ancestor compaction, rewriting {} layers and dropping {} layers, \
+                checked {layers_checked}/{layers_total} layers \
+                (latest_gc_cutoff={} pitr_cutoff={})",
+            layers_to_rewrite.len(),
+            drop_layers.len(),
+            *latest_gc_cutoff,
+            pitr_cutoff,
+        );
+        let started = Instant::now();
+
        let mut replace_image_layers = Vec::new();

        for layer in layers_to_rewrite {
@@ -1254,13 +1414,15 @@ impl Timeline {
                return Err(CompactionError::ShuttingDown);
            }

-            tracing::info!(layer=%layer, "Rewriting layer after shard split...");
+            info!(layer=%layer, "rewriting layer after shard split");
            let mut image_layer_writer = ImageLayerWriter::new(
                self.conf,
                self.timeline_id,
                self.tenant_shard_id,
                &layer.layer_desc().key_range,
                layer.layer_desc().image_layer_lsn(),
+                &self.gate,
+                self.cancel.clone(),
                ctx,
            )
            .await
@@ -1292,7 +1454,7 @@ impl Timeline {
                    .map_err(CompactionError::Other)?;
                let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
                    .map_err(CompactionError::Other)?;
-                tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
+                info!(layer=%new_layer, "rewrote layer, {} -> {} bytes",
                    layer.metadata().file_size,
                    new_layer.metadata().file_size);

@@ -1302,6 +1464,26 @@ impl Timeline {
                // the layer has no data for us with the ShardedRange check above, but
                drop_layers.push(layer);
            }
+
+            // Yield for L0 compaction if necessary, but make sure we update the layer map below
+            // with the work we've already done.
+            if yield_for_l0
+                && self
+                    .l0_compaction_trigger
+                    .notified()
+                    .now_or_never()
+                    .is_some()
+            {
+                info!("shard ancestor compaction yielding for L0 compaction");
+                outcome = CompactionOutcome::YieldForL0;
+                break;
+            }
+        }
+
+        for layer in &drop_layers {
+            info!(%layer, old_metadata=?layer.metadata(),
+                "dropping layer after shard split (no keys for this shard)",
+            );
        }

        // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
@@ -1319,17 +1501,36 @@ impl Timeline {
        // necessary for correctness, but it simplifies testing, and avoids proceeding with another
        // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
        // load.
-        match self.remote_client.wait_completion().await {
-            Ok(()) => (),
-            Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
-            Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
-                return Err(CompactionError::ShuttingDown);
+        if outcome != CompactionOutcome::YieldForL0 {
+            info!("shard ancestor compaction waiting for uploads");
+            tokio::select! {
+                result = self.remote_client.wait_completion() => match result {
+                    Ok(()) => {},
+                    Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
+                    Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
+                        return Err(CompactionError::ShuttingDown);
+                    }
+                },
+                // Don't wait if there's L0 compaction to do. We don't need to update the outcome
+                // here, because we've already done the actual work.
+                _ = self.l0_compaction_trigger.notified(), if yield_for_l0 => {},
            }
        }

+        info!(
+            "shard ancestor compaction done in {:.3}s{}",
+            started.elapsed().as_secs_f64(),
+            match outcome {
+                CompactionOutcome::Pending =>
+                    format!(", with pending work (rewrite_max={rewrite_max})"),
+                CompactionOutcome::YieldForL0 => String::from(", yielding for L0 compaction"),
+                CompactionOutcome::Skipped | CompactionOutcome::Done => String::new(),
+            }
+        );
+
        fail::fail_point!("compact-shard-ancestors-persistent");

-        Ok(())
+        Ok(outcome)
    }

    /// Update the LayerVisibilityHint of layers covered by image layers, based on whether there is
@@ -1861,6 +2062,8 @@ impl Timeline {
                                debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
                                lsn_range.clone()
                            },
+                            &self.gate,
+                            self.cancel.clone(),
                            ctx,
                        )
                        .await
@@ -2148,6 +2351,7 @@ impl Timeline {
    /// ```
    ///
    /// Note that `accumulated_values` must be sorted by LSN and should belong to a single key.
+    #[allow(clippy::too_many_arguments)]
    pub(crate) async fn generate_key_retention(
        self: &Arc<Timeline>,
        key: Key,
@@ -2156,6 +2360,7 @@ impl Timeline {
        retain_lsn_below_horizon: &[Lsn],
        delta_threshold_cnt: usize,
        base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
+        verification: bool,
    ) -> anyhow::Result<KeyHistoryRetention> {
        // Pre-checks for the invariants

@@ -2242,8 +2447,8 @@ impl Timeline {
            "should have at least below + above horizon batches"
        );
        let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new();
-        if let Some((key, lsn, img)) = base_img_from_ancestor {
-            replay_history.push((key, lsn, Value::Image(img)));
+        if let Some((key, lsn, ref img)) = base_img_from_ancestor {
+            replay_history.push((key, lsn, Value::Image(img.clone())));
        }

        /// Generate debug information for the replay history
@@ -2357,22 +2562,15 @@ impl Timeline {
            // Whether to reconstruct the image. In debug mode, we will generate an image
            // at every retain_lsn to ensure data is not corrupted, but we won't put the
            // image into the final layer.
-            let generate_image = produce_image || debug_mode;
-            if produce_image {
+            let img_and_lsn = if produce_image {
                records_since_last_image = 0;
-            }
-            let img_and_lsn = if generate_image {
                let replay_history_for_debug = if debug_mode {
                    Some(replay_history.clone())
                } else {
                    None
                };
                let replay_history_for_debug_ref = replay_history_for_debug.as_deref();
-                let history = if produce_image {
-                    std::mem::take(&mut replay_history)
-                } else {
-                    replay_history.clone()
-                };
+                let history = std::mem::take(&mut replay_history);
                let mut img = None;
                let mut records = Vec::with_capacity(history.len());
                if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
@@ -2407,6 +2605,7 @@ impl Timeline {
                        records.push((lsn, rec));
                    }
                }
+                // WAL redo requires records in the reverse LSN order
                records.reverse();
                let state = ValueReconstructState { img, records };
                // last batch does not generate image so i is always in range, unless we force generate
@@ -2439,10 +2638,16 @@ impl Timeline {
        assert_eq!(retention.len(), lsn_split_points.len() + 1);
        for (idx, logs) in retention.into_iter().enumerate() {
            if idx == lsn_split_points.len() {
-                return Ok(KeyHistoryRetention {
+                let retention = KeyHistoryRetention {
                    below_horizon: result,
                    above_horizon: KeyLogAtLsn(logs),
-                });
+                };
+                if verification {
+                    retention
+                        .verify(key, &base_img_from_ancestor, full_history, self)
+                        .await?;
+                }
+                return Ok(retention);
            } else {
                result.push((lsn_split_points[idx], KeyLogAtLsn(logs)));
            }
@@ -2909,6 +3114,9 @@ impl Timeline {
            }
            (false, res)
        };
+
+        let verification = self.get_gc_compaction_settings().gc_compaction_verification;
+
        info!(
            "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} min_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}, has_data_below={}",
            job_desc.selected_layers.len(),
@@ -3055,6 +3263,8 @@ impl Timeline {
                    job_desc.compaction_key_range.start,
                    lowest_retain_lsn,
                    self.get_compaction_target_size(),
+                    &self.gate,
+                    self.cancel.clone(),
                    ctx,
                )
                .await
@@ -3071,6 +3281,8 @@ impl Timeline {
            self.tenant_shard_id,
            lowest_retain_lsn..end_lsn,
            self.get_compaction_target_size(),
+            &self.gate,
+            self.cancel.clone(),
        )
        .await
        .context("failed to create delta layer writer")
@@ -3167,6 +3379,8 @@ impl Timeline {
                                self.tenant_shard_id,
                                desc.key_range.start,
                                desc.lsn_range.clone(),
+                                &self.gate,
+                                self.cancel.clone(),
                                ctx,
                            )
                            .await
@@ -3184,6 +3398,8 @@ impl Timeline {
                                self.tenant_shard_id,
                                job_desc.compaction_key_range.end,
                                desc.lsn_range.clone(),
+                                &self.gate,
+                                self.cancel.clone(),
                                ctx,
                            )
                            .await
@@ -3225,6 +3441,7 @@ impl Timeline {
                            .await
                            .context("failed to get ancestor image")
                            .map_err(CompactionError::Other)?,
+                        verification,
                    )
                    .await
                    .context("failed to generate key retention")
@@ -3265,6 +3482,7 @@ impl Timeline {
                    .await
                    .context("failed to get ancestor image")
                    .map_err(CompactionError::Other)?,
+                verification,
            )
            .await
            .context("failed to generate key retention")
@@ -3753,6 +3971,8 @@ impl CompactionJobExecutor for TimelineAdaptor {
            self.timeline.tenant_shard_id,
            key_range.start,
            lsn_range.clone(),
+            &self.timeline.gate,
+            self.timeline.cancel.clone(),
            ctx,
        )
        .await?;
@@ -3820,7 +4040,7 @@ impl TimelineAdaptor {
        key_range: &Range<Key>,
        ctx: &RequestContext,
    ) -> Result<(), CreateImageLayersError> {
-        let timer = self.timeline.metrics.create_images_time_histo.start_timer();
+       

        let image_layer_writer = ImageLayerWriter::new(
            self.timeline.conf,
@@ -3828,6 +4048,8 @@ impl TimelineAdaptor {
            self.timeline.tenant_shard_id,
            key_range,
            lsn,
+            &self.timeline.gate,
+            self.timeline.cancel.clone(),
            ctx,
        )
        .await?;
@@ -3864,7 +4086,7 @@ impl TimelineAdaptor {
            self.new_images.push(image_layer);
        }

-        timer.stop_and_record();
+        

        Ok(())
    }
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -30,6 +30,7 @@ use crate::tenant::storage_layer::{
    AsLayerDesc as _, DeltaLayerWriter, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer,
    ValuesReconstructState,
 };
+use crate::tenant::timeline::VersionedKeySpaceQuery;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};

 #[derive(Debug, thiserror::Error)]
@@ -212,13 +213,9 @@ async fn generate_tombstone_image_layer(
        }
    }

+    let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key_range.clone()), image_lsn);
    let data = ancestor
-        .get_vectored_impl(
-            KeySpace::single(key_range.clone()),
-            image_lsn,
-            &mut reconstruct_state,
-            ctx,
-        )
+        .get_vectored_impl(query, &mut reconstruct_state, ctx)
        .await
        .context("failed to retrieve aux keys")
        .map_err(|e| Error::launder(e, Error::Prepare))?;
@@ -231,6 +228,8 @@ async fn generate_tombstone_image_layer(
            detached.tenant_shard_id,
            &key_range,
            image_lsn,
+            &detached.gate,
+            detached.cancel.clone(),
            ctx,
        )
        .await
@@ -779,6 +778,8 @@ async fn copy_lsn_prefix(
        target_timeline.tenant_shard_id,
        layer.layer_desc().key_range.start,
        layer.layer_desc().lsn_range.start..end_lsn,
+        &target_timeline.gate,
+        target_timeline.cancel.clone(),
        ctx,
    )
    .await
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -126,7 +126,7 @@ impl Timeline {
    ) -> ControlFlow<(), Instant> {
        debug!("eviction iteration: {policy:?}");
        let start = Instant::now();
-        let (period, threshold) = match policy {
+        let (period, _) = match policy {
            EvictionPolicy::NoEviction => {
                // check again in 10 seconds; XXX config watch mechanism
                return ControlFlow::Continue(Instant::now() + Duration::from_secs(10));
@@ -159,16 +159,6 @@ impl Timeline {
            period,
            BackgroundLoopKind::Eviction,
        );
-        // FIXME: if we were to mix policies on a pageserver, we would have no way to sense this. I
-        // don't think that is a relevant fear however, and regardless the imitation should be the
-        // most costly part.
-        crate::metrics::EVICTION_ITERATION_DURATION
-            .get_metric_with_label_values(&[
-                &format!("{}", period.as_secs()),
-                &format!("{}", threshold.as_secs()),
-            ])
-            .unwrap()
-            .observe(elapsed.as_secs_f64());

        ControlFlow::Continue(start + period)
    }
@@ -325,7 +315,7 @@ impl Timeline {
        &self,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> {
+    ) -> ControlFlow<(), BackgroundLoopSemaphorePermit> {
        let acquire_permit =
            crate::tenant::tasks::acquire_concurrency_permit(BackgroundLoopKind::Eviction, ctx);

@@ -367,7 +357,7 @@ impl Timeline {
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
        gate: &GateGuard,
-        permit: BackgroundLoopSemaphorePermit<'static>,
+        permit: BackgroundLoopSemaphorePermit,
        ctx: &RequestContext,
    ) -> ControlFlow<()> {
        if !self.tenant_shard_id.is_shard_zero() {
--- a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs
+++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs
@@ -59,11 +59,7 @@ impl HeatmapLayersDownloader {
                    return;
                };

-                tracing::info!(
-                    resident_size=%timeline.resident_physical_size(),
-                    heatmap_layers=%heatmap.all_layers().count(),
-                    "Starting heatmap layers download"
-                );
+                

                let stream = futures::stream::iter(heatmap.all_layers().cloned().filter_map(
                    |layer| {
@@ -93,7 +89,7 @@ impl HeatmapLayersDownloader {
                tokio::select! {
                    _ = stream.collect::<()>() => {
                        tracing::info!(
-                            resident_size=%timeline.resident_physical_size(),
+                            
                            "Heatmap layers download completed"
                        );
                    },
--- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
@@ -738,6 +738,8 @@ impl ChunkProcessingJob {
            self.timeline.tenant_shard_id,
            &self.range,
            self.pgdata_lsn,
+            &self.timeline.gate,
+            self.timeline.cancel.clone(),
            ctx,
        )
        .await?;
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -290,7 +290,7 @@ impl OpenLayerManager {
        lsn: Lsn,
        last_freeze_at: &AtomicLsn,
        write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
-        metrics: &TimelineMetrics,
+        _metrics: &TimelineMetrics,
    ) -> bool {
        let Lsn(last_record_lsn) = lsn;
        let end_lsn = Lsn(last_record_lsn + 1);
@@ -299,10 +299,6 @@ impl OpenLayerManager {
            let open_layer_rc = Arc::clone(open_layer);
            open_layer.freeze(end_lsn).await;

-            // Increment the frozen layer metrics. This is decremented in `finish_flush_l0_layer()`.
-            // TODO: It would be nicer to do this via `InMemoryLayer::drop()`, but it requires a
-            // reference to the timeline metrics. Other methods use a metrics borrow as well.
-            metrics.inc_frozen_layer(open_layer);

            // The layer is no longer open, update the layer map to reflect this.
            // We will replace it with on-disk historics below.
@@ -334,16 +330,12 @@ impl OpenLayerManager {
    pub(crate) fn track_new_image_layers(
        &mut self,
        image_layers: &[ResidentLayer],
-        metrics: &TimelineMetrics,
+        _metrics: &TimelineMetrics,
    ) {
        let mut updates = self.layer_map.batch_update();
        for layer in image_layers {
            Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);

-            // record these here instead of Layer::finish_creating because otherwise partial
-            // failure with create_image_layers would balloon up the physical size gauge. downside
-            // is that all layers need to be created before metrics are updated.
-            metrics.record_new_file_metrics(layer.layer_desc().file_size);
        }
        updates.flush();
    }
@@ -353,14 +345,13 @@ impl OpenLayerManager {
        &mut self,
        delta_layer: Option<&ResidentLayer>,
        frozen_layer_for_check: &Arc<InMemoryLayer>,
-        metrics: &TimelineMetrics,
+        _metrics: &TimelineMetrics,
    ) {
        let inmem = self
            .layer_map
            .frozen_layers
            .pop_front()
            .expect("there must be a inmem layer to flush");
-        metrics.dec_frozen_layer(&inmem);

        // Only one task may call this function at a time (for this
        // timeline). If two tasks tried to flush the same frozen
@@ -370,7 +361,6 @@ impl OpenLayerManager {
        if let Some(l) = delta_layer {
            let mut updates = self.layer_map.batch_update();
            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
-            metrics.record_new_file_metrics(l.layer_desc().file_size);
            updates.flush();
        }
    }
@@ -380,12 +370,11 @@ impl OpenLayerManager {
        &mut self,
        compact_from: &[Layer],
        compact_to: &[ResidentLayer],
-        metrics: &TimelineMetrics,
+        _metrics: &TimelineMetrics,
    ) {
        let mut updates = self.layer_map.batch_update();
        for l in compact_to {
            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
-            metrics.record_new_file_metrics(l.layer_desc().file_size);
        }
        for l in compact_from {
            Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
@@ -438,7 +427,7 @@ impl OpenLayerManager {
        rewrite_layers: &[(Layer, ResidentLayer)],
        drop_layers: &[Layer],
        add_layers: &[ResidentLayer],
-        metrics: &TimelineMetrics,
+        _metrics: &TimelineMetrics,
    ) {
        let mut updates = self.layer_map.batch_update();
        for (old_layer, new_layer) in rewrite_layers {
@@ -469,14 +458,12 @@ impl OpenLayerManager {
                &mut self.layer_fmgr,
            );

-            metrics.record_new_file_metrics(new_layer.layer_desc().file_size);
        }
        for l in drop_layers {
            Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
        }
        for l in add_layers {
            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
-            metrics.record_new_file_metrics(l.layer_desc().file_size);
        }
        updates.flush();
    }
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -23,7 +23,6 @@ pub(super) struct LogicalSize {
    /// the initial size at a different LSN.
    pub initial_logical_size: OnceCell<(
        u64,
-        crate::metrics::initial_logical_size::FinishedCalculationGuard,
    )>,

    /// Cancellation for the best-effort logical size calculation.
@@ -130,11 +129,7 @@ impl CurrentLogicalSize {
 impl LogicalSize {
    pub(super) fn empty_initial() -> Self {
        Self {
-            initial_logical_size: OnceCell::with_value((0, {
-                crate::metrics::initial_logical_size::START_CALCULATION
-                    .first(crate::metrics::initial_logical_size::StartCircumstances::EmptyInitial)
-                    .calculation_result_saved()
-            })),
+            initial_logical_size: OnceCell::with_value((0,)),
            cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
            initial_part_end: None,
            size_added_after_initial: AtomicI64::new(0),
@@ -159,7 +154,7 @@ impl LogicalSize {
        //                  ^^^ keep this type explicit so that the casts in this function break if
        //                  we change the type.
        match self.initial_logical_size.get() {
-            Some((initial_size, _)) => {
+            Some((initial_size, )) => {
                CurrentLogicalSize::Exact(Exact(initial_size.checked_add_signed(size_increment)
                    .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
                    .unwrap()))
@@ -181,7 +176,7 @@ impl LogicalSize {
    /// available for re-use. This doesn't contain the incremental part.
    pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
        match self.initial_part_end {
-            Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, _)| *s),
+            Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, )| *s),
            _ => None,
        }
    }
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -39,10 +39,6 @@ use utils::postgres_client::{
 use super::walreceiver_connection::{WalConnectionStatus, WalReceiverError};
 use super::{TaskEvent, TaskHandle, TaskStateUpdate, WalReceiverConf};
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::metrics::{
-    WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
-    WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
-};
 use crate::task_mgr::TaskKind;
 use crate::tenant::{Timeline, debug_assert_current_span_has_tenant_and_timeline_id};

@@ -76,11 +72,6 @@ pub(super) async fn connection_manager_loop_step(
        }
    }

-    WALRECEIVER_ACTIVE_MANAGERS.inc();
-    scopeguard::defer! {
-        WALRECEIVER_ACTIVE_MANAGERS.dec();
-    }
-
    let id = TenantTimelineId {
        tenant_id: connection_manager_state.timeline.tenant_shard_id.tenant_id,
        timeline_id: connection_manager_state.timeline.timeline_id,
@@ -526,9 +517,6 @@ impl ConnectionManagerState {

    /// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
    async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
-        WALRECEIVER_SWITCHES
-            .with_label_values(&[new_sk.reason.name()])
-            .inc();

        self.drop_old_connection(true).await;

@@ -580,6 +568,7 @@ impl ConnectionManagerState {
                                );
                                Ok(())
                            }
+                            WalReceiverError::Cancelled => Ok(()),
                            WalReceiverError::Other(e) => {
                                // give out an error to have task_mgr give it a really verbose logging
                                if cancellation.is_cancelled() {
@@ -730,8 +719,6 @@ impl ConnectionManagerState {
            }
        };

-        WALRECEIVER_BROKER_UPDATES.inc();
-
        trace!(
            "safekeeper info update: standby_horizon(cutoff)={}",
            timeline_update.standby_horizon
@@ -741,10 +728,6 @@ impl ConnectionManagerState {
            self.timeline
                .standby_horizon
                .store(Lsn(timeline_update.standby_horizon));
-            self.timeline
-                .metrics
-                .standby_horizon_gauge
-                .set(timeline_update.standby_horizon as i64);
        }

        let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
@@ -762,7 +745,6 @@ impl ConnectionManagerState {
                %new_safekeeper_id,
                "New SK node was added",
            );
-            WALRECEIVER_CANDIDATES_ADDED.inc();
        }
    }

@@ -1050,7 +1032,6 @@ impl ConnectionManagerState {
                    "Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections"
                );
                self.wal_connection_retries.remove(&node_id);
-                WALRECEIVER_CANDIDATES_REMOVED.inc();
            }
        }
    }
@@ -1077,6 +1058,7 @@ struct NewWalConnectionCandidate {
    safekeeper_id: NodeId,
    wal_source_connconf: PgConnectionConfig,
    availability_zone: Option<String>,
+    #[allow(dead_code)]
    reason: ReconnectReason,
 }

@@ -1105,18 +1087,6 @@ enum ReconnectReason {
    },
 }

-impl ReconnectReason {
-    fn name(&self) -> &str {
-        match self {
-            ReconnectReason::NoExistingConnection => "NoExistingConnection",
-            ReconnectReason::LaggingWal { .. } => "LaggingWal",
-            ReconnectReason::SwitchAvailabilityZone => "SwitchAvailabilityZone",
-            ReconnectReason::NoWalTimeout { .. } => "NoWalTimeout",
-            ReconnectReason::NoKeepAlives { .. } => "NoKeepAlives",
-        }
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL;
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -36,7 +36,6 @@ use wal_decoder::wire_format::FromWireFormat;

 use super::TaskStateUpdate;
 use crate::context::RequestContext;
-use crate::metrics::{LIVE_CONNECTIONS, WAL_INGEST, WALRECEIVER_STARTED_CONNECTIONS};
 use crate::pgdatadir_mapping::DatadirModification;
 use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME};
 use crate::tenant::{
@@ -73,6 +72,7 @@ pub(super) enum WalReceiverError {
    /// Generic error
    Other(anyhow::Error),
    ClosedGate,
+    Cancelled,
 }

 impl From<tokio_postgres::Error> for WalReceiverError {
@@ -136,7 +136,7 @@ pub(super) async fn handle_walreceiver_connection(
        GateError::GateClosed => WalReceiverError::ClosedGate,
    })?;

-    WALRECEIVER_STARTED_CONNECTIONS.inc();
+  

    // Connect to the database in replication mode.
    info!("connecting to {wal_source_connconf:?}");
@@ -200,6 +200,9 @@ pub(super) async fn handle_walreceiver_connection(
                                // with a similar error.
                            },
                            WalReceiverError::SuccessfulCompletion(_) => {}
+                            WalReceiverError::Cancelled => {
+                                debug!("Connection cancelled")
+                            }
                            WalReceiverError::ClosedGate => {
                                // doesn't happen at runtime
                            }
@@ -219,10 +222,6 @@ pub(super) async fn handle_walreceiver_connection(
        .instrument(tracing::info_span!("poller")),
    );

-    let _guard = LIVE_CONNECTIONS
-        .with_label_values(&["wal_receiver"])
-        .guard();
-
    let identify = identify_system(&replication_client).await?;
    info!("{identify:?}");

@@ -273,7 +272,12 @@ pub(super) async fn handle_walreceiver_connection(

    let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);

-    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;
+    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
+        .await
+        .map_err(|e| match e.kind {
+            crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
+            _ => WalReceiverError::Other(e.into()),
+        })?;

    let shard = vec![*timeline.get_shard_identity()];

@@ -335,7 +339,7 @@ pub(super) async fn handle_walreceiver_connection(

        let status_update = match replication_message {
            ReplicationMessage::RawInterpretedWalRecords(raw) => {
-                WAL_INGEST.bytes_received.inc_by(raw.data().len() as u64);
+               

                let mut uncommitted_records = 0;

@@ -408,21 +412,13 @@ pub(super) async fn handle_walreceiver_connection(
                    ctx: &RequestContext,
                    uncommitted: &mut u64,
                ) -> anyhow::Result<()> {
-                    let stats = modification.stats();
+                  
                    modification.commit(ctx).await?;
-                    WAL_INGEST.records_committed.inc_by(*uncommitted);
-                    WAL_INGEST.inc_values_committed(&stats);
+                  
                    *uncommitted = 0;
                    Ok(())
                }

-                if !records.is_empty() {
-                    timeline
-                        .metrics
-                        .wal_records_received
-                        .inc_by(records.len() as u64);
-                }
-
                for interpreted in records {
                    if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
                        && uncommitted_records > 0
@@ -432,9 +428,6 @@ pub(super) async fn handle_walreceiver_connection(

                    let local_next_record_lsn = interpreted.next_record_lsn;

-                    if interpreted.is_observed() {
-                        WAL_INGEST.records_observed.inc();
-                    }

                    walingest
                        .ingest_record(interpreted, &mut modification, &ctx)
@@ -506,12 +499,9 @@ pub(super) async fn handle_walreceiver_connection(
                    filtered: &mut u64,
                    ctx: &RequestContext,
                ) -> anyhow::Result<()> {
-                    let stats = modification.stats();
+                   
                    modification.commit(ctx).await?;
-                    WAL_INGEST
-                        .records_committed
-                        .inc_by(*uncommitted - *filtered);
-                    WAL_INGEST.inc_values_committed(&stats);
+                   
                    *uncommitted = 0;
                    *filtered = 0;
                    Ok(())
@@ -525,7 +515,7 @@ pub(super) async fn handle_walreceiver_connection(

                trace!("received XLogData between {startlsn} and {endlsn}");

-                WAL_INGEST.bytes_received.inc_by(data.len() as u64);
+            
                waldecoder.feed_bytes(data);

                {
@@ -567,7 +557,6 @@ pub(super) async fn handle_walreceiver_connection(
                        }

                        // Ingest the records without immediately committing them.
-                        timeline.metrics.wal_records_received.inc();
                        let ingested = walingest
                            .ingest_record(interpreted, &mut modification, &ctx)
                            .await
@@ -583,7 +572,7 @@ pub(super) async fn handle_walreceiver_connection(
                            })?;
                        if !ingested {
                            tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}");
-                            WAL_INGEST.records_filtered.inc();
+                          
                            filtered_records += 1;
                        }

--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -26,7 +26,7 @@ use utils::lsn::Lsn;
 use utils::vec_map::VecMap;

 use crate::context::RequestContext;
-use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
+use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, Header};
 use crate::virtual_file::{self, IoBufferMut, VirtualFile};

 /// Metadata bundled with the start and end offset of a blob.
@@ -111,18 +111,20 @@ impl From<Bytes> for BufView<'_> {
 pub struct VectoredBlob {
    /// Blob metadata.
    pub meta: BlobMeta,
-    /// Start offset.
-    start: usize,
+    /// Header start offset.
+    header_start: usize,
+    /// Data start offset.
+    data_start: usize,
    /// End offset.
    end: usize,
-    /// Compression used on the the blob.
+    /// Compression used on the data, extracted from the header.
    compression_bits: u8,
 }

 impl VectoredBlob {
    /// Reads a decompressed view of the blob.
    pub(crate) async fn read<'a>(&self, buf: &BufView<'a>) -> Result<BufView<'a>, std::io::Error> {
-        let view = buf.view(self.start..self.end);
+        let view = buf.view(self.data_start..self.end);

        match self.compression_bits {
            BYTE_UNCOMPRESSED => Ok(view),
@@ -140,13 +142,19 @@ impl VectoredBlob {
                    std::io::ErrorKind::InvalidData,
                    format!(
                        "Failed to decompress blob for {}@{}, {}..{}: invalid compression byte {bits:x}",
-                        self.meta.key, self.meta.lsn, self.start, self.end
+                        self.meta.key, self.meta.lsn, self.data_start, self.end
                    ),
                );
                Err(error)
            }
        }
    }
+
+    /// Returns the raw blob including header.
+    #[allow(unused)]
+    pub(crate) fn raw_with_header<'a>(&self, buf: &BufView<'a>) -> BufView<'a> {
+        buf.view(self.header_start..self.end)
+    }
 }

 impl std::fmt::Display for VectoredBlob {
@@ -154,7 +162,7 @@ impl std::fmt::Display for VectoredBlob {
        write!(
            f,
            "{}@{}, {}..{}",
-            self.meta.key, self.meta.lsn, self.start, self.end
+            self.meta.key, self.meta.lsn, self.data_start, self.end
        )
    }
 }
@@ -493,50 +501,28 @@ impl<'a> VectoredBlobReader<'a> {

        let blobs_at = read.blobs_at.as_slice();

-        let start_offset = read.start;
-
-        let mut metas = Vec::with_capacity(blobs_at.len());
+        let mut blobs = Vec::with_capacity(blobs_at.len());
        // Blobs in `read` only provide their starting offset. The end offset
        // of a blob is implicit: the start of the next blob if one exists
        // or the end of the read.

-        for (blob_start, meta) in blobs_at {
-            let blob_start_in_buf = blob_start - start_offset;
-            let first_len_byte = buf[blob_start_in_buf as usize];
+        for (blob_start, meta) in blobs_at.iter().copied() {
+            let header_start = (blob_start - read.start) as usize;
+            let header = Header::decode(&buf[header_start..])?;
+            let data_start = header_start + header.header_len;
+            let end = data_start + header.data_len;
+            let compression_bits = header.compression_bits;

-            // Each blob is prefixed by a header containing its size and compression information.
-            // Extract the size and skip that header to find the start of the data.
-            // The size can be 1 or 4 bytes. The most significant bit is 0 in the
-            // 1 byte case and 1 in the 4 byte case.
-            let (size_length, blob_size, compression_bits) = if first_len_byte < 0x80 {
-                (1, first_len_byte as u64, BYTE_UNCOMPRESSED)
-            } else {
-                let mut blob_size_buf = [0u8; 4];
-                let offset_in_buf = blob_start_in_buf as usize;
-
-                blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]);
-                blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK;
-
-                let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK;
-                (
-                    4,
-                    u32::from_be_bytes(blob_size_buf) as u64,
-                    compression_bits,
-                )
-            };
-
-            let start = (blob_start_in_buf + size_length) as usize;
-            let end = start + blob_size as usize;
-
-            metas.push(VectoredBlob {
-                start,
+            blobs.push(VectoredBlob {
+                header_start,
+                data_start,
                end,
-                meta: *meta,
+                meta,
                compression_bits,
            });
        }

-        Ok(VectoredBlobsBuf { buf, blobs: metas })
+        Ok(VectoredBlobsBuf { buf, blobs })
    }
 }

@@ -997,6 +983,15 @@ mod tests {
                &read_buf[..],
                "mismatch for idx={idx} at offset={offset}"
            );
+
+            // Check that raw_with_header returns a valid header.
+            let raw = read_blob.raw_with_header(&view);
+            let header = Header::decode(&raw)?;
+            if !compression || header.header_len == 1 {
+                assert_eq!(header.compression_bits, BYTE_UNCOMPRESSED);
+            }
+            assert_eq!(raw.len(), header.total_len());
+
            buf = result.buf;
        }
        Ok(())
--- a/pageserver/src/utilization.rs
+++ b/pageserver/src/utilization.rs
@@ -10,7 +10,7 @@ use pageserver_api::models::PageserverUtilization;
 use utils::serde_percent::Percent;

 use crate::config::PageServerConf;
-use crate::metrics::NODE_UTILIZATION_SCORE;
+
 use crate::tenant::mgr::TenantManager;

 pub(crate) fn regenerate(
@@ -53,7 +53,7 @@ pub(crate) fn regenerate(
    // Express a static value for how many shards we may schedule on one node
    const MAX_SHARDS: u32 = 5000;

-    let mut doc = PageserverUtilization {
+    let doc = PageserverUtilization {
        disk_usage_bytes: used,
        free_space_bytes: free,
        disk_wanted_bytes,
@@ -63,10 +63,7 @@ pub(crate) fn regenerate(
        utilization_score: None,
        captured_at: utils::serde_system_time::SystemTime(captured_at),
    };
-
-    // Initialize `PageserverUtilization::utilization_score`
-    let score = doc.cached_score();
-    NODE_UTILIZATION_SCORE.set(score);
+    

    Ok(doc)
 }
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -27,12 +27,9 @@ use owned_buffers_io::io_buf_ext::FullSlice;
 use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 pub use pageserver_api::models::virtual_file as api;
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
-use tokio::time::Instant;
 use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};

-use crate::assert_u64_eq_usize::UsizeIsU64;
 use crate::context::RequestContext;
-use crate::metrics::{STORAGE_IO_TIME_METRIC, StorageIoOperation};
 use crate::page_cache::{PAGE_SZ, PageWriteGuard};
 pub(crate) mod io_engine;
 pub use io_engine::{
@@ -431,9 +428,7 @@ impl OpenFiles {
        if let Some(old_file) = slot_guard.file.take() {
            // the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
            // distinguish the two.
-            STORAGE_IO_TIME_METRIC
-                .get(StorageIoOperation::CloseByReplace)
-                .observe_closure_duration(|| drop(old_file));
+            drop(old_file);
        }

        // Prepare the slot for reuse and return it
@@ -532,13 +527,9 @@ impl<T> MaybeFatalIo<T> for std::io::Result<T> {
 /// where "support" means that we measure wall clock time.
 macro_rules! observe_duration {
    ($op:expr, $($body:tt)*) => {{
-        let instant = Instant::now();
-        let result = $($body)*;
-        let elapsed = instant.elapsed().as_secs_f64();
-        STORAGE_IO_TIME_METRIC
-            .get($op)
-            .observe(elapsed);
-        result
+        
+        $($body)*
+        
    }}
 }

@@ -913,7 +904,7 @@ impl VirtualFileInner {
        &self,
        buf: tokio_epoll_uring::Slice<Buf>,
        offset: u64,
-        ctx: &RequestContext,
+        _ctx: &RequestContext,
    ) -> (tokio_epoll_uring::Slice<Buf>, Result<usize, Error>)
    where
        Buf: tokio_epoll_uring::IoBufMut + Send,
@@ -930,9 +921,7 @@ impl VirtualFileInner {
        observe_duration!(StorageIoOperation::Read, {
            let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
            let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at");
-            if let Ok(size) = res {
-                ctx.io_size_metrics().read.add(size.into_u64());
-            }
+            
            (buf, res)
        })
    }
@@ -953,7 +942,7 @@ impl VirtualFileInner {
        &self,
        buf: FullSlice<B>,
        offset: u64,
-        ctx: &RequestContext,
+        _ctx: &RequestContext,
    ) -> (FullSlice<B>, Result<usize, Error>) {
        let file_guard = match self.lock_file().await {
            Ok(file_guard) => file_guard,
@@ -962,9 +951,7 @@ impl VirtualFileInner {
        observe_duration!(StorageIoOperation::Write, {
            let ((_file_guard, buf), result) =
                io_engine::get().write_at(file_guard, offset, buf).await;
-            if let Ok(size) = result {
-                ctx.io_size_metrics().write.add(size.into_u64());
-            }
+
            (buf, result)
        })
    }
@@ -1263,9 +1250,7 @@ impl Drop for VirtualFileInner {
                // there is also operation "close-by-replace" for closes done on eviction for
                // comparison.
                if let Some(fd) = slot_guard.file.take() {
-                    STORAGE_IO_TIME_METRIC
-                        .get(StorageIoOperation::Close)
-                        .observe_closure_duration(|| drop(fd));
+                    drop(fd);
                }
            }
        }
@@ -1334,7 +1319,6 @@ pub fn init(num_slots: usize, engine: IoEngineKind, mode: IoMode, sync_mode: Syn
    set_io_mode(mode);
    io_engine::init(engine);
    SYNC_MODE.store(sync_mode as u8, std::sync::atomic::Ordering::Relaxed);
-    crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
 }

 const TEST_MAX_FILE_DESCRIPTORS: usize = 10;
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -54,14 +54,7 @@ static IO_ENGINE: AtomicU8 = AtomicU8::new(IoEngine::NotSet as u8);
 pub(crate) fn set(engine_kind: IoEngineKind) {
    let engine: IoEngine = engine_kind.into();
    IO_ENGINE.store(engine as u8, std::sync::atomic::Ordering::Relaxed);
-    #[cfg(not(test))]
-    {
-        let metric = &crate::metrics::virtual_file_io_engine::KIND;
-        metric.reset();
-        metric
-            .with_label_values(&[&format!("{engine_kind}")])
-            .set(1);
-    }
+    
 }

 #[cfg(not(test))]
--- a/Show More
+++ b/Show More