Merge pull request #5638 from neondatabase/releases/2023-10-24

Release 2023-10-24
Merge pull request #5610 from neondatabase/sharnoff/rc-2023-10-20-vm-monitor-fixes
2026-01-30 16:50:37 +00:00 · 2023-10-24 12:10:52 +03:00 · 2023-10-20 00:11:06 -07:00 · 2023-10-19 21:56:55 -07:00 · 2023-10-19 21:56:50 -07:00 · 2023-10-19 21:56:36 -07:00
176 changed files with 6294 additions and 7841 deletions
--- a/.github/ISSUE_TEMPLATE/epic-template.md
+++ b/.github/ISSUE_TEMPLATE/epic-template.md
@@ -17,9 +17,9 @@ assignees: ''
 ## Implementation ideas


-```[tasklist]
 ## Tasks
-```
+- [ ]
+

 ## Other related tasks and Epics
 - 
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -5,6 +5,4 @@ self-hosted-runner:
    - small
    - us-east-2
 config-variables:
-  - REMOTE_STORAGE_AZURE_CONTAINER
-  - REMOTE_STORAGE_AZURE_REGION
  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -203,10 +203,6 @@ runs:
        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
      run: |
-        if [ ! -d "${WORKDIR}/report/data/test-cases" ]; then
-          exit 0
-        fi
-
        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}

        ./scripts/pysync
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -338,16 +338,6 @@ jobs:
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3

-          # Run separate tests for real Azure Blob Storage
-          # XXX: replace region with `eu-central-1`-like region
-          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
-          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
-          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
-          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
-          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
-          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
-
      - name: Install rust binaries
        run: |
          # Install target binaries
@@ -433,7 +423,7 @@ jobs:
          rerun_flaky: true
          pg_version: ${{ matrix.pg_version }}
        env:
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty

      - name: Merge and upload coverage data
@@ -468,7 +458,7 @@ jobs:
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
+          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -847,7 +837,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.18.5
+      VM_BUILDER_VERSION: v0.18.1

    steps:
      - name: Checkout
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -2,7 +2,7 @@ name: Create Release Branch

 on:
  schedule:
-    - cron: '0 7 * * 5'
+    - cron: '0 7 * * 2'
  workflow_dispatch:

 jobs:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1609,6 +1609,16 @@ dependencies = [
 "subtle",
 ]

+[[package]]
+name = "ctor"
+version = "0.1.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d2301688392eb071b0bf1a37be05c469d3cc4dbbd95df672fe28ab021e6a096"
+dependencies = [
+ "quote",
+ "syn 1.0.109",
+]
+
 [[package]]
 name = "ctr"
 version = "0.6.0"
@@ -2704,10 +2714,11 @@ dependencies = [

 [[package]]
 name = "log"
-version = "0.4.20"
+version = "0.4.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
+checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
 dependencies = [
+ "cfg-if",
 "value-bag",
 ]

@@ -3550,7 +3561,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -3563,7 +3574,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "native-tls",
 "tokio",
@@ -3574,7 +3585,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -3592,7 +3603,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -4419,7 +4430,6 @@ dependencies = [
 "itertools",
 "pageserver",
 "rand 0.8.5",
- "remote_storage",
 "reqwest",
 "serde",
 "serde_json",
@@ -4478,7 +4488,6 @@ dependencies = [
 "tokio",
 "tokio-io-timeout",
 "tokio-postgres",
- "tokio-stream",
 "toml_edit",
 "tracing",
 "url",
@@ -4681,16 +4690,6 @@ dependencies = [
 "serde_derive",
 ]

-[[package]]
-name = "serde_assert"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eda563240c1288b044209be1f0d38bb4d15044fb3e00dc354fbc922ab4733e80"
-dependencies = [
- "hashbrown 0.13.2",
- "serde",
-]
-
 [[package]]
 name = "serde_derive"
 version = "1.0.183"
@@ -5408,7 +5407,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -5977,7 +5976,6 @@ dependencies = [
 "routerify",
 "sentry",
 "serde",
- "serde_assert",
 "serde_json",
 "serde_with",
 "signal-hook",
@@ -6013,9 +6011,13 @@ checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"

 [[package]]
 name = "value-bag"
-version = "1.4.2"
+version = "1.0.0-alpha.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4a72e1902dde2bd6441347de2b70b7f5d59bf157c6c62f0c44572607a1d55bbe"
+checksum = "2209b78d1249f7e6f3293657c9779fe31ced465df091bbd433a1cf88e916ec55"
+dependencies = [
+ "ctor",
+ "version_check",
+]

 [[package]]
 name = "vcpkg"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -124,7 +124,6 @@ sentry = { version = "0.31", default-features = false, features = ["backtrace",
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_with = "2.0"
-serde_assert = "0.5.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
 smallvec = "1.11"
@@ -162,11 +161,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -203,7 +202,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }

 ################# Binary contents sections

--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -156,7 +156,6 @@ fn main() -> Result<()> {
                let path = Path::new(sp);
                let file = File::open(path)?;
                spec = Some(serde_json::from_reader(file)?);
-                live_config_allowed = true;
            } else if let Some(id) = compute_id {
                if let Some(cp_base) = control_plane_uri {
                    live_config_allowed = true;
@@ -278,26 +277,32 @@ fn main() -> Result<()> {
        if #[cfg(target_os = "linux")] {
            use std::env;
            use tokio_util::sync::CancellationToken;
-            let vm_monitor_addr = matches
-                .get_one::<String>("vm-monitor-addr")
-                .expect("--vm-monitor-addr should always be set because it has a default arg");
+            use tracing::warn;
+            let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
            let cgroup = matches.get_one::<String>("cgroup");
+            let file_cache_on_disk = matches.get_flag("file-cache-on-disk");

            // Only make a runtime if we need to.
            // Note: it seems like you can make a runtime in an inner scope and
            // if you start a task in it it won't be dropped. However, make it
            // in the outermost scope just to be safe.
-            let rt = if env::var_os("AUTOSCALING").is_some() {
-                Some(
+            let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
+                (None, None) => None,
+                (None, Some(_)) => {
+                    warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
+                    None
+                }
+                (Some(_), None) => {
+                    panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
+                }
+                (Some(_), Some(_)) => Some(
                    tokio::runtime::Builder::new_multi_thread()
                        .worker_threads(4)
                        .enable_all()
                        .build()
-                        .expect("failed to create tokio runtime for monitor")
-                )
-            } else {
-                None
+                        .expect("failed to create tokio runtime for monitor"),
+                ),
            };

            // This token is used internally by the monitor to clean up all threads
@@ -308,7 +313,8 @@ fn main() -> Result<()> {
                    Box::leak(Box::new(vm_monitor::Args {
                        cgroup: cgroup.cloned(),
                        pgconnstr: file_cache_connstr.cloned(),
-                        addr: vm_monitor_addr.clone(),
+                        addr: vm_monitor_addr.cloned().unwrap(),
+                        file_cache_on_disk,
                    })),
                    token.clone(),
                ))
@@ -480,8 +486,6 @@ fn cli() -> clap::Command {
                .value_name("FILECACHE_CONNSTR"),
        )
        .arg(
-            // DEPRECATED, NO LONGER DOES ANYTHING.
-            // See https://github.com/neondatabase/cloud/issues/7516
            Arg::new("file-cache-on-disk")
                .long("file-cache-on-disk")
                .action(clap::ArgAction::SetTrue),
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -193,16 +193,11 @@ impl Escaping for PgIdent {
 /// Build a list of existing Postgres roles
 pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
    let postgres_roles = xact
-        .query(
-            "SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
-            &[],
-        )?
+        .query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
        .iter()
        .map(|row| Role {
            name: row.get("rolname"),
            encrypted_password: row.get("rolpassword"),
-            replication: Some(row.get("rolreplication")),
-            bypassrls: Some(row.get("rolbypassrls")),
            options: None,
        })
        .collect();
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -24,7 +24,7 @@ fn do_control_plane_request(
 ) -> Result<ControlPlaneSpecResponse, (bool, String)> {
    let resp = reqwest::blocking::Client::new()
        .get(uri)
-        .header("Authorization", format!("Bearer {}", jwt))
+        .header("Authorization", jwt)
        .send()
        .map_err(|e| {
            (
@@ -68,7 +68,7 @@ pub fn get_spec_from_control_plane(
    base_uri: &str,
    compute_id: &str,
 ) -> Result<Option<ComputeSpec>> {
-    let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
+    let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
    let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
        Ok(v) => v,
        Err(_) => "".to_string(),
@@ -265,8 +265,6 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
        let action = if let Some(r) = pg_role {
            if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
                || (r.encrypted_password.is_some() && role.encrypted_password.is_none())
-                || !r.bypassrls.unwrap_or(false)
-                || !r.replication.unwrap_or(false)
            {
                RoleAction::Update
            } else if let Some(pg_pwd) = &r.encrypted_password {
@@ -298,8 +296,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
        match action {
            RoleAction::None => {}
            RoleAction::Update => {
-                let mut query: String =
-                    format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
+                let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
                query.push_str(&role.to_pg_options());
                xact.execute(query.as_str(), &[])?;
            }
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -2,6 +2,7 @@ use crate::{background_process, local_env::LocalEnv};
 use anyhow::anyhow;
 use camino::Utf8PathBuf;
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use std::{path::PathBuf, process::Child};
 use utils::id::{NodeId, TenantId};

@@ -13,10 +14,12 @@ pub struct AttachmentService {

 const COMMAND: &str = "attachment_service";

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
+    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
-    pub node_id: Option<NodeId>,
+    pub pageserver_id: Option<NodeId>,
 }

 #[derive(Serialize, Deserialize)]
@@ -82,7 +85,7 @@ impl AttachmentService {
            .control_plane_api
            .clone()
            .unwrap()
-            .join("attach-hook")
+            .join("attach_hook")
            .unwrap();
        let client = reqwest::blocking::ClientBuilder::new()
            .build()
@@ -90,7 +93,7 @@ impl AttachmentService {

        let request = AttachHookRequest {
            tenant_id,
-            node_id: Some(pageserver_id),
+            pageserver_id: Some(pageserver_id),
        };

        let response = client.post(url).json(&request).send()?;
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -12,7 +12,6 @@ use hyper::{Body, Request, Response};
 use serde::{Deserialize, Serialize};
 use std::path::{Path, PathBuf};
 use std::{collections::HashMap, sync::Arc};
-use utils::http::endpoint::request_span;
 use utils::logging::{self, LogFormat};
 use utils::signals::{ShutdownSignals, Signal};

@@ -172,7 +171,7 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
            state.generation += 1;
            response.tenants.push(ReAttachResponseTenant {
                id: *t,
-                gen: state.generation,
+                generation: state.generation,
            });
        }
    }
@@ -218,31 +217,14 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
        .tenants
        .entry(attach_req.tenant_id)
        .or_insert_with(|| TenantState {
-            pageserver: attach_req.node_id,
+            pageserver: attach_req.pageserver_id,
            generation: 0,
        });

-    if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
+    if attach_req.pageserver_id.is_some() {
        tenant_state.generation += 1;
-        tracing::info!(
-            tenant_id = %attach_req.tenant_id,
-            ps_id = %attaching_pageserver,
-            generation = %tenant_state.generation,
-            "issuing",
-        );
-    } else if let Some(ps_id) = tenant_state.pageserver {
-        tracing::info!(
-            tenant_id = %attach_req.tenant_id,
-            %ps_id,
-            generation = %tenant_state.generation,
-            "dropping",
-        );
-    } else {
-        tracing::info!(
-            tenant_id = %attach_req.tenant_id,
-            "no-op: tenant already has no pageserver");
    }
-    tenant_state.pageserver = attach_req.node_id;
+    tenant_state.pageserver = attach_req.pageserver_id;
    let generation = tenant_state.generation;

    locked.save().await.map_err(ApiError::InternalServerError)?;
@@ -250,7 +232,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
    json_response(
        StatusCode::OK,
        AttachHookResponse {
-            gen: attach_req.node_id.map(|_| generation),
+            gen: attach_req.pageserver_id.map(|_| generation),
        },
    )
 }
@@ -258,9 +240,9 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
 fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
    endpoint::make_router()
        .data(Arc::new(State::new(persistent_state)))
-        .post("/re-attach", |r| request_span(r, handle_re_attach))
-        .post("/validate", |r| request_span(r, handle_validate))
-        .post("/attach-hook", |r| request_span(r, handle_attach_hook))
+        .post("/re-attach", handle_re_attach)
+        .post("/validate", handle_validate)
+        .post("/attach_hook", handle_attach_hook)
 }

 #[tokio::main]
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -798,24 +798,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                ep.start(&auth_token, safekeepers, remote_ext_config)?;
            }
        }
-        "reconfigure" => {
-            let endpoint_id = sub_args
-                .get_one::<String>("endpoint_id")
-                .ok_or_else(|| anyhow!("No endpoint ID provided to reconfigure"))?;
-            let endpoint = cplane
-                .endpoints
-                .get(endpoint_id.as_str())
-                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
-            let pageserver_id =
-                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
-                    Some(NodeId(
-                        id_str.parse().context("while parsing pageserver id")?,
-                    ))
-                } else {
-                    None
-                };
-            endpoint.reconfigure(pageserver_id)?;
-        }
        "stop" => {
            let endpoint_id = sub_args
                .get_one::<String>("endpoint_id")
@@ -1387,12 +1369,6 @@ fn cli() -> Command {
                    .arg(safekeepers_arg)
                    .arg(remote_ext_config_args)
                )
-                .subcommand(Command::new("reconfigure")
-                            .about("Reconfigure the endpoint")
-                            .arg(endpoint_pageserver_id_arg)
-                            .arg(endpoint_id_arg.clone())
-                            .arg(tenant_id_arg.clone())
-                )
                .subcommand(
                    Command::new("stop")
                    .arg(endpoint_id_arg)
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -46,6 +46,7 @@ use std::time::Duration;

 use anyhow::{anyhow, bail, Context, Result};
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{NodeId, TenantId, TimelineId};

 use crate::local_env::LocalEnv;
@@ -56,10 +57,13 @@ use compute_api::responses::{ComputeState, ComputeStatus};
 use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};

 // contents of a endpoint.json file
+#[serde_as]
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct EndpointConf {
    endpoint_id: String,
+    #[serde_as(as = "DisplayFromStr")]
    tenant_id: TenantId,
+    #[serde_as(as = "DisplayFromStr")]
    timeline_id: TimelineId,
    mode: ComputeMode,
    pg_port: u16,
@@ -410,32 +414,16 @@ impl Endpoint {
            );
        }

-        Ok(())
-    }
-
-    fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
+        // Also wait for the compute_ctl process to die. It might have some cleanup
+        // work to do after postgres stops, like syncing safekeepers, etc.
+        //
        // TODO use background_process::stop_process instead
        let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
        let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
        let pid = nix::unistd::Pid::from_raw(pid as i32);
        crate::background_process::wait_until_stopped("compute_ctl", pid)?;
-        Ok(())
-    }

-    fn read_postgresql_conf(&self) -> Result<String> {
-        // Slurp the endpoints/<endpoint id>/postgresql.conf file into
-        // memory. We will include it in the spec file that we pass to
-        // `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
-        // in the data directory.
-        let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
-        match std::fs::read(&postgresql_conf_path) {
-            Ok(content) => Ok(String::from_utf8(content)?),
-            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok("".to_string()),
-            Err(e) => Err(anyhow::Error::new(e).context(format!(
-                "failed to read config file in {}",
-                postgresql_conf_path.to_str().unwrap()
-            ))),
-        }
+        Ok(())
    }

    pub fn start(
@@ -448,7 +436,21 @@ impl Endpoint {
            anyhow::bail!("The endpoint is already running");
        }

-        let postgresql_conf = self.read_postgresql_conf()?;
+        // Slurp the endpoints/<endpoint id>/postgresql.conf file into
+        // memory. We will include it in the spec file that we pass to
+        // `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
+        // in the data directory.
+        let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
+        let postgresql_conf = match std::fs::read(&postgresql_conf_path) {
+            Ok(content) => String::from_utf8(content)?,
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(),
+            Err(e) => {
+                return Err(anyhow::Error::new(e).context(format!(
+                    "failed to read config file in {}",
+                    postgresql_conf_path.to_str().unwrap()
+                )))
+            }
+        };

        // We always start the compute node from scratch, so if the Postgres
        // data dir exists from a previous launch, remove it first.
@@ -619,61 +621,6 @@ impl Endpoint {
        }
    }

-    pub fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
-        let mut spec: ComputeSpec = {
-            let spec_path = self.endpoint_path().join("spec.json");
-            let file = std::fs::File::open(spec_path)?;
-            serde_json::from_reader(file)?
-        };
-
-        let postgresql_conf = self.read_postgresql_conf()?;
-        spec.cluster.postgresql_conf = Some(postgresql_conf);
-
-        if let Some(pageserver_id) = pageserver_id {
-            let endpoint_config_path = self.endpoint_path().join("endpoint.json");
-            let mut endpoint_conf: EndpointConf = {
-                let file = std::fs::File::open(&endpoint_config_path)?;
-                serde_json::from_reader(file)?
-            };
-            endpoint_conf.pageserver_id = pageserver_id;
-            std::fs::write(
-                endpoint_config_path,
-                serde_json::to_string_pretty(&endpoint_conf)?,
-            )?;
-
-            let pageserver =
-                PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
-            let ps_http_conf = &pageserver.pg_connection_config;
-            let (host, port) = (ps_http_conf.host(), ps_http_conf.port());
-            spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
-        }
-
-        let client = reqwest::blocking::Client::new();
-        let response = client
-            .post(format!(
-                "http://{}:{}/configure",
-                self.http_address.ip(),
-                self.http_address.port()
-            ))
-            .body(format!(
-                "{{\"spec\":{}}}",
-                serde_json::to_string_pretty(&spec)?
-            ))
-            .send()?;
-
-        let status = response.status();
-        if !(status.is_client_error() || status.is_server_error()) {
-            Ok(())
-        } else {
-            let url = response.url().to_owned();
-            let msg = match response.text() {
-                Ok(err_body) => format!("Error: {}", err_body),
-                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
-            };
-            Err(anyhow::anyhow!(msg))
-        }
-    }
-
    pub fn stop(&self, destroy: bool) -> Result<()> {
        // If we are going to destroy data directory,
        // use immediate shutdown mode, otherwise,
@@ -682,25 +629,15 @@ impl Endpoint {
        // Postgres is always started from scratch, so stop
        // without destroy only used for testing and debugging.
        //
-        self.pg_ctl(
-            if destroy {
-                &["-m", "immediate", "stop"]
-            } else {
-                &["stop"]
-            },
-            &None,
-        )?;
-
-        // Also wait for the compute_ctl process to die. It might have some cleanup
-        // work to do after postgres stops, like syncing safekeepers, etc.
-        //
-        self.wait_for_compute_ctl_to_exit()?;
        if destroy {
+            self.pg_ctl(&["-m", "immediate", "stop"], &None)?;
            println!(
                "Destroying postgres data directory '{}'",
                self.pgdata().to_str().unwrap()
            );
            std::fs::remove_dir_all(self.endpoint_path())?;
+        } else {
+            self.pg_ctl(&["stop"], &None)?;
        }
        Ok(())
    }
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -8,6 +8,7 @@ use anyhow::{bail, ensure, Context};
 use postgres_backend::AuthType;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use std::collections::HashMap;
 use std::env;
 use std::fs;
@@ -32,6 +33,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
 // to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
 // an example.
 //
+#[serde_as]
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct LocalEnv {
    // Base directory for all the nodes (the pageserver, safekeepers and
@@ -57,6 +59,7 @@ pub struct LocalEnv {
    // Default tenant ID to use with the 'neon_local' command line utility, when
    // --tenant_id is not explicitly specified.
    #[serde(default)]
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub default_tenant_id: Option<TenantId>,

    // used to issue tokens during e.g pg start
@@ -81,6 +84,7 @@ pub struct LocalEnv {
    // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
    // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
    // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
+    #[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")]
    branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
 }

--- a/docs/updating-postgres.md
+++ b/docs/updating-postgres.md
@@ -1,108 +0,0 @@
-# Updating Postgres
-
-## Minor Versions
-
-When upgrading to a new minor version of Postgres, please follow these steps:
-
-_Example: 15.4 is the new minor version to upgrade to from 15.3._
-
-1. Clone the Neon Postgres repository if you have not done so already.
-
-    ```shell
-    git clone git@github.com:neondatabase/postgres.git
-    ```
-
-1. Add the Postgres upstream remote.
-
-    ```shell
-    git remote add upstream https://git.postgresql.org/git/postgresql.git
-    ```
-
-1. Create a new branch based on the stable branch you are updating.
-
-    ```shell
-    git checkout -b my-branch REL_15_STABLE_neon
-    ```
-
-1. Tag the last commit on the stable branch you are updating.
-
-    ```shell
-    git tag REL_15_3_neon
-    ```
-
-1. Push the new tag to the Neon Postgres repository.
-
-    ```shell
-    git push origin REL_15_3_neon
-    ```
-
-1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
-
-1. Rebase the branch you created on the tag and resolve any conflicts.
-
-    ```shell
-    git fetch upstream REL_15_4
-    git rebase REL_15_4
-    ```
-
-1. Run the Postgres test suite to make sure our commits have not affected
-Postgres in a negative way.
-
-    ```shell
-    make check
-    # OR
-    meson test -C builddir
-    ```
-
-1. Push your branch to the Neon Postgres repository.
-
-    ```shell
-    git push origin my-branch
-    ```
-
-1. Clone the Neon repository if you have not done so already.
-
-    ```shell
-    git clone git@github.com:neondatabase/neon.git
-    ```
-
-1. Create a new branch.
-
-1. Change the `revisions.json` file to point at the HEAD of your Postgres
-branch.
-
-1. Update the Git submodule.
-
-    ```shell
-    git submodule set-branch --branch my-branch vendor/postgres-v15
-    git submodule update --remote vendor/postgres-v15
-    ```
-
-1. Run the Neon test suite to make sure that Neon is still good to go on this
-minor Postgres release.
-
-    ```shell
-    ./scripts/poetry -k pg15
-    ```
-
-1. Commit your changes.
-
-1. Create a pull request, and wait for CI to go green.
-
-1. Force push the rebased Postgres branches into the Neon Postgres repository.
-
-    ```shell
-    git push --force origin my-branch:REL_15_STABLE_neon
-    ```
-
-    It may require disabling various branch protections.
-
-1. Update your Neon PR to point at the branches.
-
-    ```shell
-    git submodule set-branch --branch REL_15_STABLE_neon vendor/postgres-v15
-    git commit --amend --no-edit
-    git push --force origin
-    ```
-
-1. Merge the pull request after getting approval(s) and CI completion.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -6,6 +6,7 @@
 use std::collections::HashMap;

 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

@@ -18,6 +19,7 @@ pub type PgIdent = String;

 /// Cluster spec or configuration represented as an optional number of
 /// delta operations + final cluster state description.
+#[serde_as]
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct ComputeSpec {
    pub format_version: f32,
@@ -48,12 +50,12 @@ pub struct ComputeSpec {
    // these, and instead set the "neon.tenant_id", "neon.timeline_id",
    // etc. GUCs in cluster.settings. TODO: Once the control plane has been
    // updated to fill these fields, we can make these non optional.
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub tenant_id: Option<TenantId>,
-
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub timeline_id: Option<TimelineId>,
-
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub pageserver_connstring: Option<String>,
-
    #[serde(default)]
    pub safekeeper_connstrings: Vec<String>,

@@ -138,13 +140,14 @@ impl RemoteExtSpec {
    }
 }

+#[serde_as]
 #[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeMode {
    /// A read-write node
    #[default]
    Primary,
    /// A read-only node, pinned at a particular LSN
-    Static(Lsn),
+    Static(#[serde_as(as = "DisplayFromStr")] Lsn),
    /// A read-only node that follows the tip of the branch in hot standby mode
    ///
    /// Future versions may want to distinguish between replicas with hot standby
@@ -187,8 +190,6 @@ pub struct DeltaOp {
 pub struct Role {
    pub name: PgIdent,
    pub encrypted_password: Option<String>,
-    pub replication: Option<bool>,
-    pub bypassrls: Option<bool>,
    pub options: GenericOptions,
 }

--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -89,14 +89,14 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
    0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
 ];

-pub fn set_build_info_metric(revision: &str, build_tag: &str) {
+pub fn set_build_info_metric(revision: &str) {
    let metric = register_int_gauge_vec!(
        "libmetrics_build_info",
        "Build/version information",
-        &["revision", "build_tag"]
+        &["revision"]
    )
    .expect("Failed to register build info metric");
-    metric.with_label_values(&[revision, build_tag]).set(1);
+    metric.with_label_values(&[revision]).set(1);
 }

 // Records I/O stats in a "cross-platform" way.
--- a/libs/pageserver_api/src/control_api.rs
+++ b/libs/pageserver_api/src/control_api.rs
@@ -4,6 +4,7 @@
 //! See docs/rfcs/025-generation-numbers.md

 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{NodeId, TenantId};

 #[derive(Serialize, Deserialize)]
@@ -11,10 +12,12 @@ pub struct ReAttachRequest {
    pub node_id: NodeId,
 }

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachResponseTenant {
+    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
-    pub gen: u32,
+    pub generation: u32,
 }

 #[derive(Serialize, Deserialize)]
@@ -22,8 +25,10 @@ pub struct ReAttachResponse {
    pub tenants: Vec<ReAttachResponseTenant>,
 }

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct ValidateRequestTenant {
+    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    pub gen: u32,
 }
@@ -38,8 +43,10 @@ pub struct ValidateResponse {
    pub tenants: Vec<ValidateResponseTenant>,
 }

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct ValidateResponseTenant {
+    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    pub valid: bool,
 }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -6,7 +6,7 @@ use std::{

 use byteorder::{BigEndian, ReadBytesExt};
 use serde::{Deserialize, Serialize};
-use serde_with::serde_as;
+use serde_with::{serde_as, DisplayFromStr};
 use strum_macros;
 use utils::{
    completion,
@@ -110,6 +110,7 @@ impl TenantState {
            // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
            Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
            // tenant mgr startup distinguishes attaching from loading via marker file.
+            // If it's loading, there is no attach marker file, i.e., attach had finished in the past.
            Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
            // We only reach Active after successful load / attach.
            // So, call atttachment status Attached.
@@ -174,19 +175,25 @@ pub enum TimelineState {
    Broken { reason: String, backtrace: String },
 }

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
+    #[serde_as(as = "DisplayFromStr")]
    pub new_timeline_id: TimelineId,
    #[serde(default)]
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_timeline_id: Option<TimelineId>,
    #[serde(default)]
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_start_lsn: Option<Lsn>,
    pub pg_version: Option<u32>,
 }

+#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantCreateRequest {
+    #[serde_as(as = "DisplayFromStr")]
    pub new_tenant_id: TenantId,
    #[serde(default)]
    #[serde(skip_serializing_if = "Option::is_none")]
@@ -195,6 +202,7 @@ pub struct TenantCreateRequest {
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

+#[serde_as]
 #[derive(Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLoadRequest {
@@ -271,26 +279,31 @@ pub struct LocationConfig {
    pub tenant_conf: TenantConfig,
 }

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 #[serde(transparent)]
-pub struct TenantCreateResponse(pub TenantId);
+pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub TenantId);

 #[derive(Serialize)]
 pub struct StatusResponse {
    pub id: NodeId,
 }

+#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
+    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
    #[serde(flatten)]
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

+#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantConfigRequest {
+    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
    #[serde(flatten)]
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
@@ -362,8 +375,10 @@ pub enum TenantAttachmentStatus {
    Failed { reason: String },
 }

+#[serde_as]
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TenantInfo {
+    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
    pub state: TenantState,
@@ -374,22 +389,33 @@ pub struct TenantInfo {
 }

 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
+#[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
+    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
+    #[serde_as(as = "DisplayFromStr")]
    pub timeline_id: TimelineId,

+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_timeline_id: Option<TimelineId>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_lsn: Option<Lsn>,
+    #[serde_as(as = "DisplayFromStr")]
    pub last_record_lsn: Lsn,
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub prev_record_lsn: Option<Lsn>,
+    #[serde_as(as = "DisplayFromStr")]
    pub latest_gc_cutoff_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
    pub disk_consistent_lsn: Lsn,

    /// The LSN that we have succesfully uploaded to remote storage
+    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,

    /// The LSN that we are advertizing to safekeepers
+    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn_visible: Lsn,

    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
@@ -401,6 +427,7 @@ pub struct TimelineInfo {
    pub timeline_dir_layer_file_size_sum: Option<u64>,

    pub wal_source_connstr: Option<String>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub last_received_msg_lsn: Option<Lsn>,
    /// the timestamp (in microseconds) of the last received message
    pub last_received_msg_ts: Option<u128>,
@@ -497,13 +524,23 @@ pub struct LayerAccessStats {
    pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
 }

+#[serde_as]
 #[derive(Debug, Clone, Serialize)]
 #[serde(tag = "kind")]
 pub enum InMemoryLayerInfo {
-    Open { lsn_start: Lsn },
-    Frozen { lsn_start: Lsn, lsn_end: Lsn },
+    Open {
+        #[serde_as(as = "DisplayFromStr")]
+        lsn_start: Lsn,
+    },
+    Frozen {
+        #[serde_as(as = "DisplayFromStr")]
+        lsn_start: Lsn,
+        #[serde_as(as = "DisplayFromStr")]
+        lsn_end: Lsn,
+    },
 }

+#[serde_as]
 #[derive(Debug, Clone, Serialize)]
 #[serde(tag = "kind")]
 pub enum HistoricLayerInfo {
@@ -511,7 +548,9 @@ pub enum HistoricLayerInfo {
        layer_file_name: String,
        layer_file_size: u64,

+        #[serde_as(as = "DisplayFromStr")]
        lsn_start: Lsn,
+        #[serde_as(as = "DisplayFromStr")]
        lsn_end: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
@@ -520,6 +559,7 @@ pub enum HistoricLayerInfo {
        layer_file_name: String,
        layer_file_size: u64,

+        #[serde_as(as = "DisplayFromStr")]
        lsn_start: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -22,9 +22,9 @@ use postgres_ffi::Oid;
 /// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
 ///
 // FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
-// Then we could replace the custom Ord and PartialOrd implementations below with
-// deriving them. This will require changes in walredoproc.c.
-#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize)]
+// Then we could replace the custom Ord and PartialOrd implementations below with
+// deriving them.
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
 pub struct RelTag {
    pub forknum: u8,
    pub spcnode: Oid,
@@ -40,9 +40,21 @@ impl PartialOrd for RelTag {

 impl Ord for RelTag {
    fn cmp(&self, other: &Self) -> Ordering {
-        // Custom ordering where we put forknum to the end of the list
-        let other_tup = (other.spcnode, other.dbnode, other.relnode, other.forknum);
-        (self.spcnode, self.dbnode, self.relnode, self.forknum).cmp(&other_tup)
+        let mut cmp = self.spcnode.cmp(&other.spcnode);
+        if cmp != Ordering::Equal {
+            return cmp;
+        }
+        cmp = self.dbnode.cmp(&other.dbnode);
+        if cmp != Ordering::Equal {
+            return cmp;
+        }
+        cmp = self.relnode.cmp(&other.relnode);
+        if cmp != Ordering::Equal {
+            return cmp;
+        }
+        cmp = self.forknum.cmp(&other.forknum);
+
+        cmp
    }
 }

--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -242,7 +242,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
        }
    }

-    /// Cancellation safe as long as the underlying IO is cancellation safe.
    async fn shutdown(&mut self) -> io::Result<()> {
        match self {
            MaybeWriteOnly::Full(framed) => framed.shutdown().await,
@@ -394,23 +393,13 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        shutdown_watcher: F,
    ) -> Result<(), QueryError>
    where
-        F: Fn() -> S + Clone,
+        F: Fn() -> S,
        S: Future,
    {
-        let ret = self
-            .run_message_loop(handler, shutdown_watcher.clone())
-            .await;
-
-        tokio::select! {
-            _ = shutdown_watcher() => {
-                // do nothing; we most likely got already stopped by shutdown and will log it next.
-            }
-            _ = self.framed.shutdown() => {
-                // socket might be already closed, e.g. if previously received error,
-                // so ignore result.
-            },
-        }
-
+        let ret = self.run_message_loop(handler, shutdown_watcher).await;
+        // socket might be already closed, e.g. if previously received error,
+        // so ignore result.
+        self.framed.shutdown().await.ok();
        match ret {
            Ok(()) => Ok(()),
            Err(QueryError::Shutdown) => {
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -14,7 +14,6 @@ macro_rules! xlog_utils_test {
    ($version:ident) => {
        #[path = "."]
        mod $version {
-            #[allow(unused_imports)]
            pub use postgres_ffi::$version::wal_craft_test_export::*;
            #[allow(clippy::duplicate_mod)]
            #[cfg(test)]
--- a/libs/pq_proto/src/framed.rs
+++ b/libs/pq_proto/src/framed.rs
@@ -214,24 +214,27 @@ where
    }
 }

-/// Cancellation safe as long as the AsyncWrite is cancellation safe.
 async fn flush<S: AsyncWrite + Unpin>(
    stream: &mut S,
    write_buf: &mut BytesMut,
 ) -> Result<(), io::Error> {
    while write_buf.has_remaining() {
-        let bytes_written = stream.write_buf(write_buf).await?;
+        let bytes_written = stream.write(write_buf.chunk()).await?;
        if bytes_written == 0 {
            return Err(io::Error::new(
                ErrorKind::WriteZero,
                "failed to write message",
            ));
        }
+        // The advanced part will be garbage collected, likely during shifting
+        // data left on next attempt to write to buffer when free space is not
+        // enough.
+        write_buf.advance(bytes_written);
    }
+    write_buf.clear();
    stream.flush().await
 }

-/// Cancellation safe as long as the AsyncWrite is cancellation safe.
 async fn shutdown<S: AsyncWrite + Unpin>(
    stream: &mut S,
    write_buf: &mut BytesMut,
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -23,8 +23,8 @@ use tracing::debug;

 use crate::s3_bucket::RequestKind;
 use crate::{
-    AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath,
-    RemoteStorage, StorageMetadata,
+    AzureConfig, ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage,
+    StorageMetadata,
 };

 pub struct AzureBlobStorage {
@@ -184,11 +184,10 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {

 #[async_trait::async_trait]
 impl RemoteStorage for AzureBlobStorage {
-    async fn list(
+    async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
-        mode: ListingMode,
-    ) -> anyhow::Result<Listing, DownloadError> {
+    ) -> Result<Vec<RemotePath>, DownloadError> {
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
            .map(|p| self.relative_path_to_name(p))
@@ -196,19 +195,16 @@ impl RemoteStorage for AzureBlobStorage {
            .map(|mut p| {
                // required to end with a separator
                // otherwise request will return only the entry of a prefix
-                if matches!(mode, ListingMode::WithDelimiter)
-                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
-                {
+                if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
                }
                p
            });

-        let mut builder = self.client.list_blobs();
-
-        if let ListingMode::WithDelimiter = mode {
-            builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
-        }
+        let mut builder = self
+            .client
+            .list_blobs()
+            .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());

        if let Some(prefix) = list_prefix {
            builder = builder.prefix(Cow::from(prefix.to_owned()));
@@ -219,23 +215,46 @@ impl RemoteStorage for AzureBlobStorage {
        }

        let mut response = builder.into_stream();
-        let mut res = Listing::default();
-        while let Some(l) = response.next().await {
-            let entry = l.map_err(to_download_error)?;
-            let prefix_iter = entry
+        let mut res = Vec::new();
+        while let Some(entry) = response.next().await {
+            let entry = entry.map_err(to_download_error)?;
+            let name_iter = entry
                .blobs
                .prefixes()
                .map(|prefix| self.name_to_relative_path(&prefix.name));
-            res.prefixes.extend(prefix_iter);
-
-            let blob_iter = entry
-                .blobs
-                .blobs()
-                .map(|k| self.name_to_relative_path(&k.name));
-            res.keys.extend(blob_iter);
+            res.extend(name_iter);
        }
        Ok(res)
    }
+
+    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        let folder_name = folder
+            .map(|p| self.relative_path_to_name(p))
+            .or_else(|| self.prefix_in_container.clone());
+
+        let mut builder = self.client.list_blobs();
+
+        if let Some(folder_name) = folder_name {
+            builder = builder.prefix(Cow::from(folder_name.to_owned()));
+        }
+
+        if let Some(limit) = self.max_keys_per_list_response {
+            builder = builder.max_results(MaxResults::new(limit));
+        }
+
+        let mut response = builder.into_stream();
+        let mut res = Vec::new();
+        while let Some(l) = response.next().await {
+            let entry = l.map_err(anyhow::Error::new)?;
+            let name_iter = entry
+                .blobs
+                .blobs()
+                .map(|bl| self.name_to_relative_path(&bl.name));
+            res.extend(name_iter);
+        }
+        Ok(res)
+    }
+
    async fn upload(
        &self,
        mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -129,22 +129,6 @@ impl RemotePath {
    }
 }

-/// We don't need callers to be able to pass arbitrary delimiters: just control
-/// whether listings will use a '/' separator or not.
-///
-/// The WithDelimiter mode will populate `prefixes` and `keys` in the result.  The
-/// NoDelimiter mode will only populate `keys`.
-pub enum ListingMode {
-    WithDelimiter,
-    NoDelimiter,
-}
-
-#[derive(Default)]
-pub struct Listing {
-    pub prefixes: Vec<RemotePath>,
-    pub keys: Vec<RemotePath>,
-}
-
 /// Storage (potentially remote) API to manage its state.
 /// This storage tries to be unaware of any layered repository context,
 /// providing basic CRUD operations for storage files.
@@ -157,13 +141,8 @@ pub trait RemoteStorage: Send + Sync + 'static {
    async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        let result = self
-            .list(prefix, ListingMode::WithDelimiter)
-            .await?
-            .prefixes;
-        Ok(result)
-    }
+    ) -> Result<Vec<RemotePath>, DownloadError>;
+
    /// Lists all files in directory "recursively"
    /// (not really recursively, because AWS has a flat namespace)
    /// Note: This is subtely different than list_prefixes,
@@ -175,16 +154,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
    /// whereas,
    /// list_prefixes("foo/bar/") = ["cat", "dog"]
    /// See `test_real_s3.rs` for more details.
-    async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys;
-        Ok(result)
-    }
-
-    async fn list(
-        &self,
-        prefix: Option<&RemotePath>,
-        _mode: ListingMode,
-    ) -> anyhow::Result<Listing, DownloadError>;
+    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;

    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
@@ -235,9 +205,6 @@ pub enum DownloadError {
    BadInput(anyhow::Error),
    /// The file was not found in the remote storage.
    NotFound,
-    /// A cancellation token aborted the download, typically during
-    /// tenant detach or process shutdown.
-    Cancelled,
    /// The file was found in the remote storage, but the download failed.
    Other(anyhow::Error),
 }
@@ -248,7 +215,6 @@ impl std::fmt::Display for DownloadError {
            DownloadError::BadInput(e) => {
                write!(f, "Failed to download a remote file due to user input: {e}")
            }
-            DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
            DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
            DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
        }
@@ -268,19 +234,6 @@ pub enum GenericRemoteStorage {
 }

 impl GenericRemoteStorage {
-    pub async fn list(
-        &self,
-        prefix: Option<&RemotePath>,
-        mode: ListingMode,
-    ) -> anyhow::Result<Listing, DownloadError> {
-        match self {
-            Self::LocalFs(s) => s.list(prefix, mode).await,
-            Self::AwsS3(s) => s.list(prefix, mode).await,
-            Self::AzureBlob(s) => s.list(prefix, mode).await,
-            Self::Unreliable(s) => s.list(prefix, mode).await,
-        }
-    }
-
    // A function for listing all the files in a "directory"
    // Example:
    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -15,7 +15,7 @@ use tokio::{
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

-use crate::{Download, DownloadError, Listing, ListingMode, RemotePath};
+use crate::{Download, DownloadError, RemotePath};

 use super::{RemoteStorage, StorageMetadata};

@@ -75,7 +75,7 @@ impl LocalFs {
    }

    #[cfg(test)]
-    async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
+    async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
        Ok(get_all_files(&self.storage_root, true)
            .await?
            .into_iter()
@@ -89,10 +89,52 @@ impl LocalFs {
            })
            .collect())
    }
+}
+
+#[async_trait::async_trait]
+impl RemoteStorage for LocalFs {
+    async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        let path = match prefix {
+            Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
+            None => Cow::Borrowed(&self.storage_root),
+        };
+
+        let prefixes_to_filter = get_all_files(path.as_ref(), false)
+            .await
+            .map_err(DownloadError::Other)?;
+
+        let mut prefixes = Vec::with_capacity(prefixes_to_filter.len());
+
+        // filter out empty directories to mirror s3 behavior.
+        for prefix in prefixes_to_filter {
+            if prefix.is_dir()
+                && is_directory_empty(&prefix)
+                    .await
+                    .map_err(DownloadError::Other)?
+            {
+                continue;
+            }
+
+            prefixes.push(
+                prefix
+                    .strip_prefix(&self.storage_root)
+                    .context("Failed to strip prefix")
+                    .and_then(RemotePath::new)
+                    .expect(
+                        "We list files for storage root, hence should be able to remote the prefix",
+                    ),
+            )
+        }
+
+        Ok(prefixes)
+    }

    // recursively lists all files in a directory,
    // mirroring the `list_files` for `s3_bucket`
-    async fn list_recursive(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
        let full_path = match folder {
            Some(folder) => folder.with_base(&self.storage_root),
            None => self.storage_root.clone(),
@@ -144,70 +186,6 @@ impl LocalFs {

        Ok(files)
    }
-}
-
-#[async_trait::async_trait]
-impl RemoteStorage for LocalFs {
-    async fn list(
-        &self,
-        prefix: Option<&RemotePath>,
-        mode: ListingMode,
-    ) -> Result<Listing, DownloadError> {
-        let mut result = Listing::default();
-
-        if let ListingMode::NoDelimiter = mode {
-            let keys = self
-                .list_recursive(prefix)
-                .await
-                .map_err(DownloadError::Other)?;
-
-            result.keys = keys
-                .into_iter()
-                .filter(|k| {
-                    let path = k.with_base(&self.storage_root);
-                    !path.is_dir()
-                })
-                .collect();
-
-            return Ok(result);
-        }
-
-        let path = match prefix {
-            Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
-            None => Cow::Borrowed(&self.storage_root),
-        };
-
-        let prefixes_to_filter = get_all_files(path.as_ref(), false)
-            .await
-            .map_err(DownloadError::Other)?;
-
-        // filter out empty directories to mirror s3 behavior.
-        for prefix in prefixes_to_filter {
-            if prefix.is_dir()
-                && is_directory_empty(&prefix)
-                    .await
-                    .map_err(DownloadError::Other)?
-            {
-                continue;
-            }
-
-            let stripped = prefix
-                .strip_prefix(&self.storage_root)
-                .context("Failed to strip prefix")
-                .and_then(RemotePath::new)
-                .expect(
-                    "We list files for storage root, hence should be able to remote the prefix",
-                );
-
-            if prefix.is_dir() {
-                result.prefixes.push(stripped);
-            } else {
-                result.keys.push(stripped);
-            }
-        }
-
-        Ok(result)
-    }

    async fn upload(
        &self,
@@ -501,7 +479,7 @@ mod fs_tests {

        let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?;
        assert_eq!(
-            storage.list_all().await?,
+            storage.list().await?,
            vec![target_path_1.clone()],
            "Should list a single file after first upload"
        );
@@ -689,7 +667,7 @@ mod fs_tests {
        let upload_target = upload_dummy_file(&storage, upload_name, None).await?;

        storage.delete(&upload_target).await?;
-        assert!(storage.list_all().await?.is_empty());
+        assert!(storage.list().await?.is_empty());

        storage
            .delete(&upload_target)
@@ -747,43 +725,6 @@ mod fs_tests {
        Ok(())
    }

-    #[tokio::test]
-    async fn list() -> anyhow::Result<()> {
-        // No delimiter: should recursively list everything
-        let storage = create_storage()?;
-        let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
-        let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;
-
-        let listing = storage.list(None, ListingMode::NoDelimiter).await?;
-        assert!(listing.prefixes.is_empty());
-        assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
-
-        // Delimiter: should only go one deep
-        let listing = storage.list(None, ListingMode::WithDelimiter).await?;
-
-        assert_eq!(
-            listing.prefixes,
-            [RemotePath::from_string("timelines").unwrap()].to_vec()
-        );
-        assert!(listing.keys.is_empty());
-
-        // Delimiter & prefix
-        let listing = storage
-            .list(
-                Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
-                ListingMode::WithDelimiter,
-            )
-            .await?;
-        assert_eq!(
-            listing.prefixes,
-            [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
-                .to_vec()
-        );
-        assert_eq!(listing.keys, [uncle.clone()].to_vec());
-
-        Ok(())
-    }
-
    async fn upload_dummy_file(
        storage: &LocalFs,
        name: &str,
@@ -836,7 +777,7 @@ mod fs_tests {
    }

    async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemotePath>> {
-        let mut files = storage.list_all().await?;
+        let mut files = storage.list().await?;
        files.sort_by(|a, b| a.0.cmp(&b.0));
        Ok(files)
    }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -30,8 +30,8 @@ use tracing::debug;

 use super::StorageMetadata;
 use crate::{
-    ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
-    S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage, S3Config,
+    MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 pub(super) mod metrics;
@@ -299,13 +299,13 @@ impl<S: AsyncRead> AsyncRead for TimedDownload<S> {

 #[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
-    async fn list(
+    /// See the doc for `RemoteStorage::list_prefixes`
+    /// Note: it won't include empty "directories"
+    async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
-        mode: ListingMode,
-    ) -> Result<Listing, DownloadError> {
+    ) -> Result<Vec<RemotePath>, DownloadError> {
        let kind = RequestKind::List;
-        let mut result = Listing::default();

        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
@@ -314,33 +314,28 @@ impl RemoteStorage for S3Bucket {
            .map(|mut p| {
                // required to end with a separator
                // otherwise request will return only the entry of a prefix
-                if matches!(mode, ListingMode::WithDelimiter)
-                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
-                {
+                if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
                }
                p
            });

+        let mut document_keys = Vec::new();
+
        let mut continuation_token = None;

        loop {
            let _guard = self.permit(kind).await;
            let started_at = start_measuring_requests(kind);

-            let mut request = self
+            let fetch_response = self
                .client
                .list_objects_v2()
                .bucket(self.bucket_name.clone())
                .set_prefix(list_prefix.clone())
                .set_continuation_token(continuation_token)
-                .set_max_keys(self.max_keys_per_list_response);
-
-            if let ListingMode::WithDelimiter = mode {
-                request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
-            }
-
-            let response = request
+                .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
+                .set_max_keys(self.max_keys_per_list_response)
                .send()
                .await
                .context("Failed to list S3 prefixes")
@@ -350,35 +345,71 @@ impl RemoteStorage for S3Bucket {

            metrics::BUCKET_METRICS
                .req_seconds
-                .observe_elapsed(kind, &response, started_at);
+                .observe_elapsed(kind, &fetch_response, started_at);

-            let response = response?;
+            let fetch_response = fetch_response?;

-            let keys = response.contents().unwrap_or_default();
-            let empty = Vec::new();
-            let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
-
-            tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
-
-            for object in keys {
-                let object_path = object.key().expect("response does not contain a key");
-                let remote_path = self.s3_object_to_relative_path(object_path);
-                result.keys.push(remote_path);
-            }
-
-            result.prefixes.extend(
-                prefixes
-                    .iter()
+            document_keys.extend(
+                fetch_response
+                    .common_prefixes
+                    .unwrap_or_default()
+                    .into_iter()
                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
            );

-            continuation_token = match response.next_continuation_token {
+            continuation_token = match fetch_response.next_continuation_token {
                Some(new_token) => Some(new_token),
                None => break,
            };
        }

-        Ok(result)
+        Ok(document_keys)
+    }
+
+    /// See the doc for `RemoteStorage::list_files`
+    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        let kind = RequestKind::List;
+
+        let folder_name = folder
+            .map(|p| self.relative_path_to_s3_object(p))
+            .or_else(|| self.prefix_in_bucket.clone());
+
+        // AWS may need to break the response into several parts
+        let mut continuation_token = None;
+        let mut all_files = vec![];
+        loop {
+            let _guard = self.permit(kind).await;
+            let started_at = start_measuring_requests(kind);
+
+            let response = self
+                .client
+                .list_objects_v2()
+                .bucket(self.bucket_name.clone())
+                .set_prefix(folder_name.clone())
+                .set_continuation_token(continuation_token)
+                .set_max_keys(self.max_keys_per_list_response)
+                .send()
+                .await
+                .context("Failed to list files in S3 bucket");
+
+            let started_at = ScopeGuard::into_inner(started_at);
+            metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &response, started_at);
+
+            let response = response?;
+
+            for object in response.contents().unwrap_or_default() {
+                let object_path = object.key().expect("response does not contain a key");
+                let remote_path = self.s3_object_to_relative_path(object_path);
+                all_files.push(remote_path);
+            }
+            match response.next_continuation_token {
+                Some(new_token) => continuation_token = Some(new_token),
+                None => break,
+            }
+        }
+        Ok(all_files)
    }

    async fn upload(
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -5,9 +5,7 @@ use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::sync::Mutex;

-use crate::{
-    Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
-};
+use crate::{Download, DownloadError, RemotePath, RemoteStorage, StorageMetadata};

 pub struct UnreliableWrapper {
    inner: crate::GenericRemoteStorage,
@@ -97,15 +95,6 @@ impl RemoteStorage for UnreliableWrapper {
        self.inner.list_files(folder).await
    }

-    async fn list(
-        &self,
-        prefix: Option<&RemotePath>,
-        mode: ListingMode,
-    ) -> Result<Listing, DownloadError> {
-        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
-        self.inner.list(prefix, mode).await
-    }
-
    async fn upload(
        &self,
        data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -1,18 +1,23 @@
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};

 use utils::{
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
 };

+#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
+    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
+    #[serde_as(as = "DisplayFromStr")]
    pub timeline_id: TimelineId,
    pub peer_ids: Option<Vec<NodeId>>,
    pub pg_version: u32,
    pub system_id: Option<u64>,
    pub wal_seg_size: Option<u32>,
+    #[serde_as(as = "DisplayFromStr")]
    pub commit_lsn: Lsn,
    // If not passed, it is assigned to the beginning of commit_lsn segment.
    pub local_start_lsn: Option<Lsn>,
@@ -23,6 +28,7 @@ fn lsn_invalid() -> Lsn {
 }

 /// Data about safekeeper's timeline, mirrors broker.proto.
+#[serde_as]
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct SkTimelineInfo {
    /// Term.
@@ -30,19 +36,25 @@ pub struct SkTimelineInfo {
    /// Term of the last entry.
    pub last_log_term: Option<u64>,
    /// LSN of the last record.
+    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub flush_lsn: Lsn,
    /// Up to which LSN safekeeper regards its WAL as committed.
+    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub commit_lsn: Lsn,
    /// LSN up to which safekeeper has backed WAL.
+    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub backup_lsn: Lsn,
    /// LSN of last checkpoint uploaded by pageserver.
+    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub remote_consistent_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub peer_horizon_lsn: Lsn,
+    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub local_start_lsn: Lsn,
    /// A connection string to use for WAL receiving.
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -55,7 +55,6 @@ bytes.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
 camino-tempfile.workspace = true
-serde_assert.workspace = true

 [[bench]]
 name = "benchmarks"
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -9,6 +9,7 @@ use jsonwebtoken::{
    decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation,
 };
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};

 use crate::id::TenantId;

@@ -31,9 +32,11 @@ pub enum Scope {
 }

 /// JWT payload. See docs/authentication.md for the format
+#[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
 pub struct Claims {
    #[serde(default)]
+    #[serde_as(as = "Option<DisplayFromStr>")]
    pub tenant_id: Option<TenantId>,
    pub scope: Scope,
 }
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
 ///
 /// See docs/rfcs/025-generation-numbers.md for detail on how generation
 /// numbers are used.
-#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
+#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
 pub enum Generation {
    // Generations with this magic value will not add a suffix to S3 keys, and will not
    // be included in persisted index_part.json.  This value is only to be used
--- a/libs/utils/src/hex.rs
+++ b/libs/utils/src/hex.rs
@@ -1,41 +0,0 @@
-/// Useful type for asserting that expected bytes match reporting the bytes more readable
-/// array-syntax compatible hex bytes.
-///
-/// # Usage
-///
-/// ```
-/// use utils::Hex;
-///
-/// let actual = serialize_something();
-/// let expected = [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64];
-///
-/// // the type implements PartialEq and on mismatch, both sides are printed in 16 wide multiline
-/// // output suffixed with an array style length for easier comparisons.
-/// assert_eq!(Hex(&actual), Hex(&expected));
-///
-/// // with `let expected = [0x68];` the error would had been:
-/// // assertion `left == right` failed
-/// //  left: [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64; 11]
-/// // right: [0x68; 1]
-/// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
-/// ```
-#[derive(PartialEq)]
-pub struct Hex<'a>(pub &'a [u8]);
-
-impl std::fmt::Debug for Hex<'_> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "[")?;
-        for (i, c) in self.0.chunks(16).enumerate() {
-            if i > 0 && !c.is_empty() {
-                writeln!(f, ", ")?;
-            }
-            for (j, b) in c.iter().enumerate() {
-                if j > 0 {
-                    write!(f, ", ")?;
-                }
-                write!(f, "0x{b:02x}")?;
-            }
-        }
-        write!(f, "; {}]", self.0.len())
-    }
-}
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -14,11 +14,6 @@ use tracing::{self, debug, info, info_span, warn, Instrument};
 use std::future::Future;
 use std::str::FromStr;

-use bytes::{Bytes, BytesMut};
-use std::io::Write as _;
-use tokio::sync::mpsc;
-use tokio_stream::wrappers::ReceiverStream;
-
 static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "libmetrics_metric_handler_requests_total",
@@ -151,89 +146,94 @@ impl Drop for RequestCancelled {
    }
 }

-/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
-pub struct ChannelWriter {
-    buffer: BytesMut,
-    pub tx: mpsc::Sender<std::io::Result<Bytes>>,
-    written: usize,
-}
-
-impl ChannelWriter {
-    pub fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
-        assert_ne!(buf_len, 0);
-        ChannelWriter {
-            // split about half off the buffer from the start, because we flush depending on
-            // capacity. first flush will come sooner than without this, but now resizes will
-            // have better chance of picking up the "other" half. not guaranteed of course.
-            buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
-            tx,
-            written: 0,
-        }
-    }
-
-    pub fn flush0(&mut self) -> std::io::Result<usize> {
-        let n = self.buffer.len();
-        if n == 0 {
-            return Ok(0);
-        }
-
-        tracing::trace!(n, "flushing");
-        let ready = self.buffer.split().freeze();
-
-        // not ideal to call from blocking code to block_on, but we are sure that this
-        // operation does not spawn_blocking other tasks
-        let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
-            self.tx.send(Ok(ready)).await.map_err(|_| ())?;
-
-            // throttle sending to allow reuse of our buffer in `write`.
-            self.tx.reserve().await.map_err(|_| ())?;
-
-            // now the response task has picked up the buffer and hopefully started
-            // sending it to the client.
-            Ok(())
-        });
-        if res.is_err() {
-            return Err(std::io::ErrorKind::BrokenPipe.into());
-        }
-        self.written += n;
-        Ok(n)
-    }
-
-    pub fn flushed_bytes(&self) -> usize {
-        self.written
-    }
-}
-
-impl std::io::Write for ChannelWriter {
-    fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
-        let remaining = self.buffer.capacity() - self.buffer.len();
-
-        let out_of_space = remaining < buf.len();
-
-        let original_len = buf.len();
-
-        if out_of_space {
-            let can_still_fit = buf.len() - remaining;
-            self.buffer.extend_from_slice(&buf[..can_still_fit]);
-            buf = &buf[can_still_fit..];
-            self.flush0()?;
-        }
-
-        // assume that this will often under normal operation just move the pointer back to the
-        // beginning of allocation, because previous split off parts are already sent and
-        // dropped.
-        self.buffer.extend_from_slice(buf);
-        Ok(original_len)
-    }
-
-    fn flush(&mut self) -> std::io::Result<()> {
-        self.flush0().map(|_| ())
-    }
-}
-
 async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    use bytes::{Bytes, BytesMut};
+    use std::io::Write as _;
+    use tokio::sync::mpsc;
+    use tokio_stream::wrappers::ReceiverStream;
+
    SERVE_METRICS_COUNT.inc();

+    /// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
+    struct ChannelWriter {
+        buffer: BytesMut,
+        tx: mpsc::Sender<std::io::Result<Bytes>>,
+        written: usize,
+    }
+
+    impl ChannelWriter {
+        fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
+            assert_ne!(buf_len, 0);
+            ChannelWriter {
+                // split about half off the buffer from the start, because we flush depending on
+                // capacity. first flush will come sooner than without this, but now resizes will
+                // have better chance of picking up the "other" half. not guaranteed of course.
+                buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
+                tx,
+                written: 0,
+            }
+        }
+
+        fn flush0(&mut self) -> std::io::Result<usize> {
+            let n = self.buffer.len();
+            if n == 0 {
+                return Ok(0);
+            }
+
+            tracing::trace!(n, "flushing");
+            let ready = self.buffer.split().freeze();
+
+            // not ideal to call from blocking code to block_on, but we are sure that this
+            // operation does not spawn_blocking other tasks
+            let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
+                self.tx.send(Ok(ready)).await.map_err(|_| ())?;
+
+                // throttle sending to allow reuse of our buffer in `write`.
+                self.tx.reserve().await.map_err(|_| ())?;
+
+                // now the response task has picked up the buffer and hopefully started
+                // sending it to the client.
+                Ok(())
+            });
+            if res.is_err() {
+                return Err(std::io::ErrorKind::BrokenPipe.into());
+            }
+            self.written += n;
+            Ok(n)
+        }
+
+        fn flushed_bytes(&self) -> usize {
+            self.written
+        }
+    }
+
+    impl std::io::Write for ChannelWriter {
+        fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
+            let remaining = self.buffer.capacity() - self.buffer.len();
+
+            let out_of_space = remaining < buf.len();
+
+            let original_len = buf.len();
+
+            if out_of_space {
+                let can_still_fit = buf.len() - remaining;
+                self.buffer.extend_from_slice(&buf[..can_still_fit]);
+                buf = &buf[can_still_fit..];
+                self.flush0()?;
+            }
+
+            // assume that this will often under normal operation just move the pointer back to the
+            // beginning of allocation, because previous split off parts are already sent and
+            // dropped.
+            self.buffer.extend_from_slice(buf);
+            Ok(original_len)
+        }
+
+        fn flush(&mut self) -> std::io::Result<()> {
+            self.flush0().map(|_| ())
+        }
+    }
+
    let started_at = std::time::Instant::now();

    let (tx, rx) = mpsc::channel(1);
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -3,7 +3,6 @@ use std::{fmt, str::FromStr};
 use anyhow::Context;
 use hex::FromHex;
 use rand::Rng;
-use serde::de::Visitor;
 use serde::{Deserialize, Serialize};
 use thiserror::Error;

@@ -18,74 +17,12 @@ pub enum IdError {
 ///
 /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
 /// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
-#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
+///
+/// Use `#[serde_as(as = "DisplayFromStr")]` to (de)serialize it as a hex string instead: `ad50847381e248feaac9876cc71ae418`.
+/// Check the `serde_with::serde_as` documentation for options for more complex types.
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
 struct Id([u8; 16]);

-impl Serialize for Id {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        if serializer.is_human_readable() {
-            serializer.collect_str(self)
-        } else {
-            self.0.serialize(serializer)
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for Id {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        struct IdVisitor {
-            is_human_readable_deserializer: bool,
-        }
-
-        impl<'de> Visitor<'de> for IdVisitor {
-            type Value = Id;
-
-            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
-                if self.is_human_readable_deserializer {
-                    formatter.write_str("value in form of hex string")
-                } else {
-                    formatter.write_str("value in form of integer array([u8; 16])")
-                }
-            }
-
-            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::SeqAccess<'de>,
-            {
-                let s = serde::de::value::SeqAccessDeserializer::new(seq);
-                let id: [u8; 16] = Deserialize::deserialize(s)?;
-                Ok(Id::from(id))
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                Id::from_str(v).map_err(E::custom)
-            }
-        }
-
-        if deserializer.is_human_readable() {
-            deserializer.deserialize_str(IdVisitor {
-                is_human_readable_deserializer: true,
-            })
-        } else {
-            deserializer.deserialize_tuple(
-                16,
-                IdVisitor {
-                    is_human_readable_deserializer: false,
-                },
-            )
-        }
-    }
-}
-
 impl Id {
    pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
        let mut arr = [0u8; 16];
@@ -371,112 +308,3 @@ impl fmt::Display for NodeId {
        write!(f, "{}", self.0)
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use serde_assert::{Deserializer, Serializer, Token, Tokens};
-
-    use crate::bin_ser::BeSer;
-
-    use super::*;
-
-    #[test]
-    fn test_id_serde_non_human_readable() {
-        let original_id = Id([
-            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
-        ]);
-        let expected_tokens = Tokens(vec![
-            Token::Tuple { len: 16 },
-            Token::U8(173),
-            Token::U8(80),
-            Token::U8(132),
-            Token::U8(115),
-            Token::U8(129),
-            Token::U8(226),
-            Token::U8(72),
-            Token::U8(254),
-            Token::U8(170),
-            Token::U8(201),
-            Token::U8(135),
-            Token::U8(108),
-            Token::U8(199),
-            Token::U8(26),
-            Token::U8(228),
-            Token::U8(24),
-            Token::TupleEnd,
-        ]);
-
-        let serializer = Serializer::builder().is_human_readable(false).build();
-        let serialized_tokens = original_id.serialize(&serializer).unwrap();
-        assert_eq!(serialized_tokens, expected_tokens);
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(false)
-            .tokens(serialized_tokens)
-            .build();
-        let deserialized_id = Id::deserialize(&mut deserializer).unwrap();
-        assert_eq!(deserialized_id, original_id);
-    }
-
-    #[test]
-    fn test_id_serde_human_readable() {
-        let original_id = Id([
-            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
-        ]);
-        let expected_tokens = Tokens(vec![Token::Str(String::from(
-            "ad50847381e248feaac9876cc71ae418",
-        ))]);
-
-        let serializer = Serializer::builder().is_human_readable(true).build();
-        let serialized_tokens = original_id.serialize(&serializer).unwrap();
-        assert_eq!(serialized_tokens, expected_tokens);
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(true)
-            .tokens(Tokens(vec![Token::Str(String::from(
-                "ad50847381e248feaac9876cc71ae418",
-            ))]))
-            .build();
-        assert_eq!(Id::deserialize(&mut deserializer).unwrap(), original_id);
-    }
-
-    macro_rules! roundtrip_type {
-        ($type:ty, $expected_bytes:expr) => {{
-            let expected_bytes: [u8; 16] = $expected_bytes;
-            let original_id = <$type>::from(expected_bytes);
-
-            let ser_bytes = original_id.ser().unwrap();
-            assert_eq!(ser_bytes, expected_bytes);
-
-            let des_id = <$type>::des(&ser_bytes).unwrap();
-            assert_eq!(des_id, original_id);
-        }};
-    }
-
-    #[test]
-    fn test_id_bincode_serde() {
-        let expected_bytes = [
-            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
-        ];
-
-        roundtrip_type!(Id, expected_bytes);
-    }
-
-    #[test]
-    fn test_tenant_id_bincode_serde() {
-        let expected_bytes = [
-            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
-        ];
-
-        roundtrip_type!(TenantId, expected_bytes);
-    }
-
-    #[test]
-    fn test_timeline_id_bincode_serde() {
-        let expected_bytes = [
-            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
-        ];
-
-        roundtrip_type!(TimelineId, expected_bytes);
-    }
-}
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -24,10 +24,6 @@ pub mod auth;

 // utility functions and helper traits for unified unique id generation/serialization etc.
 pub mod id;
-
-mod hex;
-pub use hex::Hex;
-
 // http endpoint utils
 pub mod http;

@@ -77,8 +73,6 @@ pub mod completion;
 /// Reporting utilities
 pub mod error;

-pub mod sync;
-
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
@@ -134,21 +128,6 @@ macro_rules! project_git_version {
    };
 }

-/// This is a shortcut to embed build tag into binaries and avoid copying the same build script to all packages
-#[macro_export]
-macro_rules! project_build_tag {
-    ($const_identifier:ident) => {
-        const $const_identifier: &::core::primitive::str = {
-            const __ARG: &[&::core::primitive::str; 2] = &match ::core::option_env!("BUILD_TAG") {
-                ::core::option::Option::Some(x) => ["build_tag-env:", x],
-                ::core::option::Option::None => ["build_tag:", ""],
-            };
-
-            $crate::__const_format::concatcp!(__ARG[0], __ARG[1])
-        };
-    };
-}
-
 /// Re-export for `project_git_version` macro
 #[doc(hidden)]
 pub use const_format as __const_format;
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -1,7 +1,7 @@
 #![warn(missing_docs)]

 use camino::Utf8Path;
-use serde::{de::Visitor, Deserialize, Serialize};
+use serde::{Deserialize, Serialize};
 use std::fmt;
 use std::ops::{Add, AddAssign};
 use std::str::FromStr;
@@ -13,114 +13,10 @@ use crate::seqwait::MonotonicCounter;
 pub const XLOG_BLCKSZ: u32 = 8192;

 /// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
-#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)]
+#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)]
+#[serde(transparent)]
 pub struct Lsn(pub u64);

-impl Serialize for Lsn {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        if serializer.is_human_readable() {
-            serializer.collect_str(self)
-        } else {
-            self.0.serialize(serializer)
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for Lsn {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        struct LsnVisitor {
-            is_human_readable_deserializer: bool,
-        }
-
-        impl<'de> Visitor<'de> for LsnVisitor {
-            type Value = Lsn;
-
-            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
-                if self.is_human_readable_deserializer {
-                    formatter.write_str(
-                        "value in form of hex string({upper_u32_hex}/{lower_u32_hex}) representing u64 integer",
-                    )
-                } else {
-                    formatter.write_str("value in form of integer(u64)")
-                }
-            }
-
-            fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                Ok(Lsn(v))
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                Lsn::from_str(v).map_err(|e| E::custom(e))
-            }
-        }
-
-        if deserializer.is_human_readable() {
-            deserializer.deserialize_str(LsnVisitor {
-                is_human_readable_deserializer: true,
-            })
-        } else {
-            deserializer.deserialize_u64(LsnVisitor {
-                is_human_readable_deserializer: false,
-            })
-        }
-    }
-}
-
-/// Allows (de)serialization of an `Lsn` always as `u64`.
-///
-/// ### Example
-///
-/// ```rust
-/// # use serde::{Serialize, Deserialize};
-/// use utils::lsn::Lsn;
-///
-/// #[derive(PartialEq, Serialize, Deserialize, Debug)]
-/// struct Foo {
-///   #[serde(with = "utils::lsn::serde_as_u64")]
-///   always_u64: Lsn,
-/// }
-///
-/// let orig = Foo { always_u64: Lsn(1234) };
-///
-/// let res = serde_json::to_string(&orig).unwrap();
-/// assert_eq!(res, r#"{"always_u64":1234}"#);
-///
-/// let foo = serde_json::from_str::<Foo>(&res).unwrap();
-/// assert_eq!(foo, orig);
-/// ```
-///
-pub mod serde_as_u64 {
-    use super::Lsn;
-
-    /// Serializes the Lsn as u64 disregarding the human readability of the format.
-    ///
-    /// Meant to be used via `#[serde(with = "...")]` or `#[serde(serialize_with = "...")]`.
-    pub fn serialize<S: serde::Serializer>(lsn: &Lsn, serializer: S) -> Result<S::Ok, S::Error> {
-        use serde::Serialize;
-        lsn.0.serialize(serializer)
-    }
-
-    /// Deserializes the Lsn as u64 disregarding the human readability of the format.
-    ///
-    /// Meant to be used via `#[serde(with = "...")]` or `#[serde(deserialize_with = "...")]`.
-    pub fn deserialize<'de, D: serde::Deserializer<'de>>(deserializer: D) -> Result<Lsn, D::Error> {
-        use serde::Deserialize;
-        u64::deserialize(deserializer).map(Lsn)
-    }
-}
-
 /// We tried to parse an LSN from a string, but failed
 #[derive(Debug, PartialEq, Eq, thiserror::Error)]
 #[error("LsnParseError")]
@@ -368,13 +264,8 @@ impl MonotonicCounter<Lsn> for RecordLsn {

 #[cfg(test)]
 mod tests {
-    use crate::bin_ser::BeSer;
-
    use super::*;

-    use serde::ser::Serialize;
-    use serde_assert::{Deserializer, Serializer, Token, Tokens};
-
    #[test]
    fn test_lsn_strings() {
        assert_eq!("12345678/AAAA5555".parse(), Ok(Lsn(0x12345678AAAA5555)));
@@ -450,95 +341,4 @@ mod tests {
        assert_eq!(lsn.fetch_max(Lsn(6000)), Lsn(5678));
        assert_eq!(lsn.fetch_max(Lsn(5000)), Lsn(6000));
    }
-
-    #[test]
-    fn test_lsn_serde() {
-        let original_lsn = Lsn(0x0123456789abcdef);
-        let expected_readable_tokens = Tokens(vec![Token::U64(0x0123456789abcdef)]);
-        let expected_non_readable_tokens =
-            Tokens(vec![Token::Str(String::from("1234567/89ABCDEF"))]);
-
-        // Testing human_readable ser/de
-        let serializer = Serializer::builder().is_human_readable(false).build();
-        let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
-        assert_eq!(readable_ser_tokens, expected_readable_tokens);
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(false)
-            .tokens(readable_ser_tokens)
-            .build();
-        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
-        assert_eq!(des_lsn, original_lsn);
-
-        // Testing NON human_readable ser/de
-        let serializer = Serializer::builder().is_human_readable(true).build();
-        let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
-        assert_eq!(non_readable_ser_tokens, expected_non_readable_tokens);
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(true)
-            .tokens(non_readable_ser_tokens)
-            .build();
-        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
-        assert_eq!(des_lsn, original_lsn);
-
-        // Testing mismatching ser/de
-        let serializer = Serializer::builder().is_human_readable(false).build();
-        let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(true)
-            .tokens(non_readable_ser_tokens)
-            .build();
-        Lsn::deserialize(&mut deserializer).unwrap_err();
-
-        let serializer = Serializer::builder().is_human_readable(true).build();
-        let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(false)
-            .tokens(readable_ser_tokens)
-            .build();
-        Lsn::deserialize(&mut deserializer).unwrap_err();
-    }
-
-    #[test]
-    fn test_lsn_ensure_roundtrip() {
-        let original_lsn = Lsn(0xaaaabbbb);
-
-        let serializer = Serializer::builder().is_human_readable(false).build();
-        let ser_tokens = original_lsn.serialize(&serializer).unwrap();
-
-        let mut deserializer = Deserializer::builder()
-            .is_human_readable(false)
-            .tokens(ser_tokens)
-            .build();
-
-        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
-        assert_eq!(des_lsn, original_lsn);
-    }
-
-    #[test]
-    fn test_lsn_bincode_serde() {
-        let lsn = Lsn(0x0123456789abcdef);
-        let expected_bytes = [0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef];
-
-        let ser_bytes = lsn.ser().unwrap();
-        assert_eq!(ser_bytes, expected_bytes);
-
-        let des_lsn = Lsn::des(&ser_bytes).unwrap();
-        assert_eq!(des_lsn, lsn);
-    }
-
-    #[test]
-    fn test_lsn_bincode_ensure_roundtrip() {
-        let original_lsn = Lsn(0x01_02_03_04_05_06_07_08);
-        let expected_bytes = vec![0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08];
-
-        let ser_bytes = original_lsn.ser().unwrap();
-        assert_eq!(ser_bytes, expected_bytes);
-
-        let des_lsn = Lsn::des(&ser_bytes).unwrap();
-        assert_eq!(des_lsn, original_lsn);
-    }
 }
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -3,6 +3,7 @@ use std::time::{Duration, SystemTime};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use pq_proto::{read_cstr, PG_EPOCH};
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use tracing::{trace, warn};

 use crate::lsn::Lsn;
@@ -14,17 +15,21 @@ use crate::lsn::Lsn;
 ///
 /// serde Serialize is used only for human readable dump to json (e.g. in
 /// safekeepers debug_dump).
+#[serde_as]
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct PageserverFeedback {
    /// Last known size of the timeline. Used to enforce timeline size limit.
    pub current_timeline_size: u64,
    /// LSN last received and ingested by the pageserver. Controls backpressure.
+    #[serde_as(as = "DisplayFromStr")]
    pub last_received_lsn: Lsn,
    /// LSN up to which data is persisted by the pageserver to its local disc.
    /// Controls backpressure.
+    #[serde_as(as = "DisplayFromStr")]
    pub disk_consistent_lsn: Lsn,
    /// LSN up to which data is persisted by the pageserver on s3; safekeepers
    /// consider WAL before it can be removed.
+    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,
    // Serialize with RFC3339 format.
    #[serde(with = "serde_systemtime")]
--- a/libs/utils/src/sync.rs
+++ b/libs/utils/src/sync.rs
@@ -1 +0,0 @@
-pub mod heavier_once_cell;
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -1,383 +0,0 @@
-use std::sync::{
-    atomic::{AtomicUsize, Ordering},
-    Arc, Mutex, MutexGuard,
-};
-use tokio::sync::Semaphore;
-
-/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
-/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
-/// for the duration of initialization.
-///
-/// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
-///
-/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
-pub struct OnceCell<T> {
-    inner: Mutex<Inner<T>>,
-    initializers: AtomicUsize,
-}
-
-impl<T> Default for OnceCell<T> {
-    /// Create new uninitialized [`OnceCell`].
-    fn default() -> Self {
-        Self {
-            inner: Default::default(),
-            initializers: AtomicUsize::new(0),
-        }
-    }
-}
-
-/// Semaphore is the current state:
-/// - open semaphore means the value is `None`, not yet initialized
-/// - closed semaphore means the value has been initialized
-#[derive(Debug)]
-struct Inner<T> {
-    init_semaphore: Arc<Semaphore>,
-    value: Option<T>,
-}
-
-impl<T> Default for Inner<T> {
-    fn default() -> Self {
-        Self {
-            init_semaphore: Arc::new(Semaphore::new(1)),
-            value: None,
-        }
-    }
-}
-
-impl<T> OnceCell<T> {
-    /// Creates an already initialized `OnceCell` with the given value.
-    pub fn new(value: T) -> Self {
-        let sem = Semaphore::new(1);
-        sem.close();
-        Self {
-            inner: Mutex::new(Inner {
-                init_semaphore: Arc::new(sem),
-                value: Some(value),
-            }),
-            initializers: AtomicUsize::new(0),
-        }
-    }
-
-    /// Returns a guard to an existing initialized value, or uniquely initializes the value before
-    /// returning the guard.
-    ///
-    /// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
-    ///
-    /// Initialization is panic-safe and cancellation-safe.
-    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
-    where
-        F: FnOnce(InitPermit) -> Fut,
-        Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
-    {
-        let sem = {
-            let guard = self.inner.lock().unwrap();
-            if guard.value.is_some() {
-                return Ok(Guard(guard));
-            }
-            guard.init_semaphore.clone()
-        };
-
-        let permit = {
-            // increment the count for the duration of queued
-            let _guard = CountWaitingInitializers::start(self);
-            sem.acquire_owned().await
-        };
-
-        match permit {
-            Ok(permit) => {
-                let permit = InitPermit(permit);
-                let (value, _permit) = factory(permit).await?;
-
-                let guard = self.inner.lock().unwrap();
-
-                Ok(Self::set0(value, guard))
-            }
-            Err(_closed) => {
-                let guard = self.inner.lock().unwrap();
-                assert!(
-                    guard.value.is_some(),
-                    "semaphore got closed, must be initialized"
-                );
-                return Ok(Guard(guard));
-            }
-        }
-    }
-
-    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
-    /// to complete initializing the inner value.
-    ///
-    /// # Panics
-    ///
-    /// If the inner has already been initialized.
-    pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
-        let guard = self.inner.lock().unwrap();
-
-        // cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
-        // give more permits right now.
-        if guard.init_semaphore.try_acquire().is_ok() {
-            drop(guard);
-            panic!("permit is of wrong origin");
-        }
-
-        Self::set0(value, guard)
-    }
-
-    fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
-        if guard.value.is_some() {
-            drop(guard);
-            unreachable!("we won permit, must not be initialized");
-        }
-        guard.value = Some(value);
-        guard.init_semaphore.close();
-        Guard(guard)
-    }
-
-    /// Returns a guard to an existing initialized value, if any.
-    pub fn get(&self) -> Option<Guard<'_, T>> {
-        let guard = self.inner.lock().unwrap();
-        if guard.value.is_some() {
-            Some(Guard(guard))
-        } else {
-            None
-        }
-    }
-
-    /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
-    pub fn initializer_count(&self) -> usize {
-        self.initializers.load(Ordering::Relaxed)
-    }
-}
-
-/// DropGuard counter for queued tasks waiting to initialize, mainly accessible for the
-/// initializing task for example at the end of initialization.
-struct CountWaitingInitializers<'a, T>(&'a OnceCell<T>);
-
-impl<'a, T> CountWaitingInitializers<'a, T> {
-    fn start(target: &'a OnceCell<T>) -> Self {
-        target.initializers.fetch_add(1, Ordering::Relaxed);
-        CountWaitingInitializers(target)
-    }
-}
-
-impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
-    fn drop(&mut self) {
-        self.0.initializers.fetch_sub(1, Ordering::Relaxed);
-    }
-}
-
-/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
-/// initialized value.
-#[derive(Debug)]
-pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
-
-impl<T> std::ops::Deref for Guard<'_, T> {
-    type Target = T;
-
-    fn deref(&self) -> &Self::Target {
-        self.0
-            .value
-            .as_ref()
-            .expect("guard is not created unless value has been initialized")
-    }
-}
-
-impl<T> std::ops::DerefMut for Guard<'_, T> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        self.0
-            .value
-            .as_mut()
-            .expect("guard is not created unless value has been initialized")
-    }
-}
-
-impl<'a, T> Guard<'a, T> {
-    /// Take the current value, and a new permit for it's deinitialization.
-    ///
-    /// The permit will be on a semaphore part of the new internal value, and any following
-    /// [`OnceCell::get_or_init`] will wait on it to complete.
-    pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
-        let mut swapped = Inner::default();
-        let permit = swapped
-            .init_semaphore
-            .clone()
-            .try_acquire_owned()
-            .expect("we just created this");
-        std::mem::swap(&mut *self.0, &mut swapped);
-        swapped
-            .value
-            .map(|v| (v, InitPermit(permit)))
-            .expect("guard is not created unless value has been initialized")
-    }
-}
-
-/// Type held by OnceCell (de)initializing task.
-pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use std::{
-        convert::Infallible,
-        sync::atomic::{AtomicUsize, Ordering},
-        time::Duration,
-    };
-
-    #[tokio::test]
-    async fn many_initializers() {
-        #[derive(Default, Debug)]
-        struct Counters {
-            factory_got_to_run: AtomicUsize,
-            future_polled: AtomicUsize,
-            winners: AtomicUsize,
-        }
-
-        let initializers = 100;
-
-        let cell = Arc::new(OnceCell::default());
-        let counters = Arc::new(Counters::default());
-        let barrier = Arc::new(tokio::sync::Barrier::new(initializers + 1));
-
-        let mut js = tokio::task::JoinSet::new();
-        for i in 0..initializers {
-            js.spawn({
-                let cell = cell.clone();
-                let counters = counters.clone();
-                let barrier = barrier.clone();
-
-                async move {
-                    barrier.wait().await;
-                    let won = {
-                        let g = cell
-                            .get_or_init(|permit| {
-                                counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
-                                async {
-                                    counters.future_polled.fetch_add(1, Ordering::Relaxed);
-                                    Ok::<_, Infallible>((i, permit))
-                                }
-                            })
-                            .await
-                            .unwrap();
-
-                        *g == i
-                    };
-
-                    if won {
-                        counters.winners.fetch_add(1, Ordering::Relaxed);
-                    }
-                }
-            });
-        }
-
-        barrier.wait().await;
-
-        while let Some(next) = js.join_next().await {
-            next.expect("no panics expected");
-        }
-
-        let mut counters = Arc::try_unwrap(counters).unwrap();
-
-        assert_eq!(*counters.factory_got_to_run.get_mut(), 1);
-        assert_eq!(*counters.future_polled.get_mut(), 1);
-        assert_eq!(*counters.winners.get_mut(), 1);
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn reinit_waits_for_deinit() {
-        // with the tokio::time paused, we will "sleep" for 1s while holding the reinitialization
-        let sleep_for = Duration::from_secs(1);
-        let initial = 42;
-        let reinit = 1;
-        let cell = Arc::new(OnceCell::new(initial));
-
-        let deinitialization_started = Arc::new(tokio::sync::Barrier::new(2));
-
-        let jh = tokio::spawn({
-            let cell = cell.clone();
-            let deinitialization_started = deinitialization_started.clone();
-            async move {
-                let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
-                assert_eq!(answer, initial);
-
-                deinitialization_started.wait().await;
-                tokio::time::sleep(sleep_for).await;
-            }
-        });
-
-        deinitialization_started.wait().await;
-
-        let started_at = tokio::time::Instant::now();
-        cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
-            .await
-            .unwrap();
-
-        let elapsed = started_at.elapsed();
-        assert!(
-            elapsed >= sleep_for,
-            "initialization should had taken at least the time time slept with permit"
-        );
-
-        jh.await.unwrap();
-
-        assert_eq!(*cell.get().unwrap(), reinit);
-    }
-
-    #[test]
-    fn reinit_with_deinit_permit() {
-        let cell = Arc::new(OnceCell::new(42));
-
-        let (mol, permit) = cell.get().unwrap().take_and_deinit();
-        cell.set(5, permit);
-        assert_eq!(*cell.get().unwrap(), 5);
-
-        let (five, permit) = cell.get().unwrap().take_and_deinit();
-        assert_eq!(5, five);
-        cell.set(mol, permit);
-        assert_eq!(*cell.get().unwrap(), 42);
-    }
-
-    #[tokio::test]
-    async fn initialization_attemptable_until_ok() {
-        let cell = OnceCell::default();
-
-        for _ in 0..10 {
-            cell.get_or_init(|_permit| async { Err("whatever error") })
-                .await
-                .unwrap_err();
-        }
-
-        let g = cell
-            .get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
-            .await
-            .unwrap();
-        assert_eq!(*g, "finally success");
-    }
-
-    #[tokio::test]
-    async fn initialization_is_cancellation_safe() {
-        let cell = OnceCell::default();
-
-        let barrier = tokio::sync::Barrier::new(2);
-
-        let initializer = cell.get_or_init(|permit| async {
-            barrier.wait().await;
-            futures::future::pending::<()>().await;
-
-            Ok::<_, Infallible>(("never reached", permit))
-        });
-
-        tokio::select! {
-            _ = initializer => { unreachable!("cannot complete; stuck in pending().await") },
-            _ = barrier.wait() => {}
-        };
-
-        // now initializer is dropped
-
-        assert!(cell.get().is_none());
-
-        let g = cell
-            .get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
-            .await
-            .unwrap();
-        assert_eq!(*g, "now initialized");
-    }
-}
--- a/libs/vm_monitor/src/filecache.rs
+++ b/libs/vm_monitor/src/filecache.rs
@@ -21,6 +21,11 @@ pub struct FileCacheState {

 #[derive(Debug)]
 pub struct FileCacheConfig {
+    /// Whether the file cache is *actually* stored in memory (e.g. by writing to
+    /// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
+    /// memory available for the cgroup.
+    pub(crate) in_memory: bool,
+
    /// The size of the file cache, in terms of the size of the resource it consumes
    /// (currently: only memory)
    ///
@@ -54,9 +59,22 @@ pub struct FileCacheConfig {
    spread_factor: f64,
 }

-impl Default for FileCacheConfig {
-    fn default() -> Self {
+impl FileCacheConfig {
+    pub fn default_in_memory() -> Self {
        Self {
+            in_memory: true,
+            // 75 %
+            resource_multiplier: 0.75,
+            // 640 MiB; (512 + 128)
+            min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
+            // ensure any increase in file cache size is split 90-10 with 10% to other memory
+            spread_factor: 0.1,
+        }
+    }
+
+    pub fn default_on_disk() -> Self {
+        Self {
+            in_memory: false,
            resource_multiplier: 0.75,
            // 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
            // memory, the kernel will just evict from its page cache, rather than e.g. killing
@@ -65,9 +83,7 @@ impl Default for FileCacheConfig {
            spread_factor: 0.1,
        }
    }
-}

-impl FileCacheConfig {
    /// Make sure fields of the config are consistent.
    pub fn validate(&self) -> anyhow::Result<()> {
        // Single field validity
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -39,6 +39,16 @@ pub struct Args {
    #[arg(short, long)]
    pub pgconnstr: Option<String>,

+    /// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
+    /// kernel's page cache), and therefore should not count against available memory.
+    //
+    // NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
+    // than a roundabout way, via whether it's on disk), but in order to be backwards compatible
+    // during the switch away from an in-memory file cache, we had to default to the previous
+    // behavior.
+    #[arg(long)]
+    pub file_cache_on_disk: bool,
+
    /// The address we should listen on for connection requests. For the
    /// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
    #[arg(short, long)]
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -156,7 +156,10 @@ impl Runner {
        // memory limits.
        if let Some(connstr) = &args.pgconnstr {
            info!("initializing file cache");
-            let config = FileCacheConfig::default();
+            let config = match args.file_cache_on_disk {
+                true => FileCacheConfig::default_on_disk(),
+                false => FileCacheConfig::default_in_memory(),
+            };

            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
                .await
@@ -184,7 +187,10 @@ impl Runner {
                info!("file cache size actually got set to {actual_size}")
            }

-            file_cache_disk_size = actual_size;
+            if args.file_cache_on_disk {
+                file_cache_disk_size = actual_size;
+            }
+
            state.filecache = Some(file_cache);
        }

@@ -233,11 +239,17 @@ impl Runner {

        let requested_mem = target.mem;
        let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
-        let expected_file_cache_size = self
+        let (expected_file_cache_size, expected_file_cache_disk_size) = self
            .filecache
            .as_ref()
-            .map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
-            .unwrap_or(0);
+            .map(|file_cache| {
+                let size = file_cache.config.calculate_cache_size(usable_system_memory);
+                match file_cache.config.in_memory {
+                    true => (size, 0),
+                    false => (size, size),
+                }
+            })
+            .unwrap_or((0, 0));
        if let Some(cgroup) = &self.cgroup {
            let (last_time, last_history) = *cgroup.watcher.borrow();

@@ -261,7 +273,7 @@ impl Runner {

            let new_threshold = self
                .config
-                .cgroup_threshold(usable_system_memory, expected_file_cache_size);
+                .cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);

            let current = last_history.avg_non_reclaimable;

@@ -288,10 +300,13 @@ impl Runner {
                .set_file_cache_size(expected_file_cache_size)
                .await
                .context("failed to set file cache size")?;
-            file_cache_disk_size = actual_usage;
+            if !file_cache.config.in_memory {
+                file_cache_disk_size = actual_usage;
+            }
            let message = format!(
-                "set file cache size to {} MiB",
+                "set file cache size to {} MiB (in memory = {})",
                bytes_to_mebibytes(actual_usage),
+                file_cache.config.in_memory,
            );
            info!("downscale: {message}");
            status.push(message);
@@ -342,7 +357,9 @@ impl Runner {
                .set_file_cache_size(expected_usage)
                .await
                .context("failed to set file cache size")?;
-            file_cache_disk_size = actual_usage;
+            if !file_cache.config.in_memory {
+                file_cache_disk_size = actual_usage;
+            }

            if actual_usage != expected_usage {
                warn!(
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -34,12 +34,11 @@ use postgres_backend::AuthType;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
-    auth::JwtAuth, logging, project_build_tag, project_git_version, sentry_init::init_sentry,
-    signals::Signal, tcp_listener,
+    auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal,
+    tcp_listener,
 };

 project_git_version!(GIT_VERSION);
-project_build_tag!(BUILD_TAG);

 const PID_FILE_NAME: &str = "pageserver.pid";

@@ -259,12 +258,11 @@ fn start_pageserver(
    // A changed version string indicates changed software.
    // A changed launch timestamp indicates a pageserver restart.
    info!(
-        "version: {} launch_timestamp: {} build_tag: {}",
+        "version: {} launch_timestamp: {}",
        version(),
-        launch_ts.to_string(),
-        BUILD_TAG,
+        launch_ts.to_string()
    );
-    set_build_info_metric(GIT_VERSION, BUILD_TAG);
+    set_build_info_metric(GIT_VERSION);
    set_launch_timestamp_metric(launch_ts);
    pageserver::preinitialize_metrics();

--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -33,7 +33,8 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{
-    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
+    TENANTS_SEGMENT_NAME, TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME,
+    TIMELINES_SEGMENT_NAME,
 };
 use crate::{
    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
@@ -632,6 +633,11 @@ impl PageServerConf {
        self.tenants_path().join(tenant_id.to_string())
    }

+    pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
+        self.tenant_path(tenant_id)
+            .join(TENANT_ATTACHING_MARKER_FILENAME)
+    }
+
    pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
    }
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -3,6 +3,7 @@ use anyhow::Context;
 use chrono::{DateTime, Utc};
 use consumption_metrics::EventType;
 use futures::stream::StreamExt;
+use serde_with::serde_as;
 use std::{sync::Arc, time::SystemTime};
 use utils::{
    id::{TenantId, TimelineId},
@@ -41,10 +42,13 @@ pub(super) enum Name {
 ///
 /// This is a denormalization done at the MetricsKey const methods; these should not be constructed
 /// elsewhere.
+#[serde_with::serde_as]
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
 pub(crate) struct MetricsKey {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub(super) tenant_id: TenantId,

+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub(super) timeline_id: Option<TimelineId>,

--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -1,4 +1,5 @@
 use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
+use serde_with::serde_as;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;

@@ -6,9 +7,12 @@ use super::{metrics::Name, Cache, MetricsKey, RawMetric};
 use utils::id::{TenantId, TimelineId};

 /// How the metrics from pageserver are identified.
+#[serde_with::serde_as]
 #[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)]
 struct Ids {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub(super) tenant_id: TenantId,
+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub(super) timeline_id: Option<TimelineId>,
 }
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -57,10 +57,7 @@ impl ControlPlaneClient {

        if let Some(jwt) = &conf.control_plane_api_token {
            let mut headers = hyper::HeaderMap::new();
-            headers.insert(
-                "Authorization",
-                format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
-            );
+            headers.insert("Authorization", jwt.get_contents().parse().unwrap());
            client = client.default_headers(headers);
        }

@@ -147,7 +144,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
        Ok(response
            .tenants
            .into_iter()
-            .map(|t| (t.id, Generation::new(t.gen)))
+            .map(|t| (t.id, Generation::new(t.generation)))
            .collect::<HashMap<_, _>>())
    }

--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -10,7 +10,6 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::remote_timeline_client::remote_timeline_path;
-use crate::virtual_file::MaybeFatalIo;
 use crate::virtual_file::VirtualFile;
 use anyhow::Context;
 use camino::Utf8PathBuf;
@@ -18,6 +17,7 @@ use hex::FromHex;
 use remote_storage::{GenericRemoteStorage, RemotePath};
 use serde::Deserialize;
 use serde::Serialize;
+use serde_with::serde_as;
 use thiserror::Error;
 use tokio;
 use tokio_util::sync::CancellationToken;
@@ -214,6 +214,7 @@ where
 /// during recovery as startup.
 const TEMP_SUFFIX: &str = "tmp";

+#[serde_as]
 #[derive(Debug, Serialize, Deserialize)]
 struct DeletionList {
    /// Serialization version, for future use
@@ -242,6 +243,7 @@ struct DeletionList {
    validated: bool,
 }

+#[serde_as]
 #[derive(Debug, Serialize, Deserialize)]
 struct DeletionHeader {
    /// Serialization version, for future use
@@ -269,9 +271,7 @@ impl DeletionHeader {
        let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
            .await
-            .maybe_fatal_err("save deletion header")?;
-
-        Ok(())
+            .map_err(Into::into)
    }
 }

@@ -360,7 +360,6 @@ impl DeletionList {
        let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
        VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
            .await
-            .maybe_fatal_err("save deletion list")
            .map_err(Into::into)
    }
 }
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -34,8 +34,6 @@ use crate::deletion_queue::TEMP_SUFFIX;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::storage_layer::LayerFileName;
-use crate::virtual_file::on_fatal_io_error;
-use crate::virtual_file::MaybeFatalIo;

 // The number of keys in a DeletionList before we will proactively persist it
 // (without reaching a flush deadline).  This aims to deliver objects of the order
@@ -197,7 +195,7 @@ impl ListWriter {
                    debug!("Deletion header {header_path} not found, first start?");
                    Ok(None)
                } else {
-                    on_fatal_io_error(&e, "reading deletion header");
+                    Err(anyhow::anyhow!(e))
                }
            }
        }
@@ -218,9 +216,16 @@ impl ListWriter {
        self.pending.sequence = validated_sequence + 1;

        let deletion_directory = self.conf.deletion_prefix();
-        let mut dir = tokio::fs::read_dir(&deletion_directory)
-            .await
-            .fatal_err("read deletion directory");
+        let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
+            Ok(d) => d,
+            Err(e) => {
+                warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");
+
+                // Give up: if we can't read the deletion list directory, we probably can't
+                // write lists into it later, so the queue won't work.
+                return Err(e.into());
+            }
+        };

        let list_name_pattern =
            Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
@@ -228,7 +233,7 @@ impl ListWriter {
        let temp_extension = format!(".{TEMP_SUFFIX}");
        let header_path = self.conf.deletion_header_path();
        let mut seqs: Vec<u64> = Vec::new();
-        while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") {
+        while let Some(dentry) = dir.next_entry().await? {
            let file_name = dentry.file_name();
            let dentry_str = file_name.to_string_lossy();

@@ -241,9 +246,11 @@ impl ListWriter {
                info!("Cleaning up temporary file {dentry_str}");
                let absolute_path =
                    deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
-                tokio::fs::remove_file(&absolute_path)
-                    .await
-                    .fatal_err("delete temp file");
+                if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
+                    // Non-fatal error: we will just leave the file behind, but will
+                    // not try to load it.
+                    warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
+                }

                continue;
            }
@@ -283,9 +290,7 @@ impl ListWriter {
        for s in seqs {
            let list_path = self.conf.deletion_list_path(s);

-            let list_bytes = tokio::fs::read(&list_path)
-                .await
-                .fatal_err("read deletion list");
+            let list_bytes = tokio::fs::read(&list_path).await?;

            let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
                Ok(l) => l,
@@ -344,7 +349,7 @@ impl ListWriter {
        info!("Started deletion frontend worker");

        // Synchronous, but we only do it once per process lifetime so it's tolerable
-        if let Err(e) = create_dir_all(self.conf.deletion_prefix()) {
+        if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
            tracing::error!(
                "Failed to create deletion list directory {}, deletions will not be executed ({e})",
                self.conf.deletion_prefix(),
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -28,7 +28,6 @@ use crate::config::PageServerConf;
 use crate::control_plane_client::ControlPlaneGenerationsApi;
 use crate::control_plane_client::RetryForeverError;
 use crate::metrics;
-use crate::virtual_file::MaybeFatalIo;

 use super::deleter::DeleterMessage;
 use super::DeletionHeader;
@@ -288,9 +287,16 @@ where
    async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
        for list_path in list_paths {
            debug!("Removing deletion list {list_path}");
-            tokio::fs::remove_file(&list_path)
-                .await
-                .fatal_err("remove deletion list");
+
+            if let Err(e) = tokio::fs::remove_file(&list_path).await {
+                // Unexpected: we should have permissions and nothing else should
+                // be touching these files.  We will leave the file behind.  Subsequent
+                // pageservers will try and load it again: hopefully whatever storage
+                // issue (probably permissions) has been fixed by then.
+                tracing::error!("Failed to delete {list_path}: {e:#}");
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
+                break;
+            }
        }
    }

--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -60,11 +60,7 @@ use utils::serde_percent::Percent;
 use crate::{
    config::PageServerConf,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-    tenant::{
-        self,
-        storage_layer::{AsLayerDesc, EvictionError, Layer},
-        Timeline,
-    },
+    tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
 };

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -112,7 +108,7 @@ pub fn launch_disk_usage_global_eviction_task(
                _ = background_jobs_barrier.wait() => { }
            };

-            disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel)
+            disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
                .await;
            Ok(())
        },
@@ -125,7 +121,7 @@ pub fn launch_disk_usage_global_eviction_task(
 async fn disk_usage_eviction_task(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    _storage: &GenericRemoteStorage,
+    storage: GenericRemoteStorage,
    tenants_dir: &Utf8Path,
    cancel: CancellationToken,
 ) {
@@ -149,8 +145,14 @@ async fn disk_usage_eviction_task(
        let start = Instant::now();

        async {
-            let res =
-                disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;
+            let res = disk_usage_eviction_task_iteration(
+                state,
+                task_config,
+                &storage,
+                tenants_dir,
+                &cancel,
+            )
+            .await;

            match res {
                Ok(()) => {}
@@ -181,12 +183,13 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
 async fn disk_usage_eviction_task_iteration(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
+    storage: &GenericRemoteStorage,
    tenants_dir: &Utf8Path,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
        .context("get filesystem-level disk usage before evictions")?;
-    let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
+    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
    match res {
        Ok(outcome) => {
            debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -270,6 +273,7 @@ struct LayerCount {

 pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    state: &State,
+    storage: &GenericRemoteStorage,
    usage_pre: U,
    cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
@@ -326,10 +330,9 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // If we get far enough in the list that we start to evict layers that are below
    // the tenant's min-resident-size threshold, print a warning, and memorize the disk
    // usage at that point, in 'usage_planned_min_resident_size_respecting'.
-    let mut batched: HashMap<_, Vec<_>> = HashMap::new();
+    let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
    let mut warned = None;
    let mut usage_planned = usage_pre;
-    let mut max_batch_size = 0;
    for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
        if !usage_planned.has_pressure() {
            debug!(
@@ -346,18 +349,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);

-        // FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
-        // tasks to evict all seen layers until we have evicted enough
-
-        let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
-
-        // semaphore will later be used to limit eviction concurrency, and we can express at
-        // most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
-        // but fail gracefully by not making batches larger.
-        if batch.len() < u32::MAX as usize {
-            batch.push(candidate.layer);
-            max_batch_size = max_batch_size.max(batch.len());
-        }
+        batched
+            .entry(TimelineKey(candidate.timeline))
+            .or_default()
+            .push(candidate.layer);
    }

    let usage_planned = match warned {
@@ -374,101 +369,69 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

    // phase2: evict victims batched by timeline

-    let mut js = tokio::task::JoinSet::new();
-
-    // ratelimit to 1k files or any higher max batch size
-    let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
-
+    // After the loop, `usage_assumed` is the post-eviction usage,
+    // according to internal accounting.
+    let mut usage_assumed = usage_pre;
+    let mut evictions_failed = LayerCount::default();
    for (timeline, batch) in batched {
        let tenant_id = timeline.tenant_id;
        let timeline_id = timeline.timeline_id;
-        let batch_size =
-            u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
-
-        // I dislike naming of `available_permits` but it means current total amount of permits
-        // because permits can be added
-        assert!(batch_size as usize <= limit.available_permits());
+        let batch_size = batch.len();

        debug!(%timeline_id, "evicting batch for timeline");

-        let evict = {
-            let limit = limit.clone();
-            let cancel = cancel.clone();
-            async move {
-                let mut evicted_bytes = 0;
-                let mut evictions_failed = LayerCount::default();
+        async {
+            let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;

-                let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
-                    // semaphore closing means cancelled
-                    return (evicted_bytes, evictions_failed);
-                };
-
-                let results = timeline.evict_layers(&batch, &cancel).await;
-
-                match results {
-                    Ok(results) => {
-                        assert_eq!(results.len(), batch.len());
-                        for (result, layer) in results.into_iter().zip(batch.iter()) {
-                            let file_size = layer.layer_desc().file_size;
-                            match result {
-                                Some(Ok(())) => {
-                                    evicted_bytes += file_size;
-                                }
-                                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
-                                    evictions_failed.file_sizes += file_size;
-                                    evictions_failed.count += 1;
-                                }
-                                None => {
-                                    assert!(cancel.is_cancelled());
-                                }
+            match results {
+                Err(e) => {
+                    warn!("failed to evict batch: {:#}", e);
+                }
+                Ok(results) => {
+                    assert_eq!(results.len(), batch.len());
+                    for (result, layer) in results.into_iter().zip(batch.iter()) {
+                        let file_size = layer.layer_desc().file_size;
+                        match result {
+                            Some(Ok(())) => {
+                                usage_assumed.add_available_bytes(file_size);
+                            }
+                            Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
+                                unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
+                            }
+                            Some(Err(EvictionError::FileNotFound)) => {
+                                evictions_failed.file_sizes += file_size;
+                                evictions_failed.count += 1;
+                            }
+                            Some(Err(
+                                e @ EvictionError::LayerNotFound(_)
+                                | e @ EvictionError::StatFailed(_),
+                            )) => {
+                                let e = utils::error::report_compact_sources(&e);
+                                warn!(%layer, "failed to evict layer: {e}");
+                                evictions_failed.file_sizes += file_size;
+                                evictions_failed.count += 1;
+                            }
+                            Some(Err(EvictionError::MetadataInconsistency(detail))) => {
+                                warn!(%layer, "failed to evict layer: {detail}");
+                                evictions_failed.file_sizes += file_size;
+                                evictions_failed.count += 1;
+                            }
+                            None => {
+                                assert!(cancel.is_cancelled());
+                                return;
                            }
                        }
                    }
-                    Err(e) => {
-                        warn!("failed to evict batch: {:#}", e);
-                    }
                }
-                (evicted_bytes, evictions_failed)
            }
        }
-        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));
+        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
+        .await;

-        js.spawn(evict);
-
-        // spwaning multiple thousands of these is essentially blocking, so give already spawned a
-        // chance of making progress
-        tokio::task::yield_now().await;
-    }
-
-    let join_all = async move {
-        // After the evictions, `usage_assumed` is the post-eviction usage,
-        // according to internal accounting.
-        let mut usage_assumed = usage_pre;
-        let mut evictions_failed = LayerCount::default();
-
-        while let Some(res) = js.join_next().await {
-            match res {
-                Ok((evicted_bytes, failed)) => {
-                    usage_assumed.add_available_bytes(evicted_bytes);
-                    evictions_failed.file_sizes += failed.file_sizes;
-                    evictions_failed.count += failed.count;
-                }
-                Err(je) if je.is_cancelled() => unreachable!("not used"),
-                Err(je) if je.is_panic() => { /* already logged */ }
-                Err(je) => tracing::error!("unknown JoinError: {je:?}"),
-            }
-        }
-        (usage_assumed, evictions_failed)
-    };
-
-    let (usage_assumed, evictions_failed) = tokio::select! {
-        tuple = join_all => { tuple },
-        _ = cancel.cancelled() => {
-            // close the semaphore to stop any pending acquires
-            limit.close();
+        if cancel.is_cancelled() {
            return Ok(IterationOutcome::Cancelled);
        }
-    };
+    }

    Ok(IterationOutcome::Finished(IterationOutcomeFinished {
        before: usage_pre,
@@ -483,7 +446,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 #[derive(Clone)]
 struct EvictionCandidate {
    timeline: Arc<Timeline>,
-    layer: Layer,
+    layer: Arc<dyn PersistentLayer>,
    last_activity_ts: SystemTime,
 }

--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -392,19 +392,13 @@ paths:
            type: string
            format: date-time
          description: A timestamp to get the LSN
-        - name: version
-          in: query
-          required: false
-          schema:
-            type: integer
-          description: The version of the endpoint to use
      responses:
        "200":
          description: OK
          content:
            application/json:
              schema:
-                $ref: "#/components/schemas/LsnByTimestampResponse"
+                type: string
        "400":
          description: Error when no tenant id found in path, no timeline id or invalid timestamp
          content:
@@ -569,17 +563,7 @@ paths:
              schema:
                $ref: "#/components/schemas/NotFoundError"
        "409":
-          description: |
-            The tenant is already known to Pageserver in some way,
-            and hence this `/attach` call has been rejected.
-
-            Some examples of how this can happen:
-            - tenant was created on this pageserver
-            - tenant attachment was started by an earlier call to `/attach`.
-
-            Callers should poll the tenant status's `attachment_status` field,
-            like for status 202. See the longer description for `POST /attach`
-            for details.
+          description: Tenant download is already in progress
          content:
            application/json:
              schema:
@@ -723,12 +707,6 @@ paths:

        Errors if the tenant is absent on disk, already present in memory or fails to schedule its load.
        Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness.
-      requestBody:
-        required: false
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/TenantLoadRequest"
      responses:
        "202":
          description: Tenant scheduled to load successfully
@@ -1219,15 +1197,6 @@ components:
            new_tenant_id:
              type: string
              format: hex
-            generation:
-              type: integer
-              description: Attachment generation number.
-    TenantLoadRequest:
-      type: object
-      properties:
-        generation:
-          type: integer
-          description: Attachment generation number.
    TenantAttachRequest:
      type: object
      required:
@@ -1415,19 +1384,6 @@ components:
          type: string
          format: hex

-    LsnByTimestampResponse:
-      type: object
-      required:
-        - lsn
-        - kind
-      properties:
-        lsn:
-          type: string
-          format: hex
-        kind:
-          type: string
-          enum: [past, present, future, nodata]
-
    Error:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -8,7 +8,7 @@ use std::sync::Arc;
 use anyhow::{anyhow, Context, Result};
 use futures::TryFutureExt;
 use humantime::format_rfc3339;
-use hyper::header;
+use hyper::header::CONTENT_TYPE;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
@@ -484,8 +484,6 @@ async fn get_lsn_by_timestamp_handler(
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let version: Option<u8> = parse_query_param(&request, "version")?;
-
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let timestamp_raw = must_get_query_param(&request, "timestamp")?;
    let timestamp = humantime::parse_rfc3339(&timestamp_raw)
@@ -497,30 +495,13 @@ async fn get_lsn_by_timestamp_handler(
    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
    let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;

-    if version.unwrap_or(0) > 1 {
-        #[derive(serde::Serialize)]
-        struct Result {
-            lsn: Lsn,
-            kind: &'static str,
-        }
-        let (lsn, kind) = match result {
-            LsnForTimestamp::Present(lsn) => (lsn, "present"),
-            LsnForTimestamp::Future(lsn) => (lsn, "future"),
-            LsnForTimestamp::Past(lsn) => (lsn, "past"),
-            LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
-        };
-        json_response(StatusCode::OK, Result { lsn, kind })
-    } else {
-        // FIXME: this is a temporary crutch not to break backwards compatibility
-        // See https://github.com/neondatabase/neon/pull/5608
-        let result = match result {
-            LsnForTimestamp::Present(lsn) => format!("{lsn}"),
-            LsnForTimestamp::Future(_lsn) => "future".into(),
-            LsnForTimestamp::Past(_lsn) => "past".into(),
-            LsnForTimestamp::NoData(_lsn) => "nodata".into(),
-        };
-        json_response(StatusCode::OK, result)
-    }
+    let result = match result {
+        LsnForTimestamp::Present(lsn) => format!("{lsn}"),
+        LsnForTimestamp::Future(_lsn) => "future".into(),
+        LsnForTimestamp::Past(_lsn) => "past".into(),
+        LsnForTimestamp::NoData(_lsn) => "nodata".into(),
+    };
+    json_response(StatusCode::OK, result)
 }

 async fn get_timestamp_of_lsn_handler(
@@ -786,10 +767,6 @@ async fn tenant_size_handler(
        .map_err(ApiError::InternalServerError)?;

    let mut sizes = None;
-    let accepts_html = headers
-        .get(header::ACCEPT)
-        .map(|v| v == "text/html")
-        .unwrap_or_default();
    if !inputs_only.unwrap_or(false) {
        let storage_model = inputs
            .calculate_model()
@@ -797,19 +774,21 @@ async fn tenant_size_handler(
        let size = storage_model.calculate();

        // If request header expects html, return html
-        if accepts_html {
+        if headers["Accept"] == "text/html" {
            return synthetic_size_html_response(inputs, storage_model, size);
        }
        sizes = Some(size);
-    } else if accepts_html {
+    } else if headers["Accept"] == "text/html" {
        return Err(ApiError::BadRequest(anyhow!(
            "inputs_only parameter is incompatible with html output request"
        )));
    }

    /// The type resides in the pageserver not to expose `ModelInputs`.
+    #[serde_with::serde_as]
    #[derive(serde::Serialize)]
    struct TenantHistorySize {
+        #[serde_as(as = "serde_with::DisplayFromStr")]
        id: TenantId,
        /// Size is a mixture of WAL and logical size, so the unit is bytes.
        ///
@@ -950,7 +929,7 @@ fn synthetic_size_html_response(
 pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> {
    let response = Response::builder()
        .status(status)
-        .header(header::CONTENT_TYPE, "text/html")
+        .header(hyper::header::CONTENT_TYPE, "text/html")
        .body(Body::from(data.as_bytes().to_vec()))
        .map_err(|e| ApiError::InternalServerError(e.into()))?;
    Ok(response)
@@ -1200,7 +1179,7 @@ async fn timeline_compact_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
-            .map_err(|e| ApiError::InternalServerError(e.into()))?;
+            .map_err(ApiError::InternalServerError)?;
        json_response(StatusCode::OK, ())
    }
    .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
@@ -1225,7 +1204,7 @@ async fn timeline_checkpoint_handler(
        timeline
            .compact(&cancel, &ctx)
            .await
-            .map_err(|e| ApiError::InternalServerError(e.into()))?;
+            .map_err(ApiError::InternalServerError)?;

        json_response(StatusCode::OK, ())
    }
@@ -1331,7 +1310,7 @@ async fn getpage_at_lsn_handler(
        Result::<_, ApiError>::Ok(
            Response::builder()
                .status(StatusCode::OK)
-                .header(header::CONTENT_TYPE, "application/octet-stream")
+                .header(CONTENT_TYPE, "application/octet-stream")
                .body(hyper::Body::from(page))
                .unwrap(),
        )
@@ -1495,11 +1474,11 @@ async fn disk_usage_eviction_run(

    let state = get_state(&r);

-    if state.remote_storage.as_ref().is_none() {
+    let Some(storage) = state.remote_storage.clone() else {
        return Err(ApiError::InternalServerError(anyhow::anyhow!(
            "remote storage not configured, cannot run eviction iteration"
        )));
-    }
+    };

    let state = state.disk_usage_eviction_state.clone();

@@ -1517,6 +1496,7 @@ async fn disk_usage_eviction_run(
        async move {
            let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
                &state,
+                &storage,
                usage,
                &child_cancel,
            )
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -149,10 +149,6 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
    }
 }

-// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid
-// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
-// from the name.
-
 pub fn is_uninit_mark(path: &Utf8Path) -> bool {
    ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1388,23 +1388,28 @@ impl TimelineMetrics {
        }
    }

-    pub(crate) fn record_new_file_metrics(&self, sz: u64) {
+    pub fn record_new_file_metrics(&self, sz: u64) {
        self.resident_physical_size_add(sz);
        self.num_persistent_files_created.inc_by(1);
        self.persistent_bytes_written.inc_by(sz);
    }

-    pub(crate) fn resident_physical_size_sub(&self, sz: u64) {
+    pub fn resident_physical_size_sub(&self, sz: u64) {
        self.resident_physical_size_gauge.sub(sz);
        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
    }

-    pub(crate) fn resident_physical_size_add(&self, sz: u64) {
+    pub fn resident_physical_size_add(&self, sz: u64) {
        self.resident_physical_size_gauge.add(sz);
        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
    }

-    pub(crate) fn resident_physical_size_get(&self) -> u64 {
+    pub fn resident_physical_size_set(&self, sz: u64) {
+        self.resident_physical_size_gauge.set(sz);
+        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
+    }
+
+    pub fn resident_physical_size_get(&self) -> u64 {
        self.resident_physical_size_gauge.get()
    }
 }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -552,8 +552,7 @@ impl Timeline {
                Err(e) => Err(PageReconstructError::from(e)),
            },
            Err(e) => {
-                // This is expected: historical databases do not have the key.
-                debug!("Failed to get info about AUX files: {}", e);
+                warn!("Failed to get info about AUX files: {}", e);
                Ok(HashMap::new())
            }
        }
@@ -676,9 +675,8 @@ impl Timeline {

        result.add_key(CONTROLFILE_KEY);
        result.add_key(CHECKPOINT_KEY);
-        if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
-            result.add_key(AUX_FILES_KEY);
-        }
+        result.add_key(AUX_FILES_KEY);
+
        Ok(result.to_keyspace())
    }

@@ -1203,8 +1201,7 @@ impl<'a> DatadirModification<'a> {
        let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
            Ok(buf) => AuxFilesDirectory::des(&buf)?,
            Err(e) => {
-                // This is expected: historical databases do not have the key.
-                debug!("Failed to get info about AUX files: {}", e);
+                warn!("Failed to get info about AUX files: {}", e);
                AuxFilesDirectory {
                    files: HashMap::new(),
                }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -3,10 +3,10 @@ use std::sync::Arc;
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::models::TenantState;
-use remote_storage::{GenericRemoteStorage, RemotePath};
+use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
 use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
-use tracing::{error, instrument, warn, Instrument, Span};
+use tracing::{error, info, instrument, warn, Instrument, Span};

 use utils::{
    backoff, completion, crashsafe, fs_ext,
@@ -25,9 +25,11 @@ use super::{
    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
    span,
    timeline::delete::DeleteTimelineFlow,
-    tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
+    tree_sort_timelines, DeleteTimelineError, Tenant,
 };

+const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum DeleteTenantError {
    #[error("GetTenant {0}")]
@@ -58,7 +60,7 @@ fn remote_tenant_delete_mark_path(
        .context("Failed to strip workdir prefix")
        .and_then(RemotePath::new)
        .context("tenant path")?;
-    Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
+    Ok(tenant_remote_path.join(Utf8Path::new("deleted")))
 }

 async fn create_remote_delete_mark(
@@ -148,8 +150,7 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
    // Assert timelines dir is empty.
    if !fs_ext::is_directory_empty(timelines_path).await? {
        // Display first 10 items in directory
-        let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?;
-        let list = &list.into_iter().take(10).collect::<Vec<_>>();
+        let list = &fs_ext::list_dir(timelines_path).await.context("list_dir")?[..10];
        return Err(DeleteTenantError::Other(anyhow::anyhow!(
            "Timelines directory is not empty after all timelines deletion: {list:?}"
        )));
@@ -238,6 +239,32 @@ async fn cleanup_remaining_fs_traces(
    Ok(())
 }

+pub(crate) async fn remote_delete_mark_exists(
+    conf: &PageServerConf,
+    tenant_id: &TenantId,
+    remote_storage: &GenericRemoteStorage,
+) -> anyhow::Result<bool> {
+    // If remote storage is there we rely on it
+    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id).context("path")?;
+
+    let result = backoff::retry(
+        || async { remote_storage.download(&remote_mark_path).await },
+        |e| matches!(e, DownloadError::NotFound),
+        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
+        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
+        "fetch_tenant_deletion_mark",
+        // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+        backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+    )
+    .await;
+
+    match result {
+        Ok(_) => Ok(true),
+        Err(DownloadError::NotFound) => Ok(false),
+        Err(e) => Err(anyhow::anyhow!(e)).context("remote_delete_mark_exists")?,
+    }
+}
+
 /// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
 /// and deletes its data from both disk and s3.
 /// The sequence of steps:
@@ -249,9 +276,10 @@ async fn cleanup_remaining_fs_traces(
 /// 6. Remove remote mark
 /// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
 /// It is resumable from any step in case a crash/restart occurs.
-/// There are two entrypoints to the process:
+/// There are three entrypoints to the process:
 /// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
-/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
+/// 2. [`DeleteTenantFlow::resume_from_load`] is called during restarts when local or remote deletion marks are still there.
+/// 3. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed because the tenant is found to be deleted during the attach process.
 ///  Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
 #[derive(Default)]
 pub enum DeleteTenantFlow {
@@ -350,7 +378,7 @@ impl DeleteTenantFlow {

    pub(crate) async fn should_resume_deletion(
        conf: &'static PageServerConf,
-        remote_mark_exists: bool,
+        remote_storage: Option<&GenericRemoteStorage>,
        tenant: &Tenant,
    ) -> Result<Option<DeletionGuard>, DeleteTenantError> {
        let acquire = |t: &Tenant| {
@@ -361,25 +389,66 @@ impl DeleteTenantFlow {
            )
        };

-        if remote_mark_exists {
-            return Ok(acquire(tenant));
-        }
-
        let tenant_id = tenant.tenant_id;
        // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
        if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
+            return Ok(acquire(tenant));
+        }
+
+        let remote_storage = match remote_storage {
+            Some(remote_storage) => remote_storage,
+            None => return Ok(None),
+        };
+
+        if remote_delete_mark_exists(conf, &tenant_id, remote_storage).await? {
            Ok(acquire(tenant))
        } else {
            Ok(None)
        }
    }

+    pub(crate) async fn resume_from_load(
+        guard: DeletionGuard,
+        tenant: &Arc<Tenant>,
+        init_order: Option<&InitializationOrder>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        ctx: &RequestContext,
+    ) -> Result<(), DeleteTenantError> {
+        let (_, progress) = completion::channel();
+
+        tenant
+            .set_stopping(progress, true, false)
+            .await
+            .expect("cant be stopping or broken");
+
+        // Do not consume valuable resources during the load phase, continue deletion once init phase is complete.
+        let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
+        if let Some(background) = background_jobs_can_start {
+            info!("waiting for backgound jobs barrier");
+            background.clone().wait().await;
+            info!("ready for backgound jobs barrier");
+        }
+
+        // Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e.g. remove timelines dir)
+        let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
+        if timelines_path.exists() {
+            tenant.load(init_order, None, ctx).await.context("load")?;
+        }
+
+        Self::background(
+            guard,
+            tenant.conf,
+            tenant.remote_storage.clone(),
+            tenants,
+            tenant,
+        )
+        .await
+    }
+
    pub(crate) async fn resume_from_attach(
        guard: DeletionGuard,
        tenant: &Arc<Tenant>,
-        preload: Option<TenantPreload>,
        tenants: &'static tokio::sync::RwLock<TenantsMap>,
-        init_order: Option<InitializationOrder>,
        ctx: &RequestContext,
    ) -> Result<(), DeleteTenantError> {
        let (_, progress) = completion::channel();
@@ -390,7 +459,7 @@ impl DeleteTenantFlow {
            .expect("cant be stopping or broken");

        tenant
-            .attach(init_order, preload, ctx)
+            .attach(ctx, super::AttachMarkerMode::Expect)
            .await
            .context("attach")?;

--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -639,10 +639,147 @@ impl LayerMap {
        }

        println!("historic_layers:");
-        for desc in self.iter_historic_layers() {
-            desc.dump();
+        for layer in self.iter_historic_layers() {
+            layer.dump(verbose, ctx)?;
        }
        println!("End dump LayerMap");
        Ok(())
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::LayerMap;
+    use crate::tenant::storage_layer::LayerFileName;
+    use std::str::FromStr;
+    use std::sync::Arc;
+
+    mod l0_delta_layers_updated {
+
+        use crate::tenant::{
+            storage_layer::{AsLayerDesc, PersistentLayerDesc},
+            timeline::layer_manager::LayerFileManager,
+        };
+
+        use super::*;
+
+        struct LayerObject(PersistentLayerDesc);
+
+        impl AsLayerDesc for LayerObject {
+            fn layer_desc(&self) -> &PersistentLayerDesc {
+                &self.0
+            }
+        }
+
+        impl LayerObject {
+            fn new(desc: PersistentLayerDesc) -> Self {
+                LayerObject(desc)
+            }
+        }
+
+        type TestLayerFileManager = LayerFileManager<LayerObject>;
+
+        #[test]
+        fn for_full_range_delta() {
+            // l0_delta_layers are used by compaction, and should observe all buffered updates
+            l0_delta_layers_updated_scenario(
+                 "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69",
+                 true
+             )
+        }
+
+        #[test]
+        fn for_non_full_range_delta() {
+            // has minimal uncovered areas compared to for_full_range_delta
+            l0_delta_layers_updated_scenario(
+                 "000000000000000000000000000000000001-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE__0000000053423C21-0000000053424D69",
+                 // because not full range
+                 false
+             )
+        }
+
+        #[test]
+        fn for_image() {
+            l0_delta_layers_updated_scenario(
+                 "000000000000000000000000000000000000-000000000000000000000000000000010000__0000000053424D69",
+                 // code only checks if it is a full range layer, doesn't care about images, which must
+                 // mean we should in practice never have full range images
+                 false
+             )
+        }
+
+        #[test]
+        fn replacing_missing_l0_is_notfound() {
+            // original impl had an oversight, and L0 was an anyhow::Error. anyhow::Error should
+            // however only happen for precondition failures.
+
+            let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
+            let layer = LayerFileName::from_str(layer).unwrap();
+            let layer = PersistentLayerDesc::from(layer);
+
+            // same skeleton construction; see scenario below
+            let not_found = Arc::new(LayerObject::new(layer.clone()));
+            let new_version = Arc::new(LayerObject::new(layer));
+
+            // after the immutable storage state refactor, the replace operation
+            // will not use layer map any more. We keep it here for consistency in test cases
+            // and can remove it in the future.
+            let _map = LayerMap::default();
+
+            let mut mapping = TestLayerFileManager::new();
+
+            mapping
+                .replace_and_verify(not_found, new_version)
+                .unwrap_err();
+        }
+
+        fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
+            let name = LayerFileName::from_str(layer_name).unwrap();
+            let skeleton = PersistentLayerDesc::from(name);
+
+            let remote = Arc::new(LayerObject::new(skeleton.clone()));
+            let downloaded = Arc::new(LayerObject::new(skeleton));
+
+            let mut map = LayerMap::default();
+            let mut mapping = LayerFileManager::new();
+
+            // two disjoint Arcs in different lifecycle phases. even if it seems they must be the
+            // same layer, we use LayerMap::compare_arced_layers as the identity of layers.
+            assert_eq!(remote.layer_desc(), downloaded.layer_desc());
+
+            let expected_in_counts = (1, usize::from(expected_l0));
+
+            map.batch_update()
+                .insert_historic(remote.layer_desc().clone());
+            mapping.insert(remote.clone());
+            assert_eq!(
+                count_layer_in(&map, remote.layer_desc()),
+                expected_in_counts
+            );
+
+            mapping
+                .replace_and_verify(remote, downloaded.clone())
+                .expect("name derived attributes are the same");
+            assert_eq!(
+                count_layer_in(&map, downloaded.layer_desc()),
+                expected_in_counts
+            );
+
+            map.batch_update().remove_historic(downloaded.layer_desc());
+            assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
+        }
+
+        fn count_layer_in(map: &LayerMap, layer: &PersistentLayerDesc) -> (usize, usize) {
+            let historic = map
+                .iter_historic_layers()
+                .filter(|x| x.key() == layer.key())
+                .count();
+            let l0s = map
+                .get_level0_deltas()
+                .expect("why does this return a result");
+            let l0 = l0s.iter().filter(|x| x.key() == layer.key()).count();
+
+            (historic, l0)
+        }
+    }
+}
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -406,123 +406,4 @@ mod tests {
            METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION
        );
    }
-
-    #[test]
-    fn test_metadata_bincode_serde() {
-        let original_metadata = TimelineMetadata::new(
-            Lsn(0x200),
-            Some(Lsn(0x100)),
-            Some(TIMELINE_ID),
-            Lsn(0),
-            Lsn(0),
-            Lsn(0),
-            // Any version will do here, so use the default
-            crate::DEFAULT_PG_VERSION,
-        );
-        let metadata_bytes = original_metadata
-            .to_bytes()
-            .expect("Cannot create bytes array from metadata");
-
-        let metadata_bincode_be_bytes = original_metadata
-            .ser()
-            .expect("Cannot serialize the metadata");
-
-        // 8 bytes for the length of the vector
-        assert_eq!(metadata_bincode_be_bytes.len(), 8 + metadata_bytes.len());
-
-        let expected_bincode_bytes = {
-            let mut temp = vec![];
-            let len_bytes = metadata_bytes.len().to_be_bytes();
-            temp.extend_from_slice(&len_bytes);
-            temp.extend_from_slice(&metadata_bytes);
-            temp
-        };
-        assert_eq!(metadata_bincode_be_bytes, expected_bincode_bytes);
-
-        let deserialized_metadata = TimelineMetadata::des(&metadata_bincode_be_bytes).unwrap();
-        // Deserialized metadata has the metadata header, which is different from the serialized one.
-        //   Reference: TimelineMetaData::to_bytes()
-        let expected_metadata = {
-            let mut temp_metadata = original_metadata;
-            let body_bytes = temp_metadata
-                .body
-                .ser()
-                .expect("Cannot serialize the metadata body");
-            let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
-            let hdr = TimelineMetadataHeader {
-                size: metadata_size as u16,
-                format_version: METADATA_FORMAT_VERSION,
-                checksum: crc32c::crc32c(&body_bytes),
-            };
-            temp_metadata.hdr = hdr;
-            temp_metadata
-        };
-        assert_eq!(deserialized_metadata, expected_metadata);
-    }
-
-    #[test]
-    fn test_metadata_bincode_serde_ensure_roundtrip() {
-        let original_metadata = TimelineMetadata::new(
-            Lsn(0x200),
-            Some(Lsn(0x100)),
-            Some(TIMELINE_ID),
-            Lsn(0),
-            Lsn(0),
-            Lsn(0),
-            // Any version will do here, so use the default
-            crate::DEFAULT_PG_VERSION,
-        );
-        let expected_bytes = vec![
-            /* bincode length encoding bytes */
-            0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector
-            /* TimelineMetadataHeader */
-            4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
-            /* TimelineMetadataBodyV2 */
-            0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes)
-            1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes)
-            1, 17, 34, 51, 68, 85, 102, 119, 136, 17, 34, 51, 68, 85, 102, 119,
-            136, // ancestor_timeline (17 bytes)
-            0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes)
-            0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes)
-            0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes)
-            0, 0, 0, 15, // pg_version (4 bytes)
-            /* padding bytes */
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0,
-        ];
-        let metadata_ser_bytes = original_metadata.ser().unwrap();
-        assert_eq!(metadata_ser_bytes, expected_bytes);
-
-        let expected_metadata = {
-            let mut temp_metadata = original_metadata;
-            let body_bytes = temp_metadata
-                .body
-                .ser()
-                .expect("Cannot serialize the metadata body");
-            let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
-            let hdr = TimelineMetadataHeader {
-                size: metadata_size as u16,
-                format_version: METADATA_FORMAT_VERSION,
-                checksum: crc32c::crc32c(&body_bytes),
-            };
-            temp_metadata.hdr = hdr;
-            temp_metadata
-        };
-        let des_metadata = TimelineMetadata::des(&metadata_ser_bytes).unwrap();
-        assert_eq!(des_metadata, expected_metadata);
-    }
 }
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -26,7 +26,10 @@ use crate::deletion_queue::DeletionQueueClient;
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
 use crate::tenant::delete::DeleteTenantFlow;
-use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState};
+use crate::tenant::{
+    create_tenant_files, AttachMarkerMode, AttachedTenantConf, CreateTenantFilesMode, Tenant,
+    TenantState,
+};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};

 use utils::crashsafe::path_with_suffix_extension;
@@ -434,15 +437,14 @@ pub async fn init_tenant_mgr(
        location_conf.attach_in_generation(generation);
        Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;

-        match tenant_spawn(
+        match schedule_local_tenant_processing(
            conf,
            tenant_id,
            &tenant_dir_path,
-            resources.clone(),
            AttachedTenantConf::try_from(location_conf)?,
+            resources.clone(),
            Some(init_order.clone()),
            &TENANTS,
-            SpawnMode::Normal,
            &ctx,
        ) {
            Ok(tenant) => {
@@ -462,18 +464,15 @@ pub async fn init_tenant_mgr(
    Ok(())
 }

-/// Wrapper for Tenant::spawn that checks invariants before running, and inserts
-/// a broken tenant in the map if Tenant::spawn fails.
 #[allow(clippy::too_many_arguments)]
-pub(crate) fn tenant_spawn(
+pub(crate) fn schedule_local_tenant_processing(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
    tenant_path: &Utf8Path,
-    resources: TenantSharedResources,
    location_conf: AttachedTenantConf,
+    resources: TenantSharedResources,
    init_order: Option<InitializationOrder>,
    tenants: &'static tokio::sync::RwLock<TenantsMap>,
-    mode: SpawnMode,
    ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
    anyhow::ensure!(
@@ -497,24 +496,45 @@ pub(crate) fn tenant_spawn(
        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
    );

-    info!("Attaching tenant {tenant_id}");
-    let tenant = match Tenant::spawn(
-        conf,
-        tenant_id,
-        resources,
-        location_conf,
-        init_order,
-        tenants,
-        mode,
-        ctx,
-    ) {
-        Ok(tenant) => tenant,
-        Err(e) => {
-            error!("Failed to spawn tenant {tenant_id}, reason: {e:#}");
-            Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
+    let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
+        info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
+        if resources.remote_storage.is_none() {
+            warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
+            Tenant::create_broken_tenant(
+                conf,
+                tenant_id,
+                "attaching mark file present but no remote storage configured".to_string(),
+            )
+        } else {
+            match Tenant::spawn_attach(
+                conf,
+                tenant_id,
+                resources,
+                location_conf,
+                tenants,
+                AttachMarkerMode::Expect,
+                ctx,
+            ) {
+                Ok(tenant) => tenant,
+                Err(e) => {
+                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
+                    Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
+                }
+            }
        }
+    } else {
+        info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
+        // Start loading the tenant into memory. It will initially be in Loading state.
+        Tenant::spawn_load(
+            conf,
+            tenant_id,
+            location_conf,
+            resources,
+            init_order,
+            tenants,
+            ctx,
+        )
    };
-
    Ok(tenant)
 }

@@ -650,41 +670,29 @@ pub(crate) async fn create_tenant(
    ctx: &RequestContext,
 ) -> Result<Arc<Tenant>, TenantMapInsertError> {
    tenant_map_insert(tenant_id, || async {
+
        let location_conf = LocationConf::attached_single(tenant_conf, generation);

        // We're holding the tenants lock in write mode while doing local IO.
        // If this section ever becomes contentious, introduce a new `TenantState::Creating`
        // and do the work in that state.
-        super::create_tenant_files(conf, &location_conf, &tenant_id).await?;
-
+        let tenant_directory = super::create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Create).await?;
        // TODO: tenant directory remains on disk if we bail out from here on.
        //       See https://github.com/neondatabase/neon/issues/4233

-        let tenant_path = conf.tenant_path(&tenant_id);
-
-        let created_tenant = tenant_spawn(
-            conf,
-            tenant_id,
-            &tenant_path,
-            resources,
-            AttachedTenantConf::try_from(location_conf)?,
-            None,
-            &TENANTS,
-            SpawnMode::Create,
-            ctx,
-        )?;
+        let created_tenant =
+            schedule_local_tenant_processing(conf, tenant_id, &tenant_directory,
+                AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

        let crated_tenant_id = created_tenant.tenant_id();
        anyhow::ensure!(
-            tenant_id == crated_tenant_id,
-            "loaded created tenant has unexpected tenant id \
-                (expect {tenant_id} != actual {crated_tenant_id})",
-        );
+                tenant_id == crated_tenant_id,
+                "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})",
+            );
        Ok(created_tenant)
-    })
-    .await
+    }).await
 }

 #[derive(Debug, thiserror::Error)]
@@ -793,10 +801,9 @@ pub(crate) async fn upsert_location(
                }
            }

-            let tenant_path = conf.tenant_path(&tenant_id);
-
            let new_slot = match &new_location_config.mode {
                LocationMode::Secondary(_) => {
+                    let tenant_path = conf.tenant_path(&tenant_id);
                    // Directory doesn't need to be fsync'd because if we crash it can
                    // safely be recreated next time this tenant location is configured.
                    unsafe_create_dir_all(&tenant_path)
@@ -826,21 +833,28 @@ pub(crate) async fn upsert_location(
                        .await
                        .map_err(SetNewTenantConfigError::Persist)?;

-                    let tenant = tenant_spawn(
+                    let tenant = match Tenant::spawn_attach(
                        conf,
                        tenant_id,
-                        &tenant_path,
                        TenantSharedResources {
                            broker_client,
                            remote_storage,
                            deletion_queue_client,
                        },
                        AttachedTenantConf::try_from(new_location_config)?,
-                        None,
                        &TENANTS,
-                        SpawnMode::Normal,
+                        // The LocationConf API does not use marker files, because we have Secondary
+                        // locations where the directory's existence is not a signal that it contains
+                        // all timelines.  See https://github.com/neondatabase/neon/issues/5550
+                        AttachMarkerMode::Ignore,
                        ctx,
-                    )?;
+                    ) {
+                        Ok(tenant) => tenant,
+                        Err(e) => {
+                            error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
+                            Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
+                        }
+                    };

                    TenantSlot::Attached(tenant)
                }
@@ -1029,7 +1043,7 @@ pub(crate) async fn load_tenant(
        location_conf.attach_in_generation(generation);
        Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;

-        let new_tenant = tenant_spawn(conf, tenant_id, &tenant_path, resources, AttachedTenantConf::try_from(location_conf)?, None,  &TENANTS, SpawnMode::Normal, ctx)
+        let new_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_path, AttachedTenantConf::try_from(location_conf)?, resources, None,  &TENANTS, ctx)
            .with_context(|| {
                format!("Failed to schedule tenant processing in path {tenant_path:?}")
            })?;
@@ -1103,12 +1117,18 @@ pub(crate) async fn attach_tenant(
 ) -> Result<(), TenantMapInsertError> {
    tenant_map_insert(tenant_id, || async {
        let location_conf = LocationConf::attached_single(tenant_conf, generation);
-        let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id).await?;
+        let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id, CreateTenantFilesMode::Attach).await?;
        // TODO: tenant directory remains on disk if we bail out from here on.
        //       See https://github.com/neondatabase/neon/issues/4233

-        let attached_tenant = tenant_spawn(conf, tenant_id, &tenant_dir,
-            resources, AttachedTenantConf::try_from(location_conf)?, None, &TENANTS, SpawnMode::Normal, ctx)?;
+        // Without the attach marker, schedule_local_tenant_processing will treat the attached tenant as fully attached
+        let marker_file_exists = conf
+            .tenant_attaching_mark_file_path(&tenant_id)
+            .try_exists()
+            .context("check for attach marker file existence")?;
+        anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
+
+        let attached_tenant = schedule_local_tenant_processing(conf, tenant_id, &tenant_dir, AttachedTenantConf::try_from(location_conf)?, resources, None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

--- a/pageserver/src/tenant/par_fsync.rs
+++ b/pageserver/src/tenant/par_fsync.rs
@@ -57,7 +57,8 @@ pub fn par_fsync(paths: &[Utf8PathBuf]) -> io::Result<()> {
    fsync_in_thread_pool(paths)
 }

-/// Parallel fsync asynchronously.
+/// Parallel fsync asynchronously. If the number of files is less than PARALLEL_PATH_THRESHOLD, fsync is done in the current
+/// execution thread. Otherwise, we will spawn_blocking and run it in tokio.
 pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> {
    const MAX_CONCURRENT_FSYNC: usize = 64;
    let mut next = paths.iter().peekable();
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -167,15 +167,39 @@
 //!   - download their remote [`IndexPart`]s
 //!   - create `Timeline` struct and a `RemoteTimelineClient`
 //!   - initialize the client's upload queue with its `IndexPart`
+//!   - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
+//!     for layers that are referenced by `IndexPart` but not present locally
 //!   - schedule uploads for layers that are only present locally.
+//!   - if the remote `IndexPart`'s metadata was newer than the metadata in
+//!     the local filesystem, write the remote metadata to the local filesystem
 //! - After the above is done for each timeline, open the tenant for business by
 //!   transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
 //!   This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
 //!
+//! We keep track of the fact that a client is in `Attaching` state in a marker
+//! file on the local disk. This is critical because, when we restart the pageserver,
+//! we do not want to do the `List timelines` step for each tenant that has already
+//! been successfully attached (for performance & cost reasons).
+//! Instead, for a tenant without the attach marker file, we assume that the
+//! local state is in sync or ahead of the remote state. This includes the list
+//! of all of the tenant's timelines, which is particularly critical to be up-to-date:
+//! if there's a timeline on the remote that the pageserver doesn't know about,
+//! the GC will not consider its branch point, leading to data loss.
+//! So, for a tenant with the attach marker file, we know that we have not yet
+//! persisted all the remote timelines' metadata files locally. To exclude the
+//! risk above, we re-run the procedure for such tenants.
+//!
 //! # Operating Without Remote Storage
 //!
 //! If no remote storage configuration is provided, the [`RemoteTimelineClient`] is
 //! not created and the uploads are skipped.
+//! Theoretically, it should be ok to remove and re-add remote storage configuration to
+//! the pageserver config at any time, since it doesn't make a difference to
+//! [`Timeline::load_layer_map`].
+//! Of course, the remote timeline dir must not change while we have de-configured
+//! remote storage, i.e., the pageserver must remain the owner of the given prefix
+//! in remote storage.
+//! But note that we don't test any of this right now.
 //!
 //! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
 //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
@@ -187,7 +211,8 @@ mod upload;
 use anyhow::Context;
 use camino::Utf8Path;
 use chrono::{NaiveDateTime, Utc};
-
+// re-export these
+pub use download::{is_temp_download_file, list_remote_timelines};
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
 use utils::backoff::{
@@ -212,7 +237,7 @@ use crate::metrics::{
 };
 use crate::task_mgr::shutdown_token;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::storage_layer::AsLayerDesc;
+use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::tenant::upload_queue::Delete;
 use crate::tenant::TIMELINES_SEGMENT_NAME;
 use crate::{
@@ -230,13 +255,10 @@ use utils::id::{TenantId, TimelineId};

 use self::index::IndexPart;

-use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
+use super::storage_layer::LayerFileName;
 use super::upload_queue::SetDeletedFlagProgress;
 use super::Generation;

-pub(crate) use download::{is_temp_download_file, list_remote_timelines};
-pub(crate) use index::LayerFileMetadata;
-
 // Occasional network issues and such can cause remote operations to fail, and
 // that's expected. If a download fails, we log it at info-level, and retry.
 // But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
@@ -446,10 +468,7 @@ impl RemoteTimelineClient {
    //

    /// Download index file
-    pub async fn download_index_file(
-        &self,
-        cancel: CancellationToken,
-    ) -> Result<MaybeDeletedIndexPart, DownloadError> {
+    pub async fn download_index_file(&self) -> Result<MaybeDeletedIndexPart, DownloadError> {
        let _unfinished_gauge_guard = self.metrics.call_begin(
            &RemoteOpFileKind::Index,
            &RemoteOpKind::Download,
@@ -463,7 +482,6 @@ impl RemoteTimelineClient {
            &self.tenant_id,
            &self.timeline_id,
            self.generation,
-            cancel,
        )
        .measure_remote_op(
            self.tenant_id,
@@ -609,203 +627,101 @@ impl RemoteTimelineClient {
    ///
    /// Launch an upload operation in the background.
    ///
-    pub(crate) fn schedule_layer_file_upload(
+    pub fn schedule_layer_file_upload(
        self: &Arc<Self>,
-        layer: ResidentLayer,
+        layer_file_name: &LayerFileName,
+        layer_metadata: &LayerFileMetadata,
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

-        self.schedule_layer_file_upload0(upload_queue, layer);
-        self.launch_queued_tasks(upload_queue);
-        Ok(())
-    }
-
-    fn schedule_layer_file_upload0(
-        self: &Arc<Self>,
-        upload_queue: &mut UploadQueueInitialized,
-        layer: ResidentLayer,
-    ) {
-        let metadata = layer.metadata();
-
        upload_queue
            .latest_files
-            .insert(layer.layer_desc().filename(), metadata.clone());
+            .insert(layer_file_name.clone(), layer_metadata.clone());
        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;

-        info!("scheduled layer file upload {layer}");
-        let op = UploadOp::UploadLayer(layer, metadata);
+        let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
        self.calls_unfinished_metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
+
+        info!("scheduled layer file upload {layer_file_name}");
+
+        // Launch the task immediately, if possible
+        self.launch_queued_tasks(upload_queue);
+        Ok(())
    }

    /// Launch a delete operation in the background.
    ///
-    /// The operation does not modify local filesystem state.
+    /// The operation does not modify local state but assumes the local files have already been
+    /// deleted, and is used to mirror those changes to remote.
    ///
    /// Note: This schedules an index file upload before the deletions.  The
-    /// deletion won't actually be performed, until all previously scheduled
+    /// deletion won't actually be performed, until any previously scheduled
    /// upload operations, and the index file upload, have completed
    /// successfully.
    pub fn schedule_layer_file_deletion(
        self: &Arc<Self>,
-        names: &[LayerFileName],
+        names: Vec<LayerFileName>,
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

-        let with_generations =
-            self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());
-
-        self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
-
-        // Launch the tasks immediately, if possible
-        self.launch_queued_tasks(upload_queue);
-        Ok(())
-    }
-
-    /// Unlinks the layer files from `index_part.json` but does not yet schedule deletion for the
-    /// layer files, leaving them dangling.
-    ///
-    /// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`]
-    /// is invoked on them.
-    pub(crate) fn schedule_gc_update(self: &Arc<Self>, gc_layers: &[Layer]) -> anyhow::Result<()> {
-        let mut guard = self.upload_queue.lock().unwrap();
-        let upload_queue = guard.initialized_mut()?;
-
-        // just forget the return value; after uploading the next index_part.json, we can consider
-        // the layer files as "dangling". this is fine, at worst case we create work for the
-        // scrubber.
-
-        let names = gc_layers.iter().map(|x| x.layer_desc().filename());
-
-        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
-
-        self.launch_queued_tasks(upload_queue);
-
-        Ok(())
-    }
-
-    /// Update the remote index file, removing the to-be-deleted files from the index,
-    /// allowing scheduling of actual deletions later.
-    fn schedule_unlinking_of_layers_from_index_part0<I>(
-        self: &Arc<Self>,
-        upload_queue: &mut UploadQueueInitialized,
-        names: I,
-    ) -> Vec<(LayerFileName, Generation)>
-    where
-        I: IntoIterator<Item = LayerFileName>,
-    {
        // Deleting layers doesn't affect the values stored in TimelineMetadata,
        // so we don't need update it. Just serialize it.
        let metadata = upload_queue.latest_metadata.clone();

-        // Decorate our list of names with each name's generation, dropping
-        // names that are unexpectedly missing from our metadata.
-        let with_generations: Vec<_> = names
-            .into_iter()
-            .filter_map(|name| {
-                let meta = upload_queue.latest_files.remove(&name);
+        // Update the remote index file, removing the to-be-deleted files from the index,
+        // before deleting the actual files.
+        //
+        // Once we start removing files from upload_queue.latest_files, there's
+        // no going back! Otherwise, some of the files would already be removed
+        // from latest_files, but not yet scheduled for deletion. Use a closure
+        // to syntactically forbid ? or bail! calls here.
+        let no_bail_here = || {
+            // Decorate our list of names with each name's generation, dropping
+            // names that are unexpectedly missing from our metadata.
+            let with_generations: Vec<_> = names
+                .into_iter()
+                .filter_map(|name| {
+                    // Remove from latest_files, learning the file's remote generation in the process
+                    let meta = upload_queue.latest_files.remove(&name);

-                if let Some(meta) = meta {
-                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
-                    Some((name, meta.generation))
-                } else {
-                    // This can only happen if we forgot to to schedule the file upload
-                    // before scheduling the delete. Log it because it is a rare/strange
-                    // situation, and in case something is misbehaving, we'd like to know which
-                    // layers experienced this.
-                    info!("Deleting layer {name} not found in latest_files list, never uploaded?");
-                    None
-                }
-            })
-            .collect();
+                    if let Some(meta) = meta {
+                        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
+                        Some((name, meta.generation))
+                    } else {
+                        // This can only happen if we forgot to schedule the file upload
+                        // before scheduling the delete. Log it because it is a rare/strange
+                        // situation, and in case something is misbehaving, we'd like to know which
+                        // layers experienced this.
+                        info!(
+                            "Deleting layer {name} not found in latest_files list, never uploaded?"
+                        );
+                        None
+                    }
+                })
+                .collect();

-        #[cfg(feature = "testing")]
-        for (name, gen) in &with_generations {
-            if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), *gen) {
-                if &unexpected == gen {
-                    tracing::error!("{name} was unlinked twice with same generation");
-                } else {
-                    tracing::error!("{name} was unlinked twice with different generations {gen:?} and {unexpected:?}");
-                }
+            if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
+                self.schedule_index_upload(upload_queue, metadata);
            }
-        }

-        // after unlinking files from the upload_queue.latest_files we must always schedule an
-        // index_part update, because that needs to be uploaded before we can actually delete the
-        // files.
-        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-            self.schedule_index_upload(upload_queue, metadata);
-        }
-
-        with_generations
-    }
-
-    /// Schedules deletion for layer files which have previously been unlinked from the
-    /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
-    pub(crate) fn schedule_deletion_of_unlinked(
-        self: &Arc<Self>,
-        layers: Vec<(LayerFileName, Generation)>,
-    ) -> anyhow::Result<()> {
-        let mut guard = self.upload_queue.lock().unwrap();
-        let upload_queue = guard.initialized_mut()?;
-
-        self.schedule_deletion_of_unlinked0(upload_queue, layers);
-        self.launch_queued_tasks(upload_queue);
-        Ok(())
-    }
-
-    fn schedule_deletion_of_unlinked0(
-        self: &Arc<Self>,
-        upload_queue: &mut UploadQueueInitialized,
-        with_generations: Vec<(LayerFileName, Generation)>,
-    ) {
-        for (name, gen) in &with_generations {
-            info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
-        }
-
-        #[cfg(feature = "testing")]
-        for (name, gen) in &with_generations {
-            match upload_queue.dangling_files.remove(name) {
-                Some(same) if &same == gen => { /* expected */ }
-                Some(other) => {
-                    tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}");
-                }
-                None => {
-                    tracing::error!("{name} was unlinked but was not dangling");
-                }
+            for (name, gen) in &with_generations {
+                info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
            }
-        }

-        // schedule the actual deletions
-        let op = UploadOp::Delete(Delete {
-            layers: with_generations,
-        });
-        self.calls_unfinished_metric_begin(&op);
-        upload_queue.queued_operations.push_back(op);
-    }
-
-    /// Schedules a compaction update to the remote `index_part.json`.
-    ///
-    /// `compacted_from` represent the L0 names which have been `compacted_to` L1 layers.
-    pub(crate) fn schedule_compaction_update(
-        self: &Arc<Self>,
-        compacted_from: &[Layer],
-        compacted_to: &[ResidentLayer],
-    ) -> anyhow::Result<()> {
-        let mut guard = self.upload_queue.lock().unwrap();
-        let upload_queue = guard.initialized_mut()?;
-
-        for layer in compacted_to {
-            self.schedule_layer_file_upload0(upload_queue, layer.clone());
-        }
-
-        let names = compacted_from.iter().map(|x| x.layer_desc().filename());
-
-        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
-        self.launch_queued_tasks(upload_queue);
+            // schedule the actual deletions
+            let op = UploadOp::Delete(Delete {
+                layers: with_generations,
+            });
+            self.calls_unfinished_metric_begin(&op);
+            upload_queue.queued_operations.push_back(op);

+            // Launch the tasks immediately, if possible
+            self.launch_queued_tasks(upload_queue);
+        };
+        no_bail_here();
        Ok(())
    }

@@ -1177,12 +1093,16 @@ impl RemoteTimelineClient {
            }

            let upload_result: anyhow::Result<()> = match &task.op {
-                UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
-                    let path = layer.local_path();
+                UploadOp::UploadLayer(ref layer_file_name, ref layer_metadata) => {
+                    let path = self
+                        .conf
+                        .timeline_path(&self.tenant_id, &self.timeline_id)
+                        .join(layer_file_name.file_name());
+
                    upload::upload_timeline_layer(
                        self.conf,
                        &self.storage_impl,
-                        path,
+                        &path,
                        layer_metadata,
                        self.generation,
                    )
@@ -1456,8 +1376,6 @@ impl RemoteTimelineClient {
                        num_inprogress_deletions: 0,
                        inprogress_tasks: HashMap::default(),
                        queued_operations: VecDeque::default(),
-                        #[cfg(feature = "testing")]
-                        dangling_files: HashMap::default(),
                    };

                    let upload_queue = std::mem::replace(
@@ -1501,6 +1419,13 @@ impl RemoteTimelineClient {
            }
        }
    }
+
+    pub(crate) fn get_layer_metadata(
+        &self,
+        name: &LayerFileName,
+    ) -> anyhow::Result<Option<LayerFileMetadata>> {
+        self.upload_queue.lock().unwrap().get_layer_metadata(name)
+    }
 }

 pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
@@ -1542,7 +1467,7 @@ pub fn remote_index_path(
 }

 /// Given the key of an index, parse out the generation part of the name
-pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
+pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
    let file_name = match path.get_path().file_name() {
        Some(f) => f,
        None => {
@@ -1588,7 +1513,6 @@ mod tests {
        context::RequestContext,
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
-            storage_layer::Layer,
            Generation, Tenant, Timeline,
        },
        DEFAULT_PG_VERSION,
@@ -1731,11 +1655,7 @@ mod tests {
        let client = timeline.remote_client.as_ref().unwrap();

        // Download back the index.json, and check that the list of files is correct
-        let initial_index_part = match client
-            .download_index_file(CancellationToken::new())
-            .await
-            .unwrap()
-        {
+        let initial_index_part = match client.download_index_file().await.unwrap() {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };
@@ -1761,29 +1681,32 @@ mod tests {
        let generation = harness.generation;

        // Create a couple of dummy files,  schedule upload for them
+        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
+        let layer_file_name_2: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap();
+        let layer_file_name_3: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap();
+        let content_1 = dummy_contents("foo");
+        let content_2 = dummy_contents("bar");
+        let content_3 = dummy_contents("baz");

-        let layers = [
-            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), dummy_contents("foo")),
-            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D9-00000000016B5A52".parse().unwrap(), dummy_contents("bar")),
-            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz"))
-        ]
-        .into_iter()
-        .map(|(name, contents): (LayerFileName, Vec<u8>)| {
-            std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap();
-
-            Layer::for_resident(
-                harness.conf,
-                &timeline,
-                name,
-                LayerFileMetadata::new(contents.len() as u64, generation),
-            )
-        }).collect::<Vec<_>>();
+        for (filename, content) in [
+            (&layer_file_name_1, &content_1),
+            (&layer_file_name_2, &content_2),
+            (&layer_file_name_3, &content_3),
+        ] {
+            std::fs::write(timeline_path.join(filename.file_name()), content).unwrap();
+        }

        client
-            .schedule_layer_file_upload(layers[0].clone())
+            .schedule_layer_file_upload(
+                &layer_file_name_1,
+                &LayerFileMetadata::new(content_1.len() as u64, generation),
+            )
            .unwrap();
        client
-            .schedule_layer_file_upload(layers[1].clone())
+            .schedule_layer_file_upload(
+                &layer_file_name_2,
+                &LayerFileMetadata::new(content_2.len() as u64, generation),
+            )
            .unwrap();

        // Check that they are started immediately, not queued
@@ -1824,11 +1747,7 @@ mod tests {
        }

        // Download back the index.json, and check that the list of files is correct
-        let index_part = match client
-            .download_index_file(CancellationToken::new())
-            .await
-            .unwrap()
-        {
+        let index_part = match client.download_index_file().await.unwrap() {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };
@@ -1841,42 +1760,38 @@ mod tests {
                .collect(),
            &[
                &initial_layer.file_name(),
-                &layers[0].layer_desc().filename().file_name(),
-                &layers[1].layer_desc().filename().file_name(),
+                &layer_file_name_1.file_name(),
+                &layer_file_name_2.file_name(),
            ],
        );
        assert_eq!(index_part.metadata, metadata);

        // Schedule upload and then a deletion. Check that the deletion is queued
        client
-            .schedule_layer_file_upload(layers[2].clone())
+            .schedule_layer_file_upload(
+                &layer_file_name_3,
+                &LayerFileMetadata::new(content_3.len() as u64, generation),
+            )
            .unwrap();
-
-        // this is no longer consistent with how deletion works with Layer::drop, but in this test
-        // keep using schedule_layer_file_deletion because we don't have a way to wait for the
-        // spawn_blocking started by the drop.
        client
-            .schedule_layer_file_deletion(&[layers[0].layer_desc().filename()])
+            .schedule_layer_file_deletion([layer_file_name_1.clone()].to_vec())
            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();

            // Deletion schedules upload of the index file, and the file deletion itself
-            assert_eq!(upload_queue.queued_operations.len(), 2);
-            assert_eq!(upload_queue.inprogress_tasks.len(), 1);
-            assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
-            assert_eq!(upload_queue.num_inprogress_deletions, 0);
-            assert_eq!(
-                upload_queue.latest_files_changes_since_metadata_upload_scheduled,
-                0
-            );
+            assert!(upload_queue.queued_operations.len() == 2);
+            assert!(upload_queue.inprogress_tasks.len() == 1);
+            assert!(upload_queue.num_inprogress_layer_uploads == 1);
+            assert!(upload_queue.num_inprogress_deletions == 0);
+            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 0);
        }
        assert_remote_files(
            &[
                &initial_layer.file_name(),
-                &layers[0].layer_desc().filename().file_name(),
-                &layers[1].layer_desc().filename().file_name(),
+                &layer_file_name_1.file_name(),
+                &layer_file_name_2.file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
@@ -1890,8 +1805,8 @@ mod tests {
        assert_remote_files(
            &[
                &initial_layer.file_name(),
-                &layers[1].layer_desc().filename().file_name(),
-                &layers[2].layer_desc().filename().file_name(),
+                &layer_file_name_2.file_name(),
+                &layer_file_name_3.file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
@@ -1920,13 +1835,6 @@ mod tests {
        )
        .unwrap();

-        let layer_file_1 = Layer::for_resident(
-            harness.conf,
-            &timeline,
-            layer_file_name_1.clone(),
-            LayerFileMetadata::new(content_1.len() as u64, harness.generation),
-        );
-
        #[derive(Debug, PartialEq, Clone, Copy)]
        struct BytesStartedFinished {
            started: Option<usize>,
@@ -1962,7 +1870,10 @@ mod tests {
        let actual_a = get_bytes_started_stopped();

        client
-            .schedule_layer_file_upload(layer_file_1.clone())
+            .schedule_layer_file_upload(
+                &layer_file_name_1,
+                &LayerFileMetadata::new(content_1.len() as u64, harness.generation),
+            )
            .unwrap();

        let actual_b = get_bytes_started_stopped();
@@ -2027,7 +1938,7 @@ mod tests {
        let client = test_state.build_client(get_generation);

        let download_r = client
-            .download_index_file(CancellationToken::new())
+            .download_index_file()
            .await
            .expect("download should always succeed");
        assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -18,8 +18,8 @@ use crate::config::PageServerConf;
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::Generation;
-use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
+use crate::tenant::{Generation, TENANT_DELETED_MARKER_FILE_NAME};
+use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};

@@ -170,43 +170,53 @@ pub fn is_temp_download_file(path: &Utf8Path) -> bool {
 pub async fn list_remote_timelines(
    storage: &GenericRemoteStorage,
    tenant_id: TenantId,
-    cancel: CancellationToken,
-) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
+) -> anyhow::Result<HashSet<TimelineId>> {
    let remote_path = remote_timelines_path(&tenant_id);

    fail::fail_point!("storage-sync-list-remote-timelines", |_| {
        anyhow::bail!("storage-sync-list-remote-timelines");
    });

-    let listing = download_retry_forever(
-        || storage.list(Some(&remote_path), ListingMode::WithDelimiter),
-        &format!("list timelines for {tenant_id}"),
-        cancel,
+    let timelines = download_retry(
+        || storage.list_prefixes(Some(&remote_path)),
+        &format!("list prefixes for {tenant_id}"),
    )
    .await?;

-    let mut timeline_ids = HashSet::new();
-    let mut other_prefixes = HashSet::new();
+    if timelines.is_empty() {
+        anyhow::bail!("no timelines found on the remote storage")
+    }
+
+    let mut timeline_ids = HashSet::new();
+
+    for timeline_remote_storage_key in timelines {
+        if timeline_remote_storage_key.object_name() == Some(TENANT_DELETED_MARKER_FILE_NAME) {
+            // A `deleted` key within `timelines/` is a marker file, not a timeline.  Ignore it.
+            // This code will be removed in https://github.com/neondatabase/neon/pull/5580
+            continue;
+        }

-    for timeline_remote_storage_key in listing.prefixes {
        let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
            anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
        })?;

-        match object_name.parse::<TimelineId>() {
-            Ok(t) => timeline_ids.insert(t),
-            Err(_) => other_prefixes.insert(object_name.to_string()),
-        };
+        let timeline_id: TimelineId = object_name
+            .parse()
+            .with_context(|| format!("parse object name into timeline id '{object_name}'"))?;
+
+        // list_prefixes is assumed to return unique names. Ensure this here.
+        // NB: it's safer to bail out than warn-log this because the pageserver
+        //     needs to absolutely know about _all_ timelines that exist, so that
+        //     GC knows all the branchpoints. If we skipped over a timeline instead,
+        //     GC could delete a layer that's still needed by that timeline.
+        anyhow::ensure!(
+            !timeline_ids.contains(&timeline_id),
+            "list_prefixes contains duplicate timeline id {timeline_id}"
+        );
+        timeline_ids.insert(timeline_id);
    }

-    for key in listing.keys {
-        let object_name = key
-            .object_name()
-            .ok_or_else(|| anyhow::anyhow!("object name for key {key}"))?;
-        other_prefixes.insert(object_name.to_string());
-    }
-
-    Ok((timeline_ids, other_prefixes))
+    Ok(timeline_ids)
 }

 async fn do_download_index_part(
@@ -214,11 +224,10 @@ async fn do_download_index_part(
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
    index_generation: Generation,
-    cancel: CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
    let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);

-    let index_part_bytes = download_retry_forever(
+    let index_part_bytes = download_retry(
        || async {
            let mut index_part_download = storage.download(&remote_path).await?;

@@ -233,7 +242,6 @@ async fn do_download_index_part(
            Ok(index_part_bytes)
        },
        &format!("download {remote_path:?}"),
-        cancel,
    )
    .await?;

@@ -255,28 +263,19 @@ pub(super) async fn download_index_part(
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
    my_generation: Generation,
-    cancel: CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

    if my_generation.is_none() {
        // Operating without generations: just fetch the generation-less path
-        return do_download_index_part(storage, tenant_id, timeline_id, my_generation, cancel)
-            .await;
+        return do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
    }

    // Stale case: If we were intentionally attached in a stale generation, there may already be a remote
    // index in our generation.
    //
    // This is an optimization to avoid doing the listing for the general case below.
-    let res = do_download_index_part(
-        storage,
-        tenant_id,
-        timeline_id,
-        my_generation,
-        cancel.clone(),
-    )
-    .await;
+    let res = do_download_index_part(storage, tenant_id, timeline_id, my_generation).await;
    match res {
        Ok(index_part) => {
            tracing::debug!(
@@ -296,14 +295,8 @@ pub(super) async fn download_index_part(
    //    we want to find the most recent index from a previous generation.
    //
    // This is an optimization to avoid doing the listing for the general case below.
-    let res = do_download_index_part(
-        storage,
-        tenant_id,
-        timeline_id,
-        my_generation.previous(),
-        cancel.clone(),
-    )
-    .await;
+    let res =
+        do_download_index_part(storage, tenant_id, timeline_id, my_generation.previous()).await;
    match res {
        Ok(index_part) => {
            tracing::debug!("Found index_part from previous generation");
@@ -347,14 +340,13 @@ pub(super) async fn download_index_part(
    match max_previous_generation {
        Some(g) => {
            tracing::debug!("Found index_part in generation {g:?}");
-            do_download_index_part(storage, tenant_id, timeline_id, g, cancel).await
+            do_download_index_part(storage, tenant_id, timeline_id, g).await
        }
        None => {
            // Migration from legacy pre-generation state: we have a generation but no prior
            // attached pageservers did.  Try to load from a no-generation path.
            tracing::info!("No index_part.json* found");
-            do_download_index_part(storage, tenant_id, timeline_id, Generation::none(), cancel)
-                .await
+            do_download_index_part(storage, tenant_id, timeline_id, Generation::none()).await
        }
    }
 }
@@ -384,23 +376,3 @@ where
    )
    .await
 }
-
-async fn download_retry_forever<T, O, F>(
-    op: O,
-    description: &str,
-    cancel: CancellationToken,
-) -> Result<T, DownloadError>
-where
-    O: FnMut() -> F,
-    F: Future<Output = Result<T, DownloadError>>,
-{
-    backoff::retry(
-        op,
-        |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
-        FAILED_DOWNLOAD_WARN_THRESHOLD,
-        u32::MAX,
-        description,
-        backoff::Cancel::new(cancel, || DownloadError::Cancelled),
-    )
-    .await
-}
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -6,6 +6,7 @@ use std::collections::HashMap;

 use chrono::NaiveDateTime;
 use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
 use utils::bin_ser::SerializeError;

 use crate::tenant::metadata::TimelineMetadata;
@@ -57,6 +58,7 @@ impl LayerFileMetadata {
 ///
 /// This type needs to be backwards and forwards compatible. When changing the fields,
 /// remember to add a test case for the changed version.
+#[serde_as]
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
 pub struct IndexPart {
    /// Debugging aid describing the version of this type.
@@ -76,6 +78,7 @@ pub struct IndexPart {
    // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
    // It's duplicated for convenience when reading the serialized structure, but is
    // private because internally we would read from metadata instead.
+    #[serde_as(as = "DisplayFromStr")]
    disk_consistent_lsn: Lsn,

    #[serde(rename = "metadata_bytes")]
@@ -95,7 +98,7 @@ impl IndexPart {
    const LATEST_VERSION: usize = 4;

    // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4];
+    pub const KNOWN_VERSIONS: &[usize] = &[1, 2, 3, 4];

    pub const FILE_NAME: &'static str = "index_part.json";

@@ -152,7 +155,7 @@ pub struct IndexLayerMetadata {

    #[serde(default = "Generation::none")]
    #[serde(skip_serializing_if = "Generation::is_none")]
-    pub generation: Generation,
+    pub(super) generation: Generation,
 }

 impl From<LayerFileMetadata> for IndexLayerMetadata {
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -60,8 +60,6 @@ pub(super) async fn upload_timeline_layer<'a>(
        bail!("failpoint before-upload-layer")
    });

-    pausable_failpoint!("before-upload-layer-pausable");
-
    let storage_path = remote_path(conf, source_path, generation)?;
    let source_file_res = fs::File::open(&source_path).await;
    let source_file = match source_file_res {
@@ -72,8 +70,6 @@ pub(super) async fn upload_timeline_layer<'a>(
            // upload. However, a nonexistent file can also be indicative of
            // something worse, like when a file is scheduled for upload before
            // it has been written to disk yet.
-            //
-            // This is tested against `test_compaction_delete_before_upload`
            info!(path = %source_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
            return Ok(());
        }
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -29,6 +29,7 @@ use tenant_size_model::{Segment, StorageModel};
 /// needs. We will convert this into a StorageModel when it's time to perform
 /// the calculation.
 ///
+#[serde_with::serde_as]
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
 pub struct ModelInputs {
    pub segments: Vec<SegmentMeta>,
@@ -36,9 +37,11 @@ pub struct ModelInputs {
 }

 /// A [`Segment`], with some extra information for display purposes
+#[serde_with::serde_as]
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
 pub struct SegmentMeta {
    pub segment: Segment,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub timeline_id: TimelineId,
    pub kind: LsnKind,
 }
@@ -74,22 +77,32 @@ pub enum LsnKind {

 /// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as
 /// part of [`ModelInputs`] from the HTTP api, explaining the inputs.
+#[serde_with::serde_as]
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
 pub struct TimelineInputs {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub timeline_id: TimelineId,

+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
    pub ancestor_id: Option<TimelineId>,

+    #[serde_as(as = "serde_with::DisplayFromStr")]
    ancestor_lsn: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    last_record: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    latest_gc_cutoff: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    horizon_cutoff: Lsn,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    pitr_cutoff: Lsn,

    /// Cutoff point based on GC settings
+    #[serde_as(as = "serde_with::DisplayFromStr")]
    next_gc_cutoff: Lsn,

    /// Cutoff point calculated from the user-supplied 'max_retention_period'
+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
    retention_param_cutoff: Option<Lsn>,
 }

--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -4,21 +4,26 @@ pub mod delta_layer;
 mod filename;
 mod image_layer;
 mod inmemory_layer;
-mod layer;
 mod layer_desc;
+mod remote_layer;

+use crate::config::PageServerConf;
 use crate::context::{AccessStatsBehavior, RequestContext};
+use crate::repository::Key;
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
+use anyhow::Result;
 use bytes::Bytes;
+use camino::Utf8PathBuf;
 use enum_map::EnumMap;
 use enumset::EnumSet;
 use once_cell::sync::Lazy;
+use pageserver_api::models::LayerAccessKind;
 use pageserver_api::models::{
-    LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
+    HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
 };
 use std::ops::Range;
-use std::sync::Mutex;
+use std::sync::{Arc, Mutex};
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
@@ -34,8 +39,7 @@ pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
 pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
-
-pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
+pub use remote_layer::RemoteLayer;

 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
@@ -70,7 +74,7 @@ pub struct ValueReconstructState {
    pub img: Option<(Lsn, Bytes)>,
 }

-/// Return value from [`Layer::get_value_reconstruct_data`]
+/// Return value from [`Layer::get_value_reconstruct_data`]
 #[derive(Clone, Copy, Debug)]
 pub enum ValueReconstructResult {
    /// Got all the data needed to reconstruct the requested page
@@ -175,6 +179,26 @@ impl LayerAccessStats {
        new
    }

+    /// Creates a clone of `self` and records `new_status` in the clone.
+    ///
+    /// The `new_status` is not recorded in `self`.
+    ///
+    /// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
+    ///
+    /// [`record_residence_event`]: Self::record_residence_event
+    pub(crate) fn clone_for_residence_change(
+        &self,
+        new_status: LayerResidenceStatus,
+    ) -> LayerAccessStats {
+        let clone = {
+            let inner = self.0.lock().unwrap();
+            inner.clone()
+        };
+        let new = LayerAccessStats(Mutex::new(clone));
+        new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
+        new
+    }
+
    /// Record a change in layer residency.
    ///
    /// Recording the event must happen while holding the layer map lock to
@@ -297,12 +321,95 @@ impl LayerAccessStats {
    }
 }

+/// Supertrait of the [`PersistentLayer`] trait that captures the bare minimum interface
+/// required by [`LayerMap`](super::layer_map::LayerMap).
+///
+/// All layers should implement a minimal `std::fmt::Debug` without tenant or
+/// timeline names, because those are known from the context in which the
+/// layers are used (the timeline).
+#[async_trait::async_trait]
+pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
+    ///
+    /// Return data needed to reconstruct given page at LSN.
+    ///
+    /// It is up to the caller to collect more data from previous layer and
+    /// perform WAL redo, if necessary.
+    ///
+    /// See ValueReconstructResult for possible return values. The collected data
+    /// is appended to reconstruct_data; the caller should pass an empty struct
+    /// on first call, or a struct with a cached older image of the page if one
+    /// is available. If this returns ValueReconstructResult::Continue, look up
+    /// the predecessor layer and call again with the same 'reconstruct_data' to
+    /// collect more data.
+    async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_data: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<ValueReconstructResult>;
+}
+
 /// Get a layer descriptor from a layer.
 pub trait AsLayerDesc {
    /// Get the layer descriptor.
    fn layer_desc(&self) -> &PersistentLayerDesc;
 }

+/// A Layer contains all data in a "rectangle" consisting of a range of keys and
+/// range of LSNs.
+///
+/// There are two kinds of layers, in-memory and on-disk layers. In-memory
+/// layers are used to ingest incoming WAL, and provide fast access to the
+/// recent page versions. On-disk layers are stored as files on disk, and are
+/// immutable. This trait presents the common functionality of in-memory and
+/// on-disk layers.
+///
+/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
+/// A delta layer contains all modifications within a range of LSNs and keys.
+/// An image layer is a snapshot of all the data in a key-range, at a single
+/// LSN.
+pub trait PersistentLayer: Layer + AsLayerDesc {
+    /// File name used for this layer, both in the pageserver's local filesystem
+    /// state as well as in the remote storage.
+    fn filename(&self) -> LayerFileName {
+        self.layer_desc().filename()
+    }
+
+    /// Path to the layer file in the local filesystem.
+    /// `None` for `RemoteLayer`.
+    fn local_path(&self) -> Option<Utf8PathBuf>;
+
+    /// Permanently remove this layer from disk.
+    fn delete_resident_layer_file(&self) -> Result<()>;
+
+    fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
+        None
+    }
+
+    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
+        None
+    }
+
+    fn is_remote_layer(&self) -> bool {
+        false
+    }
+
+    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
+
+    fn access_stats(&self) -> &LayerAccessStats;
+}
+
+pub fn downcast_remote_layer(
+    layer: &Arc<dyn PersistentLayer>,
+) -> Option<std::sync::Arc<RemoteLayer>> {
+    if layer.is_remote_layer() {
+        Arc::clone(layer).downcast_remote_layer()
+    } else {
+        None
+    }
+}
+
 pub mod tests {
    use super::*;

@@ -340,6 +447,19 @@ pub mod tests {
    }
 }

+/// Helper enum to hold a PageServerConf, or a path
+///
+/// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
+/// global config, and paths to layer files are constructed using the tenant/timeline
+/// path from the config. But in the 'pagectl' binary, we need to construct a Layer
+/// struct for a file on disk, without having a page server running, so that we have no
+/// config. In that case, we use the Path variant to hold the full path to the file on
+/// disk.
+enum PathOrConf {
+    Path(Utf8PathBuf),
+    Conf(&'static PageServerConf),
+}
+
 /// Range wrapping newtype, which uses display to render Debug.
 ///
 /// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers.
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -34,17 +34,18 @@ use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
-use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
-use crate::tenant::Timeline;
+use crate::tenant::storage_layer::{
+    PersistentLayer, ValueReconstructResult, ValueReconstructState,
+};
 use crate::virtual_file::VirtualFile;
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
 use camino::{Utf8Path, Utf8PathBuf};
-use pageserver_api::models::LayerAccessKind;
+use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs::File;
+use std::fs::{self, File};
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
@@ -58,7 +59,10 @@ use utils::{
    lsn::Lsn,
 };

-use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};
+use super::{
+    AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
+    PersistentLayerDesc,
+};

 ///
 /// Header stored in the beginning of the file
@@ -178,12 +182,20 @@ impl DeltaKey {
    }
 }

-/// This is used only from `pagectl`. Within pageserver, all layers are
-/// [`crate::tenant::storage_layer::Layer`], which can hold a [`DeltaLayerInner`].
+/// DeltaLayer is the in-memory data structure associated with an on-disk delta
+/// file.
+///
+/// We keep a DeltaLayer in memory for each file, in the LayerMap. If a layer
+/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
+/// Otherwise the struct is just a placeholder for a file that exists on disk,
+/// and it needs to be loaded before using it in queries.
 pub struct DeltaLayer {
-    path: Utf8PathBuf,
+    path_or_conf: PathOrConf,
+
    pub desc: PersistentLayerDesc,
+
    access_stats: LayerAccessStats,
+
    inner: OnceCell<Arc<DeltaLayerInner>>,
 }

@@ -200,8 +212,6 @@ impl std::fmt::Debug for DeltaLayer {
    }
 }

-/// `DeltaLayerInner` is the in-memory data structure associated with an on-disk delta
-/// file.
 pub struct DeltaLayerInner {
    // values copied from summary
    index_start_blk: u32,
@@ -211,6 +221,12 @@ pub struct DeltaLayerInner {
    file: FileBlockReader,
 }

+impl AsRef<DeltaLayerInner> for DeltaLayerInner {
+    fn as_ref(&self) -> &DeltaLayerInner {
+        self
+    }
+}
+
 impl std::fmt::Debug for DeltaLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DeltaLayerInner")
@@ -220,6 +236,19 @@ impl std::fmt::Debug for DeltaLayerInner {
    }
 }

+#[async_trait::async_trait]
+impl Layer for DeltaLayer {
+    async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
+            .await
+    }
+}
 /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
 impl std::fmt::Display for DeltaLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -233,9 +262,40 @@ impl AsLayerDesc for DeltaLayer {
    }
 }

+impl PersistentLayer for DeltaLayer {
+    fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
+        Some(self)
+    }
+
+    fn local_path(&self) -> Option<Utf8PathBuf> {
+        self.local_path()
+    }
+
+    fn delete_resident_layer_file(&self) -> Result<()> {
+        self.delete_resident_layer_file()
+    }
+
+    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+        self.info(reset)
+    }
+
+    fn access_stats(&self) -> &LayerAccessStats {
+        self.access_stats()
+    }
+}
+
 impl DeltaLayer {
    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
-        self.desc.dump();
+        println!(
+            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
+            self.desc.tenant_id,
+            self.desc.timeline_id,
+            self.desc.key_range.start,
+            self.desc.key_range.end,
+            self.desc.lsn_range.start,
+            self.desc.lsn_range.end,
+            self.desc.file_size,
+        );

        if !verbose {
            return Ok(());
@@ -243,7 +303,119 @@ impl DeltaLayer {

        let inner = self.load(LayerAccessKind::Dump, ctx).await?;

-        inner.dump(ctx).await
+        println!(
+            "index_start_blk: {}, root {}",
+            inner.index_start_blk, inner.index_root_blk
+        );
+
+        let file = &inner.file;
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            inner.index_start_blk,
+            inner.index_root_blk,
+            file,
+        );
+
+        tree_reader.dump().await?;
+
+        let keys = DeltaLayerInner::load_keys(&inner, ctx).await?;
+
+        // A subroutine to dump a single blob
+        async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> Result<String> {
+            let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
+            let val = Value::des(&buf)?;
+            let desc = match val {
+                Value::Image(img) => {
+                    format!(" img {} bytes", img.len())
+                }
+                Value::WalRecord(rec) => {
+                    let wal_desc = walrecord::describe_wal_record(&rec)?;
+                    format!(
+                        " rec {} bytes will_init: {} {}",
+                        buf.len(),
+                        rec.will_init(),
+                        wal_desc
+                    )
+                }
+            };
+            Ok(desc)
+        }
+
+        for entry in keys {
+            let DeltaEntry { key, lsn, val, .. } = entry;
+            let desc = match dump_blob(val, ctx).await {
+                Ok(desc) => desc,
+                Err(err) => {
+                    let err: anyhow::Error = err;
+                    format!("ERROR: {err}")
+                }
+            };
+            println!("  key {key} at {lsn}: {desc}");
+        }
+
+        Ok(())
+    }
+
+    pub(crate) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        ensure!(lsn_range.start >= self.desc.lsn_range.start);
+
+        ensure!(self.desc.key_range.contains(&key));
+
+        let inner = self
+            .load(LayerAccessKind::GetValueReconstructData, ctx)
+            .await?;
+        inner
+            .get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
+            .await
+    }
+
+    pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
+        Some(self.path())
+    }
+
+    pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
+        // delete underlying file
+        fs::remove_file(self.path())?;
+        Ok(())
+    }
+
+    pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+        let layer_file_name = self.layer_desc().filename().file_name();
+        let lsn_range = self.layer_desc().lsn_range.clone();
+
+        let access_stats = self.access_stats.as_api_model(reset);
+
+        HistoricLayerInfo::Delta {
+            layer_file_name,
+            layer_file_size: self.desc.file_size,
+            lsn_start: lsn_range.start,
+            lsn_end: lsn_range.end,
+            remote: false,
+            access_stats,
+        }
+    }
+
+    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
+        &self.access_stats
+    }
+
+    fn path_for(
+        path_or_conf: &PathOrConf,
+        tenant_id: &TenantId,
+        timeline_id: &TimelineId,
+        fname: &DeltaFileName,
+    ) -> Utf8PathBuf {
+        match path_or_conf {
+            PathOrConf::Path(path) => path.clone(),
+            PathOrConf::Conf(conf) => conf
+                .timeline_path(tenant_id, timeline_id)
+                .join(fname.to_string()),
+        }
    }

    fn temp_path_for(
@@ -289,21 +461,52 @@ impl DeltaLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
        let path = self.path();

-        let loaded = DeltaLayerInner::load(&path, None, ctx).await?;
+        let summary = match &self.path_or_conf {
+            PathOrConf::Conf(_) => Some(Summary::from(self)),
+            PathOrConf::Path(_) => None,
+        };

-        // not production code
-        let actual_filename = path.file_name().unwrap().to_owned();
-        let expected_filename = self.layer_desc().filename().file_name();
+        let loaded = DeltaLayerInner::load(&path, summary, ctx).await?;

-        if actual_filename != expected_filename {
-            println!("warning: filename does not match what is expected from in-file summary");
-            println!("actual: {:?}", actual_filename);
-            println!("expected: {:?}", expected_filename);
+        if let PathOrConf::Path(ref path) = self.path_or_conf {
+            // not production code
+
+            let actual_filename = path.file_name().unwrap().to_owned();
+            let expected_filename = self.filename().file_name();
+
+            if actual_filename != expected_filename {
+                println!("warning: filename does not match what is expected from in-file summary");
+                println!("actual: {:?}", actual_filename);
+                println!("expected: {:?}", expected_filename);
+            }
        }

        Ok(Arc::new(loaded))
    }

+    /// Create a DeltaLayer struct representing an existing file on disk.
+    pub fn new(
+        conf: &'static PageServerConf,
+        timeline_id: TimelineId,
+        tenant_id: TenantId,
+        filename: &DeltaFileName,
+        file_size: u64,
+        access_stats: LayerAccessStats,
+    ) -> DeltaLayer {
+        DeltaLayer {
+            path_or_conf: PathOrConf::Conf(conf),
+            desc: PersistentLayerDesc::new_delta(
+                tenant_id,
+                timeline_id,
+                filename.key_range.clone(),
+                filename.lsn_range.clone(),
+                file_size,
+            ),
+            access_stats,
+            inner: OnceCell::new(),
+        }
+    }
+
    /// Create a DeltaLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
@@ -317,7 +520,7 @@ impl DeltaLayer {
            .context("get file metadata to determine size")?;

        Ok(DeltaLayer {
-            path: path.to_path_buf(),
+            path_or_conf: PathOrConf::Path(path.to_path_buf()),
            desc: PersistentLayerDesc::new_delta(
                summary.tenant_id,
                summary.timeline_id,
@@ -330,9 +533,29 @@ impl DeltaLayer {
        })
    }

+    fn layer_name(&self) -> DeltaFileName {
+        self.desc.delta_file_name()
+    }
    /// Path to the layer file in pageserver workdir.
-    fn path(&self) -> Utf8PathBuf {
-        self.path.clone()
+    pub fn path(&self) -> Utf8PathBuf {
+        Self::path_for(
+            &self.path_or_conf,
+            &self.desc.tenant_id,
+            &self.desc.timeline_id,
+            &self.layer_name(),
+        )
+    }
+    /// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
+    ///
+    /// The value can be obtained via the [`ValueRef::load`] function.
+    pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<DeltaEntry<'_>>> {
+        let inner = self
+            .load(LayerAccessKind::KeyIter, ctx)
+            .await
+            .context("load delta layer keys")?;
+        DeltaLayerInner::load_keys(inner, ctx)
+            .await
+            .context("Layer index is corrupted")
    }
 }

@@ -437,7 +660,7 @@ impl DeltaLayerWriterInner {
    ///
    /// Finish writing the delta layer.
    ///
-    async fn finish(self, key_end: Key, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
+    async fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -494,21 +717,37 @@ impl DeltaLayerWriterInner {
        // Note: Because we opened the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
-
-        let desc = PersistentLayerDesc::new_delta(
-            self.tenant_id,
-            self.timeline_id,
-            self.key_start..key_end,
-            self.lsn_range.clone(),
-            metadata.len(),
-        );
+        let layer = DeltaLayer {
+            path_or_conf: PathOrConf::Conf(self.conf),
+            desc: PersistentLayerDesc::new_delta(
+                self.tenant_id,
+                self.timeline_id,
+                self.key_start..key_end,
+                self.lsn_range.clone(),
+                metadata.len(),
+            ),
+            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
+            inner: OnceCell::new(),
+        };

        // fsync the file
        file.sync_all().await?;
+        // Rename the file to its final name
+        //
+        // Note: This overwrites any existing file. There shouldn't be any.
+        // FIXME: throw an error instead?
+        let final_path = DeltaLayer::path_for(
+            &PathOrConf::Conf(self.conf),
+            &self.tenant_id,
+            &self.timeline_id,
+            &DeltaFileName {
+                key_range: self.key_start..key_end,
+                lsn_range: self.lsn_range,
+            },
+        );
+        std::fs::rename(self.path, &final_path)?;

-        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
-
-        trace!("created delta layer {}", layer.local_path());
+        trace!("created delta layer {final_path}");

        Ok(layer)
    }
@@ -589,12 +828,8 @@ impl DeltaLayerWriter {
    ///
    /// Finish writing the delta layer.
    ///
-    pub(crate) async fn finish(
-        mut self,
-        key_end: Key,
-        timeline: &Arc<Timeline>,
-    ) -> anyhow::Result<ResidentLayer> {
-        self.inner.take().unwrap().finish(key_end, timeline).await
+    pub async fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
+        self.inner.take().unwrap().finish(key_end).await
    }
 }

@@ -732,17 +967,15 @@ impl DeltaLayerInner {
        }
    }

-    pub(super) async fn load_keys<'a>(
-        &'a self,
-        ctx: &RequestContext,
+    pub(super) async fn load_keys<'a, 'b, T: AsRef<DeltaLayerInner> + Clone>(
+        this: &'a T,
+        ctx: &'b RequestContext,
    ) -> Result<Vec<DeltaEntry<'a>>> {
-        let file = &self.file;
+        let dl = this.as_ref();
+        let file = &dl.file;

-        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            self.index_start_blk,
-            self.index_root_blk,
-            file,
-        );
+        let tree_reader =
+            DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);

        let mut all_keys: Vec<DeltaEntry<'_>> = Vec::new();

@@ -755,7 +988,7 @@ impl DeltaLayerInner {
                    let val_ref = ValueRef {
                        blob_ref: BlobRef(value),
                        reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
-                            Adapter(self),
+                            Adapter(dl),
                        )),
                    };
                    let pos = BlobRef(value).pos();
@@ -782,61 +1015,10 @@ impl DeltaLayerInner {
        if let Some(last) = all_keys.last_mut() {
            // Last key occupies all space till end of value storage,
            // which corresponds to beginning of the index
-            last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
+            last.size = dl.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
        }
        Ok(all_keys)
    }
-
-    pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
-        println!(
-            "index_start_blk: {}, root {}",
-            self.index_start_blk, self.index_root_blk
-        );
-
-        let file = &self.file;
-        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            self.index_start_blk,
-            self.index_root_blk,
-            file,
-        );
-
-        tree_reader.dump().await?;
-
-        let keys = self.load_keys(ctx).await?;
-
-        async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
-            let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
-            let val = Value::des(&buf)?;
-            let desc = match val {
-                Value::Image(img) => {
-                    format!(" img {} bytes", img.len())
-                }
-                Value::WalRecord(rec) => {
-                    let wal_desc = walrecord::describe_wal_record(&rec)?;
-                    format!(
-                        " rec {} bytes will_init: {} {}",
-                        buf.len(),
-                        rec.will_init(),
-                        wal_desc
-                    )
-                }
-            };
-            Ok(desc)
-        }
-
-        for entry in keys {
-            let DeltaEntry { key, lsn, val, .. } = entry;
-            let desc = match dump_blob(val, ctx).await {
-                Ok(desc) => desc,
-                Err(err) => {
-                    format!("ERROR: {err}")
-                }
-            };
-            println!("  key {key} at {lsn}: {desc}");
-        }
-
-        Ok(())
-    }
 }

 /// A set of data associated with a delta layer key and its value
@@ -876,9 +1058,3 @@ impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
        self.0.as_ref().file.read_blk(blknum, ctx).await
    }
 }
-
-impl AsRef<DeltaLayerInner> for DeltaLayerInner {
-    fn as_ref(&self) -> &DeltaLayerInner {
-        self
-    }
-}
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -31,23 +31,21 @@ use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
-    LayerAccessStats, ValueReconstructResult, ValueReconstructState,
+    LayerAccessStats, PersistentLayer, ValueReconstructResult, ValueReconstructState,
 };
-use crate::tenant::Timeline;
 use crate::virtual_file::VirtualFile;
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
-use pageserver_api::models::LayerAccessKind;
+use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::fs::File;
+use std::fs::{self, File};
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
-use std::sync::Arc;
 use tokio::sync::OnceCell;
 use tracing::*;

@@ -58,7 +56,7 @@ use utils::{
 };

 use super::filename::ImageFileName;
-use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};
+use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};

 ///
 /// Header stored in the beginning of the file
@@ -116,14 +114,22 @@ impl Summary {
    }
 }

-/// This is used only from `pagectl`. Within pageserver, all layers are
-/// [`crate::tenant::storage_layer::Layer`], which can hold an [`ImageLayerInner`].
+/// ImageLayer is the in-memory data structure associated with an on-disk image
+/// file.
+///
+/// We keep an ImageLayer in memory for each file, in the LayerMap. If a layer
+/// is in "loaded" state, we have a copy of the index in memory, in 'inner'.
+/// Otherwise the struct is just a placeholder for a file that exists on disk,
+/// and it needs to be loaded before using it in queries.
 pub struct ImageLayer {
-    path: Utf8PathBuf,
+    path_or_conf: PathOrConf,
+
    pub desc: PersistentLayerDesc,
    // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
    pub lsn: Lsn,
+
    access_stats: LayerAccessStats,
+
    inner: OnceCell<ImageLayerInner>,
 }

@@ -140,8 +146,6 @@ impl std::fmt::Debug for ImageLayer {
    }
 }

-/// ImageLayer is the in-memory data structure associated with an on-disk image
-/// file.
 pub struct ImageLayerInner {
    // values copied from summary
    index_start_blk: u32,
@@ -162,11 +166,73 @@ impl std::fmt::Debug for ImageLayerInner {
    }
 }

-impl ImageLayerInner {
-    pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
-        let file = &self.file;
+#[async_trait::async_trait]
+impl Layer for ImageLayer {
+    /// Look up given page in the file
+    async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
+            .await
+    }
+}
+
+/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+impl std::fmt::Display for ImageLayer {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.layer_desc().short_id())
+    }
+}
+
+impl AsLayerDesc for ImageLayer {
+    fn layer_desc(&self) -> &PersistentLayerDesc {
+        &self.desc
+    }
+}
+
+impl PersistentLayer for ImageLayer {
+    fn local_path(&self) -> Option<Utf8PathBuf> {
+        self.local_path()
+    }
+
+    fn delete_resident_layer_file(&self) -> Result<()> {
+        self.delete_resident_layer_file()
+    }
+
+    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+        self.info(reset)
+    }
+
+    fn access_stats(&self) -> &LayerAccessStats {
+        self.access_stats()
+    }
+}
+
+impl ImageLayer {
+    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+        println!(
+            "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
+            self.desc.tenant_id,
+            self.desc.timeline_id,
+            self.desc.key_range.start,
+            self.desc.key_range.end,
+            self.lsn,
+            self.desc.is_incremental(),
+            self.desc.file_size
+        );
+
+        if !verbose {
+            return Ok(());
+        }
+
+        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
+        let file = &inner.file;
        let tree_reader =
-            DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file);
+            DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);

        tree_reader.dump().await?;

@@ -184,36 +250,69 @@ impl ImageLayerInner {

        Ok(())
    }
-}

-/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-impl std::fmt::Display for ImageLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.layer_desc().short_id())
+    pub(crate) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        assert!(self.desc.key_range.contains(&key));
+        assert!(lsn_range.start >= self.lsn);
+        assert!(lsn_range.end >= self.lsn);
+
+        let inner = self
+            .load(LayerAccessKind::GetValueReconstructData, ctx)
+            .await?;
+        inner
+            .get_value_reconstruct_data(key, reconstruct_state, ctx)
+            .await
+            // FIXME: makes no sense to dump paths
+            .with_context(|| format!("read {}", self.path()))
    }
-}

-impl AsLayerDesc for ImageLayer {
-    fn layer_desc(&self) -> &PersistentLayerDesc {
-        &self.desc
+    pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
+        Some(self.path())
    }
-}
-
-impl ImageLayer {
-    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
-        self.desc.dump();
-
-        if !verbose {
-            return Ok(());
-        }
-
-        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
-
-        inner.dump(ctx).await?;

+    pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
+        // delete underlying file
+        fs::remove_file(self.path())?;
        Ok(())
    }

+    pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+        let layer_file_name = self.layer_desc().filename().file_name();
+        let lsn_start = self.layer_desc().image_layer_lsn();
+
+        HistoricLayerInfo::Image {
+            layer_file_name,
+            layer_file_size: self.desc.file_size,
+            lsn_start,
+            remote: false,
+            access_stats: self.access_stats.as_api_model(reset),
+        }
+    }
+
+    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
+        &self.access_stats
+    }
+
+    fn path_for(
+        path_or_conf: &PathOrConf,
+        timeline_id: TimelineId,
+        tenant_id: TenantId,
+        fname: &ImageFileName,
+    ) -> Utf8PathBuf {
+        match path_or_conf {
+            PathOrConf::Path(path) => path.to_path_buf(),
+            PathOrConf::Conf(conf) => conf
+                .timeline_path(&tenant_id, &timeline_id)
+                .join(fname.to_string()),
+        }
+    }
+
    fn temp_path_for(
        conf: &PageServerConf,
        timeline_id: TimelineId,
@@ -249,21 +348,54 @@ impl ImageLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
        let path = self.path();

-        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx).await?;
+        let expected_summary = match &self.path_or_conf {
+            PathOrConf::Conf(_) => Some(Summary::from(self)),
+            PathOrConf::Path(_) => None,
+        };

-        // not production code
-        let actual_filename = path.file_name().unwrap().to_owned();
-        let expected_filename = self.layer_desc().filename().file_name();
+        let loaded =
+            ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary, ctx)
+                .await?;

-        if actual_filename != expected_filename {
-            println!("warning: filename does not match what is expected from in-file summary");
-            println!("actual: {:?}", actual_filename);
-            println!("expected: {:?}", expected_filename);
+        if let PathOrConf::Path(ref path) = self.path_or_conf {
+            // not production code
+            let actual_filename = path.file_name().unwrap().to_owned();
+            let expected_filename = self.filename().file_name();
+
+            if actual_filename != expected_filename {
+                println!("warning: filename does not match what is expected from in-file summary");
+                println!("actual: {:?}", actual_filename);
+                println!("expected: {:?}", expected_filename);
+            }
        }

        Ok(loaded)
    }

+    /// Create an ImageLayer struct representing an existing file on disk
+    pub fn new(
+        conf: &'static PageServerConf,
+        timeline_id: TimelineId,
+        tenant_id: TenantId,
+        filename: &ImageFileName,
+        file_size: u64,
+        access_stats: LayerAccessStats,
+    ) -> ImageLayer {
+        ImageLayer {
+            path_or_conf: PathOrConf::Conf(conf),
+            desc: PersistentLayerDesc::new_img(
+                tenant_id,
+                timeline_id,
+                filename.key_range.clone(),
+                filename.lsn,
+                file_size,
+            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
+            lsn: filename.lsn,
+            access_stats,
+            inner: OnceCell::new(),
+        }
+    }
+
    /// Create an ImageLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
@@ -275,7 +407,7 @@ impl ImageLayer {
            .metadata()
            .context("get file metadata to determine size")?;
        Ok(ImageLayer {
-            path: path.to_path_buf(),
+            path_or_conf: PathOrConf::Path(path.to_path_buf()),
            desc: PersistentLayerDesc::new_img(
                summary.tenant_id,
                summary.timeline_id,
@@ -289,8 +421,18 @@ impl ImageLayer {
        })
    }

-    fn path(&self) -> Utf8PathBuf {
-        self.path.clone()
+    fn layer_name(&self) -> ImageFileName {
+        self.desc.image_file_name()
+    }
+
+    /// Path to the layer file in pageserver workdir.
+    pub fn path(&self) -> Utf8PathBuf {
+        Self::path_for(
+            &self.path_or_conf,
+            self.desc.timeline_id,
+            self.desc.tenant_id,
+            &self.layer_name(),
+        )
    }
 }

@@ -462,7 +604,7 @@ impl ImageLayerWriterInner {
    ///
    /// Finish writing the image layer.
    ///
-    async fn finish(self, timeline: &Arc<Timeline>) -> anyhow::Result<ResidentLayer> {
+    async fn finish(self) -> anyhow::Result<ImageLayer> {
        let index_start_blk =
            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

@@ -516,14 +658,33 @@ impl ImageLayerWriterInner {
        // Note: Because we open the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
+        let layer = ImageLayer {
+            path_or_conf: PathOrConf::Conf(self.conf),
+            desc,
+            lsn: self.lsn,
+            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
+            inner: OnceCell::new(),
+        };

        // fsync the file
        file.sync_all().await?;

-        // FIXME: why not carry the virtualfile here, it supports renaming?
-        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
+        // Rename the file to its final name
+        //
+        // Note: This overwrites any existing file. There shouldn't be any.
+        // FIXME: throw an error instead?
+        let final_path = ImageLayer::path_for(
+            &PathOrConf::Conf(self.conf),
+            self.timeline_id,
+            self.tenant_id,
+            &ImageFileName {
+                key_range: self.key_range.clone(),
+                lsn: self.lsn,
+            },
+        );
+        std::fs::rename(self.path, final_path)?;

-        trace!("created image layer {}", layer.local_path());
+        trace!("created image layer {}", layer.path());

        Ok(layer)
    }
@@ -585,11 +746,8 @@ impl ImageLayerWriter {
    ///
    /// Finish writing the image layer.
    ///
-    pub(crate) async fn finish(
-        mut self,
-        timeline: &Arc<Timeline>,
-    ) -> anyhow::Result<super::ResidentLayer> {
-        self.inner.take().unwrap().finish(timeline).await
+    pub async fn finish(mut self) -> anyhow::Result<ImageLayer> {
+        self.inner.take().unwrap().finish().await
    }
 }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -10,12 +10,11 @@ use crate::repository::{Key, Value};
 use crate::tenant::block_io::BlockReader;
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
-use crate::tenant::Timeline;
 use crate::walrecord;
 use anyhow::{ensure, Result};
 use pageserver_api::models::InMemoryLayerInfo;
 use std::collections::HashMap;
-use std::sync::{Arc, OnceLock};
+use std::sync::OnceLock;
 use tracing::*;
 use utils::{
    bin_ser::BeSer,
@@ -29,7 +28,7 @@ use std::fmt::Write as _;
 use std::ops::Range;
 use tokio::sync::RwLock;

-use super::{DeltaLayerWriter, ResidentLayer};
+use super::{DeltaLayer, DeltaLayerWriter, Layer};

 pub struct InMemoryLayer {
    conf: &'static PageServerConf,
@@ -208,6 +207,20 @@ impl InMemoryLayer {
    }
 }

+#[async_trait::async_trait]
+impl Layer for InMemoryLayer {
+    async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_data: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<ValueReconstructResult> {
+        self.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
+            .await
+    }
+}
+
 impl std::fmt::Display for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let end_lsn = self.end_lsn_or_max();
@@ -216,13 +229,17 @@ impl std::fmt::Display for InMemoryLayer {
 }

 impl InMemoryLayer {
+    ///
    /// Get layer size.
+    ///
    pub async fn size(&self) -> Result<u64> {
        let inner = self.inner.read().await;
        Ok(inner.file.len())
    }

+    ///
    /// Create a new, empty, in-memory layer
+    ///
    pub async fn create(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
@@ -314,11 +331,7 @@ impl InMemoryLayer {
    /// Write this frozen in-memory layer to disk.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
-    pub(crate) async fn write_to_disk(
-        &self,
-        timeline: &Arc<Timeline>,
-        ctx: &RequestContext,
-    ) -> Result<ResidentLayer> {
+    pub(crate) async fn write_to_disk(&self, ctx: &RequestContext) -> Result<DeltaLayer> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -363,8 +376,7 @@ impl InMemoryLayer {
            }
        }

-        // MAX is used here because we identify L0 layers by full key range
-        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?;
+        let delta_layer = delta_layer_writer.finish(Key::MAX).await?;
        Ok(delta_layer)
    }
 }
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -1,3 +1,4 @@
+use anyhow::Result;
 use core::fmt::Display;
 use std::ops::Range;
 use utils::{
@@ -5,7 +6,7 @@ use utils::{
    lsn::Lsn,
 };

-use crate::repository::Key;
+use crate::{context::RequestContext, repository::Key};

 use super::{DeltaFileName, ImageFileName, LayerFileName};

@@ -99,22 +100,6 @@ impl PersistentLayerDesc {
        }
    }

-    pub fn from_filename(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        filename: LayerFileName,
-        file_size: u64,
-    ) -> Self {
-        match filename {
-            LayerFileName::Image(i) => {
-                Self::new_img(tenant_id, timeline_id, i.key_range, i.lsn, file_size)
-            }
-            LayerFileName::Delta(d) => {
-                Self::new_delta(tenant_id, timeline_id, d.key_range, d.lsn_range, file_size)
-            }
-        }
-    }
-
    /// Get the LSN that the image layer covers.
    pub fn image_layer_lsn(&self) -> Lsn {
        assert!(!self.is_delta);
@@ -188,31 +173,21 @@ impl PersistentLayerDesc {
        self.is_delta
    }

-    pub fn dump(&self) {
-        if self.is_delta {
-            println!(
-                "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----",
-                self.tenant_id,
-                self.timeline_id,
-                self.key_range.start,
-                self.key_range.end,
-                self.lsn_range.start,
-                self.lsn_range.end,
-                self.is_incremental(),
-                self.file_size,
-            );
-        } else {
-            println!(
-                "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
-                self.tenant_id,
-                self.timeline_id,
-                self.key_range.start,
-                self.key_range.end,
-                self.image_layer_lsn(),
-                self.is_incremental(),
-                self.file_size
-            );
-        }
+    pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
+        println!(
+            "----- layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
+            self.tenant_id,
+            self.timeline_id,
+            self.key_range.start,
+            self.key_range.end,
+            self.lsn_range.start,
+            self.lsn_range.end,
+            self.is_delta,
+            self.is_incremental(),
+            self.file_size,
+        );
+
+        Ok(())
    }

    pub fn file_size(&self) -> u64 {
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -0,0 +1,216 @@
+//! A RemoteLayer is an in-memory placeholder for a layer file that exists
+//! in remote storage.
+//!
+use crate::config::PageServerConf;
+use crate::context::RequestContext;
+use crate::repository::Key;
+use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
+use crate::tenant::timeline::layer_manager::LayerManager;
+use anyhow::{bail, Result};
+use camino::Utf8PathBuf;
+use pageserver_api::models::HistoricLayerInfo;
+use std::ops::Range;
+use std::sync::Arc;
+
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+use super::filename::{DeltaFileName, ImageFileName};
+use super::{
+    AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset,
+    LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
+};
+
+/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
+/// [`DeltaLayer`].
+///
+/// RemoteLayer might be downloaded on-demand during operations which are
+/// allowed to download remote layers, during which it gets replaced with a
+/// concrete `DeltaLayer` or `ImageLayer`.
+///
+/// See: [`crate::context::RequestContext`] for authorization to download
+pub struct RemoteLayer {
+    pub desc: PersistentLayerDesc,
+
+    pub layer_metadata: LayerFileMetadata,
+
+    access_stats: LayerAccessStats,
+
+    pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
+
+    /// Whether `LayerMap::replace` has failed for this layer (true) or not (false).
+    ///
+    /// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`.
+    /// The field is used to mark a RemoteLayer permanently (until restart or ignore+load)
+    /// unprocessable, because a LayerMap::replace failed.
+    ///
+    /// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
+    /// a possible fast loop between `Timeline::get_reconstruct_data` and
+    /// `Timeline::download_remote_layer`, which also logs.
+    ///
+    /// [`ongoing_download`]: Self::ongoing_download
+    pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
+}
+
+impl std::fmt::Debug for RemoteLayer {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("RemoteLayer")
+            .field("file_name", &self.desc.filename())
+            .field("layer_metadata", &self.layer_metadata)
+            .field("is_incremental", &self.desc.is_incremental())
+            .finish()
+    }
+}
+
+#[async_trait::async_trait]
+impl Layer for RemoteLayer {
+    async fn get_value_reconstruct_data(
+        &self,
+        _key: Key,
+        _lsn_range: Range<Lsn>,
+        _reconstruct_state: &mut ValueReconstructState,
+        _ctx: &RequestContext,
+    ) -> Result<ValueReconstructResult> {
+        Err(anyhow::anyhow!("layer {self} needs to be downloaded"))
+    }
+}
+
+/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+impl std::fmt::Display for RemoteLayer {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.layer_desc().short_id())
+    }
+}
+
+impl AsLayerDesc for RemoteLayer {
+    fn layer_desc(&self) -> &PersistentLayerDesc {
+        &self.desc
+    }
+}
+
+impl PersistentLayer for RemoteLayer {
+    fn local_path(&self) -> Option<Utf8PathBuf> {
+        None
+    }
+
+    fn delete_resident_layer_file(&self) -> Result<()> {
+        bail!("remote layer has no layer file");
+    }
+
+    fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
+        Some(self)
+    }
+
+    fn is_remote_layer(&self) -> bool {
+        true
+    }
+
+    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
+        let layer_file_name = self.layer_desc().filename().file_name();
+        let lsn_range = self.layer_desc().lsn_range.clone();
+
+        if self.desc.is_delta {
+            HistoricLayerInfo::Delta {
+                layer_file_name,
+                layer_file_size: self.layer_metadata.file_size(),
+                lsn_start: lsn_range.start,
+                lsn_end: lsn_range.end,
+                remote: true,
+                access_stats: self.access_stats.as_api_model(reset),
+            }
+        } else {
+            HistoricLayerInfo::Image {
+                layer_file_name,
+                layer_file_size: self.layer_metadata.file_size(),
+                lsn_start: lsn_range.start,
+                remote: true,
+                access_stats: self.access_stats.as_api_model(reset),
+            }
+        }
+    }
+
+    fn access_stats(&self) -> &LayerAccessStats {
+        &self.access_stats
+    }
+}
+
+impl RemoteLayer {
+    pub fn new_img(
+        tenantid: TenantId,
+        timelineid: TimelineId,
+        fname: &ImageFileName,
+        layer_metadata: &LayerFileMetadata,
+        access_stats: LayerAccessStats,
+    ) -> RemoteLayer {
+        RemoteLayer {
+            desc: PersistentLayerDesc::new_img(
+                tenantid,
+                timelineid,
+                fname.key_range.clone(),
+                fname.lsn,
+                layer_metadata.file_size(),
+            ),
+            layer_metadata: layer_metadata.clone(),
+            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
+            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
+            access_stats,
+        }
+    }
+
+    pub fn new_delta(
+        tenantid: TenantId,
+        timelineid: TimelineId,
+        fname: &DeltaFileName,
+        layer_metadata: &LayerFileMetadata,
+        access_stats: LayerAccessStats,
+    ) -> RemoteLayer {
+        RemoteLayer {
+            desc: PersistentLayerDesc::new_delta(
+                tenantid,
+                timelineid,
+                fname.key_range.clone(),
+                fname.lsn_range.clone(),
+                layer_metadata.file_size(),
+            ),
+            layer_metadata: layer_metadata.clone(),
+            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
+            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
+            access_stats,
+        }
+    }
+
+    /// Create a Layer struct representing this layer, after it has been downloaded.
+    pub(crate) fn create_downloaded_layer(
+        &self,
+        _layer_map_lock_held_witness: &LayerManager,
+        conf: &'static PageServerConf,
+        file_size: u64,
+    ) -> Arc<dyn PersistentLayer> {
+        if self.desc.is_delta {
+            let fname = self.desc.delta_file_name();
+            Arc::new(DeltaLayer::new(
+                conf,
+                self.desc.timeline_id,
+                self.desc.tenant_id,
+                &fname,
+                file_size,
+                self.access_stats
+                    .clone_for_residence_change(LayerResidenceStatus::Resident),
+            ))
+        } else {
+            let fname = self.desc.image_file_name();
+            Arc::new(ImageLayer::new(
+                conf,
+                self.desc.timeline_id,
+                self.desc.tenant_id,
+                &fname,
+                file_size,
+                self.access_stats
+                    .clone_for_residence_change(LayerResidenceStatus::Resident),
+            ))
+        }
+    }
+}
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -12,7 +12,7 @@ use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::{Tenant, TenantState};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::{backoff, completion};
+use utils::completion;

 static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
    once_cell::sync::Lazy::new(|| {
@@ -139,10 +139,7 @@ pub fn start_background_loops(
 /// Compaction task's main loop
 ///
 async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
-    const MAX_BACKOFF_SECS: f64 = 300.0;
-    // How many errors we have seen consequtively
-    let mut error_run_count = 0;
-
+    let wait_duration = Duration::from_secs(2);
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
@@ -179,19 +176,9 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            } else {
                // Run compaction
                if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
-                    let wait_duration = backoff::exponential_backoff_duration_seconds(
-                        error_run_count,
-                        1.0,
-                        MAX_BACKOFF_SECS,
-                    );
-                    error_run_count += 1;
-                    error!(
-                        "Compaction failed {error_run_count} times, retrying in {:?}: {e:?}",
-                        wait_duration
-                    );
-                    Duration::from_secs_f64(wait_duration)
+                    error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
+                    wait_duration
                } else {
-                    error_run_count = 0;
                    period
                }
            };
@@ -215,10 +202,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
 /// GC task's main loop
 ///
 async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
-    const MAX_BACKOFF_SECS: f64 = 300.0;
-    // How many errors we have seen consequtively
-    let mut error_run_count = 0;
-
+    let wait_duration = Duration::from_secs(2);
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        // GC might require downloading, to find the cutoff LSN that corresponds to the
@@ -260,19 +244,9 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                    .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx)
                    .await;
                if let Err(e) = res {
-                    let wait_duration = backoff::exponential_backoff_duration_seconds(
-                        error_run_count,
-                        1.0,
-                        MAX_BACKOFF_SECS,
-                    );
-                    error_run_count += 1;
-                    error!(
-                        "Gc failed {error_run_count} times, retrying in {:?}: {e:?}",
-                        wait_duration
-                    );
-                    Duration::from_secs_f64(wait_duration)
+                    error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
+                    wait_duration
                } else {
-                    error_run_count = 0;
                    period
                }
            };
@@ -361,7 +335,7 @@ pub(crate) fn warn_when_period_overrun(
        // humantime does no significant digits clamping whereas Duration's debug is a bit more
        // intelligent. however it makes sense to keep the "configuration format" for period, even
        // though there's no way to output the actual config value.
-        info!(
+        warn!(
            ?elapsed,
            period = %humantime::format_duration(period),
            ?task,
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -38,14 +38,6 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
    }
    debug!("wal receiver shutdown confirmed");

-    // Shut down the layer flush task before the remote client, as one depends on the other
-    task_mgr::shutdown_tasks(
-        Some(TaskKind::LayerFlushTask),
-        Some(timeline.tenant_id),
-        Some(timeline.timeline_id),
-    )
-    .await;
-
    // Prevent new uploads from starting.
    if let Some(remote_client) = timeline.remote_client.as_ref() {
        let res = remote_client.stop();
@@ -302,7 +294,6 @@ async fn cleanup_remaining_timeline_fs_traces(
    // Remove delete mark
    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
        .await
-        .or_else(fs_ext::ignore_not_found)
        .context("remove delete mark")
 }

--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -29,6 +29,7 @@ use crate::{
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
+        storage_layer::PersistentLayer,
        tasks::{BackgroundLoopKind, RateLimitError},
        timeline::EvictionError,
        LogicalSizeCalculationCause, Tenant,
@@ -209,26 +210,15 @@ impl Timeline {
        // NB: all the checks can be invalidated as soon as we release the layer map lock.
        // We don't want to hold the layer map lock during eviction.
        // So, we just need to deal with this.
-        let candidates: Vec<_> = {
+        let candidates: Vec<Arc<dyn PersistentLayer>> = {
            let guard = self.layers.read().await;
            let layers = guard.layer_map();
            let mut candidates = Vec::new();
            for hist_layer in layers.iter_historic_layers() {
                let hist_layer = guard.get_from_desc(&hist_layer);
-
-                // guard against eviction while we inspect it; it might be that eviction_task and
-                // disk_usage_eviction_task both select the same layers to be evicted, and
-                // seemingly free up double the space. both succeeding is of no consequence.
-                let guard = match hist_layer.keep_resident().await {
-                    Ok(Some(l)) => l,
-                    Ok(None) => continue,
-                    Err(e) => {
-                        // these should not happen, but we cannot make them statically impossible right
-                        // now.
-                        tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}");
-                        continue;
-                    }
-                };
+                if hist_layer.is_remote_layer() {
+                    continue;
+                }

                let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
                    // We only use this fallback if there's an implementation error.
@@ -259,7 +249,7 @@ impl Timeline {
                    }
                };
                if no_activity_for > p.threshold {
-                    candidates.push(guard.drop_eviction_guard())
+                    candidates.push(hist_layer)
                }
            }
            candidates
@@ -278,7 +268,7 @@ impl Timeline {
        };

        let results = match self
-            .evict_layer_batch(remote_client, &candidates, cancel)
+            .evict_layer_batch(remote_client, &candidates[..], cancel.clone())
            .await
        {
            Err(pre_err) => {
@@ -289,7 +279,7 @@ impl Timeline {
            Ok(results) => results,
        };
        assert_eq!(results.len(), candidates.len());
-        for result in results {
+        for (l, result) in candidates.iter().zip(results) {
            match result {
                None => {
                    stats.skipped_for_shutdown += 1;
@@ -297,10 +287,24 @@ impl Timeline {
                Some(Ok(())) => {
                    stats.evicted += 1;
                }
-                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
+                Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
+                    stats.not_evictable += 1;
+                }
+                Some(Err(EvictionError::FileNotFound)) => {
                    // compaction/gc removed the file while we were waiting on layer_removal_cs
                    stats.not_evictable += 1;
                }
+                Some(Err(
+                    e @ EvictionError::LayerNotFound(_) | e @ EvictionError::StatFailed(_),
+                )) => {
+                    let e = utils::error::report_compact_sources(&e);
+                    warn!(layer = %l, "failed to evict layer: {e}");
+                    stats.not_evictable += 1;
+                }
+                Some(Err(EvictionError::MetadataInconsistency(detail))) => {
+                    warn!(layer = %l, "failed to evict layer: {detail}");
+                    stats.not_evictable += 1;
+                }
            }
        }
        if stats.candidates == stats.not_evictable {
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -12,16 +12,27 @@ use crate::{
    tenant::{
        layer_map::{BatchedUpdates, LayerMap},
        storage_layer::{
-            AsLayerDesc, InMemoryLayer, Layer, PersistentLayerDesc, PersistentLayerKey,
-            ResidentLayer,
+            AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, PersistentLayer,
+            PersistentLayerDesc, PersistentLayerKey,
        },
+        timeline::compare_arced_layers,
    },
 };

 /// Provides semantic APIs to manipulate the layer map.
 pub(crate) struct LayerManager {
    layer_map: LayerMap,
-    layer_fmgr: LayerFileManager<Layer>,
+    layer_fmgr: LayerFileManager,
+}
+
+/// After GC, the layer map changes are not applied immediately. Callers must apply them manually (via
+/// `flush`) after scheduling deletes in the remote client.
+pub(crate) struct ApplyGcResultGuard<'a>(BatchedUpdates<'a>);
+
+impl ApplyGcResultGuard<'_> {
+    pub(crate) fn flush(self) {
+        self.0.flush();
+    }
 }

 impl LayerManager {
@@ -32,7 +43,7 @@ impl LayerManager {
        }
    }

-    pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
+    pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
        self.layer_fmgr.get_from_desc(desc)
    }

@@ -44,12 +55,21 @@ impl LayerManager {
        &self.layer_map
    }

+    /// Replace layers in the layer file manager, used in evictions and layer downloads.
+    pub(crate) fn replace_and_verify(
+        &mut self,
+        expected: Arc<dyn PersistentLayer>,
+        new: Arc<dyn PersistentLayer>,
+    ) -> Result<()> {
+        self.layer_fmgr.replace_and_verify(expected, new)
+    }
+
    /// Called from `load_layer_map`. Initialize the layer manager with:
    /// 1. all on-disk layers
    /// 2. next open layer (with disk disk_consistent_lsn LSN)
    pub(crate) fn initialize_local_layers(
        &mut self,
-        on_disk_layers: Vec<Layer>,
+        on_disk_layers: Vec<Arc<dyn PersistentLayer>>,
        next_open_layer_at: Lsn,
    ) {
        let mut updates = self.layer_map.batch_update();
@@ -144,19 +164,10 @@ impl LayerManager {
    }

    /// Add image layers to the layer map, called from `create_image_layers`.
-    pub(crate) fn track_new_image_layers(
-        &mut self,
-        image_layers: &[ResidentLayer],
-        metrics: &TimelineMetrics,
-    ) {
+    pub(crate) fn track_new_image_layers(&mut self, image_layers: Vec<ImageLayer>) {
        let mut updates = self.layer_map.batch_update();
        for layer in image_layers {
-            Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
-
-            // record these here instead of Layer::finish_creating because otherwise partial
-            // failure with create_image_layers would balloon up the physical size gauge. downside
-            // is that all layers need to be created before metrics are updated.
-            metrics.record_new_file_metrics(layer.layer_desc().file_size);
+            Self::insert_historic_layer(Arc::new(layer), &mut updates, &mut self.layer_fmgr);
        }
        updates.flush();
    }
@@ -164,71 +175,76 @@ impl LayerManager {
    /// Flush a frozen layer and add the written delta layer to the layer map.
    pub(crate) fn finish_flush_l0_layer(
        &mut self,
-        delta_layer: Option<&ResidentLayer>,
+        delta_layer: Option<DeltaLayer>,
        frozen_layer_for_check: &Arc<InMemoryLayer>,
-        metrics: &TimelineMetrics,
    ) {
-        let inmem = self
-            .layer_map
-            .frozen_layers
-            .pop_front()
-            .expect("there must be a inmem layer to flush");
+        let l = self.layer_map.frozen_layers.pop_front();
+        let mut updates = self.layer_map.batch_update();

-        // Only one task may call this function at a time (for this
-        // timeline). If two tasks tried to flush the same frozen
+        // Only one thread may call this function at a time (for this
+        // timeline). If two threads tried to flush the same frozen
        // layer to disk at the same time, that would not work.
-        assert_eq!(Arc::as_ptr(&inmem), Arc::as_ptr(frozen_layer_for_check));
+        assert!(compare_arced_layers(&l.unwrap(), frozen_layer_for_check));

-        if let Some(l) = delta_layer {
-            let mut updates = self.layer_map.batch_update();
-            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
-            metrics.record_new_file_metrics(l.layer_desc().file_size);
-            updates.flush();
+        if let Some(delta_layer) = delta_layer {
+            Self::insert_historic_layer(Arc::new(delta_layer), &mut updates, &mut self.layer_fmgr);
        }
+        updates.flush();
    }

    /// Called when compaction is completed.
    pub(crate) fn finish_compact_l0(
        &mut self,
-        layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
-        compact_from: &[Layer],
-        compact_to: &[ResidentLayer],
+        layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
+        compact_from: Vec<Arc<dyn PersistentLayer>>,
+        compact_to: Vec<Arc<dyn PersistentLayer>>,
        metrics: &TimelineMetrics,
-    ) {
+    ) -> Result<()> {
        let mut updates = self.layer_map.batch_update();
        for l in compact_to {
-            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
-            metrics.record_new_file_metrics(l.layer_desc().file_size);
+            Self::insert_historic_layer(l, &mut updates, &mut self.layer_fmgr);
        }
        for l in compact_from {
-            Self::delete_historic_layer(layer_removal_cs, l, &mut updates, &mut self.layer_fmgr);
+            // NB: the layer file identified by descriptor `l` is guaranteed to be present
+            // in the LayerFileManager because compaction kept holding `layer_removal_cs` the entire
+            // time, even though we dropped `Timeline::layers` inbetween.
+            Self::delete_historic_layer(
+                layer_removal_cs.clone(),
+                l,
+                &mut updates,
+                metrics,
+                &mut self.layer_fmgr,
+            )?;
        }
        updates.flush();
+        Ok(())
    }

    /// Called when garbage collect the timeline. Returns a guard that will apply the updates to the layer map.
    pub(crate) fn finish_gc_timeline(
        &mut self,
-        layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
-        gc_layers: Vec<Layer>,
-    ) {
+        layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
+        gc_layers: Vec<Arc<dyn PersistentLayer>>,
+        metrics: &TimelineMetrics,
+    ) -> Result<ApplyGcResultGuard> {
        let mut updates = self.layer_map.batch_update();
        for doomed_layer in gc_layers {
            Self::delete_historic_layer(
-                layer_removal_cs,
-                &doomed_layer,
+                layer_removal_cs.clone(),
+                doomed_layer,
                &mut updates,
+                metrics,
                &mut self.layer_fmgr,
-            );
+            )?; // FIXME: schedule succeeded deletions in timeline.rs `gc_timeline` instead of in batch?
        }
-        updates.flush()
+        Ok(ApplyGcResultGuard(updates))
    }

    /// Helper function to insert a layer into the layer map and file manager.
    fn insert_historic_layer(
-        layer: Layer,
+        layer: Arc<dyn PersistentLayer>,
        updates: &mut BatchedUpdates<'_>,
-        mapping: &mut LayerFileManager<Layer>,
+        mapping: &mut LayerFileManager,
    ) {
        updates.insert_historic(layer.layer_desc().clone());
        mapping.insert(layer);
@@ -238,12 +254,17 @@ impl LayerManager {
    /// Remote storage is not affected by this operation.
    fn delete_historic_layer(
        // we cannot remove layers otherwise, since gc and compaction will race
-        _layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
-        layer: &Layer,
+        _layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
+        layer: Arc<dyn PersistentLayer>,
        updates: &mut BatchedUpdates<'_>,
-        mapping: &mut LayerFileManager<Layer>,
-    ) {
+        metrics: &TimelineMetrics,
+        mapping: &mut LayerFileManager,
+    ) -> anyhow::Result<()> {
        let desc = layer.layer_desc();
+        if !layer.is_remote_layer() {
+            layer.delete_resident_layer_file()?;
+            metrics.resident_physical_size_sub(desc.file_size);
+        }

        // TODO Removing from the bottom of the layer map is expensive.
        //      Maybe instead discard all layer map historic versions that
@@ -252,18 +273,21 @@ impl LayerManager {
        //      map index without actually rebuilding the index.
        updates.remove_historic(desc);
        mapping.remove(layer);
-        layer.garbage_collect_on_drop();
+
+        Ok(())
    }

-    pub(crate) fn contains(&self, layer: &Layer) -> bool {
+    pub(crate) fn contains(&self, layer: &Arc<dyn PersistentLayer>) -> bool {
        self.layer_fmgr.contains(layer)
    }
 }

-pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
+pub(crate) struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
+    HashMap<PersistentLayerKey, Arc<T>>,
+);

-impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
-    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
+impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
+    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<T> {
        // The assumption for the `expect()` is that all code maintains the following invariant:
        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
        self.0
@@ -273,14 +297,14 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
            .clone()
    }

-    pub(crate) fn insert(&mut self, layer: T) {
+    pub(crate) fn insert(&mut self, layer: Arc<T>) {
        let present = self.0.insert(layer.layer_desc().key(), layer.clone());
        if present.is_some() && cfg!(debug_assertions) {
            panic!("overwriting a layer: {:?}", layer.layer_desc())
        }
    }

-    pub(crate) fn contains(&self, layer: &T) -> bool {
+    pub(crate) fn contains(&self, layer: &Arc<T>) -> bool {
        self.0.contains_key(&layer.layer_desc().key())
    }

@@ -288,7 +312,7 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
        Self(HashMap::new())
    }

-    pub(crate) fn remove(&mut self, layer: &T) {
+    pub(crate) fn remove(&mut self, layer: Arc<T>) {
        let present = self.0.remove(&layer.layer_desc().key());
        if present.is_none() && cfg!(debug_assertions) {
            panic!(
@@ -297,4 +321,39 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
            )
        }
    }
+
+    pub(crate) fn replace_and_verify(&mut self, expected: Arc<T>, new: Arc<T>) -> Result<()> {
+        let key = expected.layer_desc().key();
+        let other = new.layer_desc().key();
+
+        let expected_l0 = LayerMap::is_l0(expected.layer_desc());
+        let new_l0 = LayerMap::is_l0(new.layer_desc());
+
+        fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!(
+            "layermap-replace-notfound"
+        ));
+
+        anyhow::ensure!(
+            key == other,
+            "expected and new layer have different keys: {key:?} != {other:?}"
+        );
+
+        anyhow::ensure!(
+            expected_l0 == new_l0,
+            "one layer is l0 while the other is not: {expected_l0} != {new_l0}"
+        );
+
+        if let Some(layer) = self.0.get_mut(&key) {
+            anyhow::ensure!(
+                compare_arced_layers(&expected, layer),
+                "another layer was found instead of expected, expected={expected:?}, new={new:?}",
+                expected = Arc::as_ptr(&expected),
+                new = Arc::as_ptr(layer),
+            );
+            *layer = new;
+            Ok(())
+        } else {
+            anyhow::bail!("layer was not found");
+        }
+    }
 }
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -26,7 +26,8 @@ use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
 use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::SubscribeSafekeeperInfoRequest;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
-use storage_broker::{BrokerClientChannel, Code, Streaming};
+use storage_broker::BrokerClientChannel;
+use storage_broker::Streaming;
 use tokio::select;
 use tracing::*;

@@ -136,17 +137,8 @@ pub(super) async fn connection_manager_loop_step(
            broker_update = broker_subscription.message() => {
                match broker_update {
                    Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
-                    Err(status) => {
-                        match status.code() {
-                            Code::Unknown if status.message().contains("stream closed because of a broken pipe") => {
-                                // tonic's error handling doesn't provide a clear code for disconnections: we get
-                                // "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe"
-                                info!("broker disconnected: {status}");
-                            },
-                            _ => {
-                                warn!("broker subscription failed: {status}");
-                            }
-                        }
+                    Err(e) => {
+                        error!("broker subscription failed: {e}");
                        return ControlFlow::Continue(());
                    }
                    Ok(None) => {
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -122,7 +122,7 @@ pub(super) async fn handle_walreceiver_connection(
    // Connect to the database in replication mode.
    info!("connecting to {wal_source_connconf:?}");

-    let (replication_client, connection) = {
+    let (mut replication_client, connection) = {
        let mut config = wal_source_connconf.to_tokio_postgres_config();
        config.application_name("pageserver");
        config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
@@ -205,7 +205,7 @@ pub(super) async fn handle_walreceiver_connection(
        gauge.dec();
    }

-    let identify = identify_system(&replication_client).await?;
+    let identify = identify_system(&mut replication_client).await?;
    info!("{identify:?}");

    let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
@@ -444,7 +444,7 @@ struct IdentifySystem {
 struct IdentifyError;

 /// Run the postgres `IDENTIFY_SYSTEM` command
-async fn identify_system(client: &Client) -> anyhow::Result<IdentifySystem> {
+async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem> {
    let query_str = "IDENTIFY_SYSTEM";
    let response = client.simple_query(query_str).await?;

@@ -459,7 +459,7 @@ async fn identify_system(client: &Client) -> anyhow::Result<IdentifySystem> {

    // extract the row contents into an IdentifySystem struct.
    // written as a closure so I can use ? for Option here.
-    if let Some(SimpleQueryMessage::Row(first_row)) = response.first() {
+    if let Some(SimpleQueryMessage::Row(first_row)) = response.get(0) {
        Ok(IdentifySystem {
            systemid: get_parse(first_row, 0)?,
            timeline: get_parse(first_row, 1)?,
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -1,5 +1,4 @@
 use super::storage_layer::LayerFileName;
-use super::storage_layer::ResidentLayer;
 use super::Generation;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
@@ -80,14 +79,6 @@ pub(crate) struct UploadQueueInitialized {
    /// tasks to finish. For example, metadata upload cannot be performed before all
    /// preceding layer file uploads have completed.
    pub(crate) queued_operations: VecDeque<UploadOp>,
-
-    /// Files which have been unlinked but not yet had scheduled a deletion for. Only kept around
-    /// for error logging.
-    ///
-    /// Putting this behind a testing feature to catch problems in tests, but assuming we could have a
-    /// bug causing leaks, then it's better to not leave this enabled for production builds.
-    #[cfg(feature = "testing")]
-    pub(crate) dangling_files: HashMap<LayerFileName, Generation>,
 }

 impl UploadQueueInitialized {
@@ -144,8 +135,6 @@ impl UploadQueue {
            num_inprogress_deletions: 0,
            inprogress_tasks: HashMap::new(),
            queued_operations: VecDeque::new(),
-            #[cfg(feature = "testing")]
-            dangling_files: HashMap::new(),
        };

        *self = UploadQueue::Initialized(state);
@@ -191,8 +180,6 @@ impl UploadQueue {
            num_inprogress_deletions: 0,
            inprogress_tasks: HashMap::new(),
            queued_operations: VecDeque::new(),
-            #[cfg(feature = "testing")]
-            dangling_files: HashMap::new(),
        };

        *self = UploadQueue::Initialized(state);
@@ -216,6 +203,18 @@ impl UploadQueue {
            UploadQueue::Stopped(stopped) => Ok(stopped),
        }
    }
+
+    pub(crate) fn get_layer_metadata(
+        &self,
+        name: &LayerFileName,
+    ) -> anyhow::Result<Option<LayerFileMetadata>> {
+        match self {
+            UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
+                anyhow::bail!("queue is in state {}", self.as_str())
+            }
+            UploadQueue::Initialized(inner) => Ok(inner.latest_files.get(name).cloned()),
+        }
+    }
 }

 /// An in-progress upload or delete task.
@@ -238,7 +237,7 @@ pub(crate) struct Delete {
 #[derive(Debug)]
 pub(crate) enum UploadOp {
    /// Upload a layer file
-    UploadLayer(ResidentLayer, LayerFileMetadata),
+    UploadLayer(LayerFileName, LayerFileMetadata),

    /// Upload the metadata file
    UploadMetadata(IndexPart, Lsn),
@@ -253,13 +252,13 @@ pub(crate) enum UploadOp {
 impl std::fmt::Display for UploadOp {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
-            UploadOp::UploadLayer(layer, metadata) => {
+            UploadOp::UploadLayer(path, metadata) => {
                write!(
                    f,
                    "UploadLayer({}, size={:?}, gen={:?})",
-                    layer,
+                    path.file_name(),
                    metadata.file_size(),
-                    metadata.generation
+                    metadata.generation,
                )
            }
            UploadOp::UploadMetadata(_, lsn) => {
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -19,7 +19,6 @@ use std::io::{Error, ErrorKind, Seek, SeekFrom};
 use std::os::unix::fs::FileExt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{RwLock, RwLockWriteGuard};
-use utils::fs_ext;

 ///
 /// A virtual file descriptor. You can use this just like std::fs::File, but internally
@@ -174,78 +173,37 @@ impl OpenFiles {
    }
 }

-/// Identify error types that should alwways terminate the process.  Other
-/// error types may be elegible for retry.
-pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
-    use nix::errno::Errno::*;
-    match e.raw_os_error().map(nix::errno::from_i32) {
-        Some(EIO) => {
-            // Terminate on EIO because we no longer trust the device to store
-            // data safely, or to uphold persistence guarantees on fsync.
-            true
-        }
-        Some(EROFS) => {
-            // Terminate on EROFS because a filesystem is usually remounted
-            // readonly when it has experienced some critical issue, so the same
-            // logic as EIO applies.
-            true
-        }
-        Some(EACCES) => {
-            // Terminate on EACCESS because we should always have permissions
-            // for our own data dir: if we don't, then we can't do our job and
-            // need administrative intervention to fix permissions.  Terminating
-            // is the best way to make sure we stop cleanly rather than going
-            // into infinite retry loops, and will make it clear to the outside
-            // world that we need help.
-            true
-        }
-        _ => {
-            // Treat all other local file I/O errors are retryable.  This includes:
-            // - ENOSPC: we stay up and wait for eviction to free some space
-            // - EINVAL, EBADF, EBADFD: this is a code bug, not a filesystem/hardware issue
-            // - WriteZero, Interrupted: these are used internally VirtualFile
-            false
-        }
-    }
+#[derive(Debug, thiserror::Error)]
+pub enum CrashsafeOverwriteError {
+    #[error("final path has no parent dir")]
+    FinalPathHasNoParentDir,
+    #[error("remove tempfile")]
+    RemovePreviousTempfile(#[source] std::io::Error),
+    #[error("create tempfile")]
+    CreateTempfile(#[source] std::io::Error),
+    #[error("write tempfile")]
+    WriteContents(#[source] std::io::Error),
+    #[error("sync tempfile")]
+    SyncTempfile(#[source] std::io::Error),
+    #[error("rename tempfile to final path")]
+    RenameTempfileToFinalPath(#[source] std::io::Error),
+    #[error("open final path parent dir")]
+    OpenFinalPathParentDir(#[source] std::io::Error),
+    #[error("sync final path parent dir")]
+    SyncFinalPathParentDir(#[source] std::io::Error),
 }
-
-/// Call this when the local filesystem gives us an error with an external
-/// cause: this includes EIO, EROFS, and EACCESS: all these indicate either
-/// bad storage or bad configuration, and we can't fix that from inside
-/// a running process.
-pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! {
-    tracing::error!("Fatal I/O error: {e}: {context})");
-    std::process::abort();
-}
-
-pub(crate) trait MaybeFatalIo<T> {
-    fn maybe_fatal_err(self, context: &str) -> std::io::Result<T>;
-    fn fatal_err(self, context: &str) -> T;
-}
-
-impl<T> MaybeFatalIo<T> for std::io::Result<T> {
-    /// Terminate the process if the result is an error of a fatal type, else pass it through
-    ///
-    /// This is appropriate for writes, where we typically want to die on EIO/ACCES etc, but
-    /// not on ENOSPC.
-    fn maybe_fatal_err(self, context: &str) -> std::io::Result<T> {
-        if let Err(e) = &self {
-            if is_fatal_io_error(e) {
-                on_fatal_io_error(e, context);
-            }
-        }
-        self
-    }
-
-    /// Terminate the process on any I/O error.
-    ///
-    /// This is appropriate for reads on files that we know exist: they should always work.
-    fn fatal_err(self, context: &str) -> T {
+impl CrashsafeOverwriteError {
+    /// Returns true iff the new contents are durably stored at the final path despite this error.
+    pub fn are_new_contents_durable(&self) -> bool {
         match self {
-            Ok(v) => v,
-            Err(e) => {
-                on_fatal_io_error(&e, context);
-            }
+            Self::FinalPathHasNoParentDir => false,
+            Self::RemovePreviousTempfile(_) => false,
+            Self::CreateTempfile(_) => false,
+            Self::WriteContents(_) => false,
+            Self::SyncTempfile(_) => false,
+            Self::RenameTempfileToFinalPath(_) => false,
+            Self::OpenFinalPathParentDir(_) => false,
+            Self::SyncFinalPathParentDir(_) => false, // dir fsync failed: the rename may not survive a crash
         }
     }
 }
@@ -326,13 +284,15 @@ impl VirtualFile {
        final_path: &Utf8Path,
        tmp_path: &Utf8Path,
        content: &[u8],
-    ) -> std::io::Result<()> {
+    ) -> Result<(), CrashsafeOverwriteError> {
        let Some(final_path_parent) = final_path.parent() else {
-            return Err(std::io::Error::from_raw_os_error(
-                nix::errno::Errno::EINVAL as i32,
-            ));
+            return Err(CrashsafeOverwriteError::FinalPathHasNoParentDir);
        };
-        std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?;
+        match std::fs::remove_file(tmp_path) {
+            Ok(()) => {}
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
+            Err(e) => return Err(CrashsafeOverwriteError::RemovePreviousTempfile(e)),
+        }
        let mut file = Self::open_with_options(
            tmp_path,
            OpenOptions::new()
@@ -341,20 +301,31 @@ impl VirtualFile {
                // we bail out instead of causing damage.
                .create_new(true),
        )
-        .await?;
-        file.write_all(content).await?;
-        file.sync_all().await?;
+        .await
+        .map_err(CrashsafeOverwriteError::CreateTempfile)?;
+        file.write_all(content)
+            .await
+            .map_err(CrashsafeOverwriteError::WriteContents)?;
+        file.sync_all()
+            .await
+            .map_err(CrashsafeOverwriteError::SyncTempfile)?;
        drop(file); // before the rename, that's important!
                    // renames are atomic
-        std::fs::rename(tmp_path, final_path)?;
+        std::fs::rename(tmp_path, final_path)
+            .map_err(CrashsafeOverwriteError::RenameTempfileToFinalPath)?;
        // Only open final path parent dirfd now, so that this operation only
        // ever holds one VirtualFile fd at a time.  That's important because
        // the current `find_victim_slot` impl might pick the same slot for both
        // VirtualFile., and it eventually does a blocking write lock instead of
        // try_lock.
        let final_parent_dirfd =
-            Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?;
-        final_parent_dirfd.sync_all().await?;
+            Self::open_with_options(final_path_parent, OpenOptions::new().read(true))
+                .await
+                .map_err(CrashsafeOverwriteError::OpenFinalPathParentDir)?;
+        final_parent_dirfd
+            .sync_all()
+            .await
+            .map_err(CrashsafeOverwriteError::SyncFinalPathParentDir)?;
        Ok(())
    }

--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -443,7 +443,7 @@ impl<'a> WalIngest<'a> {
        &mut self,
        buf: &mut Bytes,
        modification: &mut DatadirModification<'_>,
-        decoded: &DecodedWALRecord,
+        decoded: &mut DecodedWALRecord,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        // Handle VM bit updates that are implicitly part of heap records.
@@ -749,7 +749,7 @@ impl<'a> WalIngest<'a> {
        &mut self,
        buf: &mut Bytes,
        modification: &mut DatadirModification<'_>,
-        decoded: &DecodedWALRecord,
+        decoded: &mut DecodedWALRecord,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        // Handle VM bit updates that are implicitly part of heap records.
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -857,8 +857,7 @@ impl WalRedoProcess {
            let in_revents = stdin_pollfds[0].revents().unwrap();
            if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
                nwrite += proc.stdin.write(&writebuf[nwrite..])?;
-            }
-            if in_revents.contains(PollFlags::POLLHUP) {
+            } else if in_revents.contains(PollFlags::POLLHUP) {
                // We still have more data to write, but the process closed the pipe.
                anyhow::bail!("WAL redo process closed its stdin unexpectedly");
            }
@@ -908,8 +907,7 @@ impl WalRedoProcess {
                let out_revents = stdout_pollfds[0].revents().unwrap();
                if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
                    nresult += output.stdout.read(&mut resultbuf[nresult..])?;
-                }
-                if out_revents.contains(PollFlags::POLLHUP) {
+                } else if out_revents.contains(PollFlags::POLLHUP) {
                    anyhow::bail!("WAL redo process closed its stdout unexpectedly");
                }
            }
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -19,7 +19,6 @@
 #include "access/xlog.h"
 #include "access/xlogutils.h"
 #include "storage/buf_internals.h"
-#include "c.h"

 #include "libpq-fe.h"
 #include "libpq/pqformat.h"
@@ -64,21 +63,6 @@ int			max_reconnect_attempts = 60;
 bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;

 static bool pageserver_flush(void);
-static void pageserver_disconnect(void);
-
-
-static pqsigfunc	 prev_signal_handler;
-
-static void
-pageserver_sighup_handler(SIGNAL_ARGS)
-{
-	if (prev_signal_handler)
-	{
-        	prev_signal_handler(postgres_signal_arg);
-	}
-	neon_log(LOG, "Received SIGHUP, disconnecting pageserver. New pageserver connstring is %s", page_server_connstring);
-	pageserver_disconnect();
-}

 static bool
 pageserver_connect(int elevel)
@@ -416,7 +400,7 @@ pg_init_libpagestore(void)
 							   NULL,
 							   &page_server_connstring,
 							   "",
-							   PGC_SIGHUP,
+							   PGC_POSTMASTER,
 							   0,	/* no flags required */
 							   NULL, NULL, NULL);

@@ -498,8 +482,5 @@ pg_init_libpagestore(void)
 		old_redo_read_buffer_filter = redo_read_buffer_filter;
 		redo_read_buffer_filter = neon_redo_read_buffer_filter;
 	}
-
-        prev_signal_handler = pqsignal(SIGHUP, pageserver_sighup_handler);
-
 	lfc_init();
 }
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -88,7 +88,7 @@ static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd);
 static void WalSndLoop(WalProposer *wp);
 static void XLogBroadcastWalProposer(WalProposer *wp);

-static void XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr);
+static void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr);
 static void XLogWalPropClose(XLogRecPtr recptr);

 static void
@@ -1241,7 +1241,7 @@ WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XL
 				rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE;

 				/* write WAL to disk */
-				XLogWalPropWrite(sk->wp, &buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn);
+				XLogWalPropWrite(&buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn);

 				ereport(DEBUG1,
 						(errmsg("Recover message %X/%X length %d",
@@ -1283,24 +1283,11 @@ static XLogSegNo walpropSegNo = 0;
 * Write XLOG data to disk.
 */
 static void
-XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr)
+XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr)
 {
 	int			startoff;
 	int			byteswritten;

-	/*
-	 * Apart from walproposer, basebackup LSN page is also written out by
-	 * postgres itself which writes WAL only in pages, and in basebackup it is
-	 * inherently dummy (only safekeepers have historic WAL). Update WAL buffers
-	 * here to avoid dummy page overwriting correct one we download here. Ugly,
-	 * but alternatives are about the same ugly. We won't need that if we switch
-	 * to on-demand WAL download from safekeepers, without writing to disk.
-	 *
-	 * https://github.com/neondatabase/neon/issues/5749
-	 */
-	if (!wp->config->syncSafekeepers)
-		XLogUpdateWalBuffers(buf, recptr, nbytes);
-
 	while (nbytes > 0)
 	{
 		int			segbytes;
--- a/pgxn/neon_walredo/walredoproc.c
+++ b/pgxn/neon_walredo/walredoproc.c
@@ -201,16 +201,6 @@ WalRedoMain(int argc, char *argv[])
 #endif

 	am_wal_redo_postgres = true;
-	/*
-	 * Pageserver treats any output to stderr as an ERROR, so we must
-	 * set the log level as early as possible to only log FATAL and 
-	 * above during WAL redo (note that loglevel ERROR also logs LOG,
-	 * which is super strange but that's not something we can solve
-	 * for here. ¯\_(-_-)_/¯
-	 */
-	SetConfigOption("log_min_messages", "FATAL", PGC_SUSET, PGC_S_OVERRIDE);
-	SetConfigOption("client_min_messages", "ERROR", PGC_SUSET,
-					PGC_S_OVERRIDE);

 	/*
 	 * WAL redo does not need a large number of buffers. And speed of
@@ -895,12 +885,7 @@ apply_error_callback(void *arg)
 	StringInfoData buf;

 	initStringInfo(&buf);
-#if PG_VERSION_NUM >= 150000
-	if (record->record)
-#else
-	if (record->decoded_record)
-#endif
-		xlog_outdesc(&buf, record);
+	xlog_outdesc(&buf, record);

 	/* translator: %s is a WAL record description */
 	errcontext("WAL redo at %X/%X for %s",
--- a/poetry.lock
+++ b/poetry.lock
@@ -2447,20 +2447,20 @@ test = ["websockets"]

 [[package]]
 name = "werkzeug"
-version = "3.0.1"
+version = "2.2.3"
 description = "The comprehensive WSGI web application library."
 optional = false
-python-versions = ">=3.8"
+python-versions = ">=3.7"
 files = [
-    {file = "werkzeug-3.0.1-py3-none-any.whl", hash = "sha256:90a285dc0e42ad56b34e696398b8122ee4c681833fb35b8334a095d82c56da10"},
-    {file = "werkzeug-3.0.1.tar.gz", hash = "sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc"},
+    {file = "Werkzeug-2.2.3-py3-none-any.whl", hash = "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"},
+    {file = "Werkzeug-2.2.3.tar.gz", hash = "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe"},
 ]

 [package.dependencies]
 MarkupSafe = ">=2.1.1"

 [package.extras]
-watchdog = ["watchdog (>=2.3)"]
+watchdog = ["watchdog"]

 [[package]]
 name = "wrapt"
@@ -2719,4 +2719,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "74649cf47c52f21b01b096a42044750b1c9677576b405be0489c2909127a9bf1"
+content-hash = "c5981d8d7c2deadd47c823bc35f86f830c8e320b653d2d3718bade1f4d2dabca"
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -3,9 +3,7 @@ mod hacks;
 mod link;

 pub use link::LinkAuthError;
-use tokio_postgres::config::AuthKeys;

-use crate::proxy::{handle_try_wake, retry_after, LatencyTimer};
 use crate::{
    auth::{self, ClientCredentials},
    config::AuthenticationConfig,
@@ -18,9 +16,8 @@ use crate::{
 };
 use futures::TryFutureExt;
 use std::borrow::Cow;
-use std::ops::ControlFlow;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{error, info, warn};
+use tracing::info;

 /// A product of successful authentication.
 pub struct AuthSuccess<T> {
@@ -120,28 +117,22 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
    }
 }

-pub enum ComputeCredentials {
-    Password(Vec<u8>),
-    AuthKeys(AuthKeys),
-}
-
 /// True to its name, this function encapsulates our current auth trade-offs.
 /// Here, we choose the appropriate auth flow based on circumstances.
-async fn auth_quirks_creds(
+async fn auth_quirks(
    api: &impl console::Api,
    extra: &ConsoleReqExtra<'_>,
    creds: &mut ClientCredentials<'_>,
    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
    allow_cleartext: bool,
    config: &'static AuthenticationConfig,
-    latency_timer: &mut LatencyTimer,
-) -> auth::Result<AuthSuccess<ComputeCredentials>> {
+) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
    // If there's no project so far, that entails that client doesn't
    // support SNI or other means of passing the endpoint (project) name.
    // We now expect to see a very specific payload in the place of password.
    if creds.project.is_none() {
        // Password will be checked by the compute node later.
-        return hacks::password_hack(creds, client, latency_timer).await;
+        return hacks::password_hack(api, extra, creds, client).await;
    }

    // Password hack should set the project name.
@@ -152,63 +143,11 @@ async fn auth_quirks_creds(
    // Currently, we use it for websocket connections (latency).
    if allow_cleartext {
        // Password will be checked by the compute node later.
-        return hacks::cleartext_hack(client, latency_timer).await;
+        return hacks::cleartext_hack(api, extra, creds, client).await;
    }

    // Finally, proceed with the main auth flow (SCRAM-based).
-    classic::authenticate(api, extra, creds, client, config, latency_timer).await
-}
-
-/// True to its name, this function encapsulates our current auth trade-offs.
-/// Here, we choose the appropriate auth flow based on circumstances.
-async fn auth_quirks(
-    api: &impl console::Api,
-    extra: &ConsoleReqExtra<'_>,
-    creds: &mut ClientCredentials<'_>,
-    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-    allow_cleartext: bool,
-    config: &'static AuthenticationConfig,
-    latency_timer: &mut LatencyTimer,
-) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
-    let auth_stuff = auth_quirks_creds(
-        api,
-        extra,
-        creds,
-        client,
-        allow_cleartext,
-        config,
-        latency_timer,
-    )
-    .await?;
-
-    let mut num_retries = 0;
-    let mut node = loop {
-        let wake_res = api.wake_compute(extra, creds).await;
-        match handle_try_wake(wake_res, num_retries) {
-            Err(e) => {
-                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
-                return Err(e.into());
-            }
-            Ok(ControlFlow::Continue(e)) => {
-                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
-            }
-            Ok(ControlFlow::Break(n)) => break n,
-        }
-
-        let wait_duration = retry_after(num_retries);
-        num_retries += 1;
-        tokio::time::sleep(wait_duration).await;
-    };
-
-    match auth_stuff.value {
-        ComputeCredentials::Password(password) => node.config.password(password),
-        ComputeCredentials::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
-    };
-
-    Ok(AuthSuccess {
-        reported_auth_ok: auth_stuff.reported_auth_ok,
-        value: node,
-    })
+    classic::authenticate(api, extra, creds, client, config).await
 }

 impl BackendType<'_, ClientCredentials<'_>> {
@@ -244,7 +183,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
        client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
        allow_cleartext: bool,
        config: &'static AuthenticationConfig,
-        latency_timer: &mut LatencyTimer,
    ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
        use BackendType::*;

@@ -257,16 +195,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
                );

                let api = api.as_ref();
-                auth_quirks(
-                    api,
-                    extra,
-                    creds,
-                    client,
-                    allow_cleartext,
-                    config,
-                    latency_timer,
-                )
-                .await?
+                auth_quirks(api, extra, creds, client, allow_cleartext, config).await?
            }
            Postgres(api, creds) => {
                info!(
@@ -276,16 +205,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
                );

                let api = api.as_ref();
-                auth_quirks(
-                    api,
-                    extra,
-                    creds,
-                    client,
-                    allow_cleartext,
-                    config,
-                    latency_timer,
-                )
-                .await?
+                auth_quirks(api, extra, creds, client, allow_cleartext, config).await?
            }
            // NOTE: this auth backend doesn't use client credentials.
            Link(url) => {
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -1,15 +1,17 @@
-use super::{AuthSuccess, ComputeCredentials};
+use std::ops::ControlFlow;
+
+use super::AuthSuccess;
 use crate::{
    auth::{self, AuthFlow, ClientCredentials},
    compute,
    config::AuthenticationConfig,
-    console::{self, AuthInfo, ConsoleReqExtra},
-    proxy::LatencyTimer,
+    console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
+    proxy::{handle_try_wake, retry_after},
    sasl, scram,
    stream::PqStream,
 };
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{info, warn};
+use tracing::{error, info, warn};

 pub(super) async fn authenticate(
    api: &impl console::Api,
@@ -17,8 +19,7 @@ pub(super) async fn authenticate(
    creds: &ClientCredentials<'_>,
    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
    config: &'static AuthenticationConfig,
-    latency_timer: &mut LatencyTimer,
-) -> auth::Result<AuthSuccess<ComputeCredentials>> {
+) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
    info!("fetching user's authentication info");
    let info = api.get_auth_info(extra, creds).await?.unwrap_or_else(|| {
        // If we don't have an authentication secret, we mock one to
@@ -38,26 +39,24 @@ pub(super) async fn authenticate(
            info!("auth endpoint chooses SCRAM");
            let scram = auth::Scram(&secret);

+            let auth_flow = flow.begin(scram).await.map_err(|error| {
+                warn!(?error, "error sending scram acknowledgement");
+                error
+            })?;
+
            let auth_outcome = tokio::time::timeout(
                config.scram_protocol_timeout,
-                async {
-                    // pause the timer while we communicate with the client
-                    let _paused = latency_timer.pause();
-
-                    flow.begin(scram).await.map_err(|error| {
-                        warn!(?error, "error sending scram acknowledgement");
-                        error
-                    })?.authenticate().await.map_err(|error| {
-                        warn!(?error, "error processing scram messages");
-                        error
-                    })
-                }
+                auth_flow.authenticate(),
            )
            .await
            .map_err(|error| {
                warn!("error processing scram messages error = authentication timed out, execution time exeeded {} seconds", config.scram_protocol_timeout.as_secs());
                auth::io::Error::new(auth::io::ErrorKind::TimedOut, error)
-            })??;
+            })?
+            .map_err(|error| {
+                warn!(?error, "error processing scram messages");
+                error
+            })?;

            let client_key = match auth_outcome {
                sasl::Outcome::Success(key) => key,
@@ -67,17 +66,38 @@ pub(super) async fn authenticate(
                }
            };

-            compute::ScramKeys {
+            Some(compute::ScramKeys {
                client_key: client_key.as_bytes(),
                server_key: secret.server_key.as_bytes(),
-            }
+            })
        }
    };

+    let mut num_retries = 0;
+    let mut node = loop {
+        let wake_res = api.wake_compute(extra, creds).await;
+        match handle_try_wake(wake_res, num_retries) {
+            Err(e) => {
+                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
+                return Err(e.into());
+            }
+            Ok(ControlFlow::Continue(e)) => {
+                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
+            }
+            Ok(ControlFlow::Break(n)) => break n,
+        }
+
+        let wait_duration = retry_after(num_retries);
+        num_retries += 1;
+        tokio::time::sleep(wait_duration).await;
+    };
+    if let Some(keys) = scram_keys {
+        use tokio_postgres::config::AuthKeys;
+        node.config.auth_keys(AuthKeys::ScramSha256(keys));
+    }
+
    Ok(AuthSuccess {
        reported_auth_ok: false,
-        value: ComputeCredentials::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256(
-            scram_keys,
-        )),
+        value: node,
    })
 }
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -1,7 +1,10 @@
-use super::{AuthSuccess, ComputeCredentials};
+use super::AuthSuccess;
 use crate::{
    auth::{self, AuthFlow, ClientCredentials},
-    proxy::LatencyTimer,
+    console::{
+        self,
+        provider::{CachedNodeInfo, ConsoleReqExtra},
+    },
    stream,
 };
 use tokio::io::{AsyncRead, AsyncWrite};
@@ -12,39 +15,37 @@ use tracing::{info, warn};
 /// These properties are benefical for serverless JS workers, so we
 /// use this mechanism for websocket connections.
 pub async fn cleartext_hack(
+    api: &impl console::Api,
+    extra: &ConsoleReqExtra<'_>,
+    creds: &mut ClientCredentials<'_>,
    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-    latency_timer: &mut LatencyTimer,
-) -> auth::Result<AuthSuccess<ComputeCredentials>> {
+) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
    warn!("cleartext auth flow override is enabled, proceeding");
-
-    // pause the timer while we communicate with the client
-    let _paused = latency_timer.pause();
-
    let password = AuthFlow::new(client)
        .begin(auth::CleartextPassword)
        .await?
        .authenticate()
        .await?;

+    let mut node = api.wake_compute(extra, creds).await?;
+    node.config.password(password);
+
    // Report tentative success; compute node will check the password anyway.
    Ok(AuthSuccess {
        reported_auth_ok: false,
-        value: ComputeCredentials::Password(password),
+        value: node,
    })
 }

 /// Workaround for clients which don't provide an endpoint (project) name.
 /// Very similar to [`cleartext_hack`], but there's a specific password format.
 pub async fn password_hack(
+    api: &impl console::Api,
+    extra: &ConsoleReqExtra<'_>,
    creds: &mut ClientCredentials<'_>,
    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-    latency_timer: &mut LatencyTimer,
-) -> auth::Result<AuthSuccess<ComputeCredentials>> {
+) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
    warn!("project not specified, resorting to the password hack auth flow");
-
-    // pause the timer while we communicate with the client
-    let _paused = latency_timer.pause();
-
    let payload = AuthFlow::new(client)
        .begin(auth::PasswordHack)
        .await?
@@ -54,9 +55,12 @@ pub async fn password_hack(
    info!(project = &payload.endpoint, "received missing parameter");
    creds.project = Some(payload.endpoint);

+    let mut node = api.wake_compute(extra, creds).await?;
+    node.config.password(payload.password);
+
    // Report tentative success; compute node will check the password anyway.
    Ok(AuthSuccess {
        reported_auth_ok: false,
-        value: ComputeCredentials::Password(payload.password),
+        value: node,
    })
 }
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -4,11 +4,10 @@ use proxy::config::AuthenticationConfig;
 use proxy::config::HttpConfig;
 use proxy::console;
 use proxy::http;
-use proxy::usage_metrics;
+use proxy::metrics;

 use anyhow::bail;
 use proxy::config::{self, ProxyConfig};
-use proxy::serverless;
 use std::pin::pin;
 use std::{borrow::Cow, net::SocketAddr};
 use tokio::net::TcpListener;
@@ -16,10 +15,9 @@ use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::info;
 use tracing::warn;
-use utils::{project_build_tag, project_git_version, sentry_init::init_sentry};
+use utils::{project_git_version, sentry_init::init_sentry};

 project_git_version!(GIT_VERSION);
-project_build_tag!(BUILD_TAG);

 use clap::{Parser, ValueEnum};

@@ -101,8 +99,7 @@ async fn main() -> anyhow::Result<()> {
    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);

    info!("Version: {GIT_VERSION}");
-    info!("Build_tag: {BUILD_TAG}");
-    ::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG);
+    ::metrics::set_build_info_metric(GIT_VERSION);

    let args = ProxyCliArgs::parse();
    let config = build_config(&args)?;
@@ -132,16 +129,14 @@ async fn main() -> anyhow::Result<()> {
        cancellation_token.clone(),
    ));

-    // TODO: rename the argument to something like serverless.
-    // It now covers more than just websockets, it also covers SQL over HTTP.
-    if let Some(serverless_address) = args.wss {
-        let serverless_address: SocketAddr = serverless_address.parse()?;
-        info!("Starting wss on {serverless_address}");
-        let serverless_listener = TcpListener::bind(serverless_address).await?;
+    if let Some(wss_address) = args.wss {
+        let wss_address: SocketAddr = wss_address.parse()?;
+        info!("Starting wss on {wss_address}");
+        let wss_listener = TcpListener::bind(wss_address).await?;

-        client_tasks.spawn(serverless::task_main(
+        client_tasks.spawn(http::websocket::task_main(
            config,
-            serverless_listener,
+            wss_listener,
            cancellation_token.clone(),
        ));
    }
@@ -149,11 +144,11 @@ async fn main() -> anyhow::Result<()> {
    // maintenance tasks. these never return unless there's an error
    let mut maintenance_tasks = JoinSet::new();
    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token));
-    maintenance_tasks.spawn(http::health_server::task_main(http_listener));
+    maintenance_tasks.spawn(http::server::task_main(http_listener));
    maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));

    if let Some(metrics_config) = &config.metric_collection {
-        maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
+        maintenance_tasks.spawn(metrics::task_main(metrics_config));
    }

    let maintenance = loop {
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -223,7 +223,7 @@ pub struct CacheOptions {

 impl CacheOptions {
    /// Default options for [`crate::console::provider::NodeInfoCache`].
-    pub const DEFAULT_OPTIONS_NODE_INFO: &'static str = "size=4000,ttl=4m";
+    pub const DEFAULT_OPTIONS_NODE_INFO: &str = "size=4000,ttl=4m";

    /// Parse cache options passed via cmdline.
    /// Example: [`Self::DEFAULT_OPTIONS_NODE_INFO`].
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -13,7 +13,6 @@ pub struct ConsoleError {
 #[derive(Deserialize)]
 pub struct GetRoleSecret {
    pub role_secret: Box<str>,
-    pub allowed_ips: Option<Vec<Box<str>>>,
 }

 // Manually implement debug to omit sensitive info.
@@ -188,31 +187,4 @@ mod tests {

        Ok(())
    }
-
-    #[test]
-    fn parse_wake_compute() -> anyhow::Result<()> {
-        let json = json!({
-            "address": "0.0.0.0",
-            "aux": dummy_aux(),
-        });
-        let _: WakeCompute = serde_json::from_str(&json.to_string())?;
-        Ok(())
-    }
-
-    #[test]
-    fn parse_get_role_secret() -> anyhow::Result<()> {
-        // Empty `allowed_ips` field.
-        let json = json!({
-            "role_secret": "secret",
-        });
-        let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
-        // Empty `allowed_ips` field.
-        let json = json!({
-            "role_secret": "secret",
-            "allowed_ips": ["8.8.8.8"],
-        });
-        let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
-
-        Ok(())
-    }
 }
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -90,13 +90,11 @@ pub mod errors {
                    status: http::StatusCode::LOCKED,
                    ref text,
                } => {
-                    // written data quota exceeded
-                    // data transfer quota exceeded
-                    // compute time quota exceeded
-                    // logical size quota exceeded
-                    !text.contains("quota exceeded")
+                    !text.contains("written data quota exceeded")
                        && !text.contains("the limit for current plan reached")
                }
+                // retry server errors
+                Self::Console { status, .. } if status.is_server_error() => true,
                _ => false,
            }
        }
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -59,7 +59,7 @@ impl Api {
            let rows = client.query(query, &[&creds.user]).await?;

            // We can get at most one row, because `rolname` is unique.
-            let row = match rows.first() {
+            let row = match rows.get(0) {
                Some(row) => row,
                // This means that the user doesn't exist, so there can be no secret.
                // However, this is still a *valid* outcome which is very similar
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -49,7 +49,7 @@ impl Api {
                .endpoint
                .get("proxy_get_role_secret")
                .header("X-Request-ID", &request_id)
-                .header("Authorization", format!("Bearer {}", &self.jwt))
+                .header("Authorization", &self.jwt)
                .query(&[("session_id", extra.session_id)])
                .query(&[
                    ("application_name", extra.application_name),
@@ -94,7 +94,7 @@ impl Api {
                .endpoint
                .get("proxy_wake_compute")
                .header("X-Request-ID", &request_id)
-                .header("Authorization", format!("Bearer {}", &self.jwt))
+                .header("Authorization", &self.jwt)
                .query(&[("session_id", extra.session_id)])
                .query(&[
                    ("application_name", extra.application_name),
--- a/Show More
+++ b/Show More